In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import requests

# GPU & Machine Learning
import cupy as cp
import faiss
from sklearn.feature_extraction.text import TfidfVectorizer  # Using sklearn instead of cuML

# Parallel Processing
from joblib import Parallel, delayed

In [46]:
PUSHOVER_USER_KEY = "your_user_key"
PUSHOVER_APP_TOKEN = "your_app_token"

def send_pushover_message(message, title="Jupyter Notification"):
    """Send a notification to Pushover."""
    url = "https://api.pushover.net/1/messages.json"
    data = {
        "token": PUSHOVER_APP_TOKEN,
        "user": PUSHOVER_USER_KEY,
        "message": message,
        "title": title,
    }
    requests.post(url, data=data)

In [47]:
# Load dataset
csv_path = "resource/csv/recipes.csv"
df = pd.read_csv(csv_path)

# Identify corrupted image values and create a clean image_link column
df["image_link"] = df["Images"].replace({"character(0)": ""})

# Fill NaN values
df["Name"] = df["Name"].fillna("")
df["RecipeIngredientParts"] = df["RecipeIngredientParts"].fillna("")
df["Description"] = df["Description"].fillna("")

# Combine text for similarity search (using relevant recipe information)
df["text"] = df["Name"] + " " + df["RecipeIngredientParts"] + " " + df["Description"] + " " + df["Keywords"].fillna("")

# Split into two groups: those with images and those missing images
df_with_images = df[df["image_link"] != ""]
df_missing_images = df[df["image_link"] == ""]

print(f"Loaded {len(df)} recipes, with {len(df_missing_images)} missing images.")

Loaded 522517 recipes, with 356620 missing images.


In [48]:
# Using scikit-learn's TfidfVectorizer instead of cuML
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)  # Reduced from 5000 to 1000
tfidf_matrix = vectorizer.fit_transform(df["text"])

# Check the size of the matrix before conversion
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of non-zero entries: {tfidf_matrix.nnz}")
estimated_size_gb = tfidf_matrix.shape[0] * tfidf_matrix.shape[1] * 4 / (1024**3)
print(f"Estimated dense matrix size: {estimated_size_gb:.2f} GB")

# Process in batches to avoid memory issues
batch_size = 1000  # Adjust based on your available memory
num_batches = (tfidf_matrix.shape[0] + batch_size - 1) // batch_size

# Initialize an empty list to store batches
tfidf_numpy_batches = []

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, tfidf_matrix.shape[0])
    
    # Process this batch
    batch = tfidf_matrix[start_idx:end_idx].toarray().astype('float32')
    
    # Normalize this batch
    faiss.normalize_L2(batch)
    
    # Store the normalized batch
    tfidf_numpy_batches.append(batch)
    
    print(f"Processed batch {i+1}/{num_batches}")

# Combine all batches
tfidf_numpy = np.vstack(tfidf_numpy_batches)

print("TF-IDF vectorization and normalization completed.")

TF-IDF matrix shape: (522517, 1000)
Number of non-zero entries: 15019798
Estimated dense matrix size: 1.95 GB
Processed batch 1/523
Processed batch 2/523
Processed batch 3/523
Processed batch 4/523
Processed batch 5/523
Processed batch 6/523
Processed batch 7/523
Processed batch 8/523
Processed batch 9/523
Processed batch 10/523
Processed batch 11/523
Processed batch 12/523
Processed batch 13/523
Processed batch 14/523
Processed batch 15/523
Processed batch 16/523
Processed batch 17/523
Processed batch 18/523
Processed batch 19/523
Processed batch 20/523
Processed batch 21/523
Processed batch 22/523
Processed batch 23/523
Processed batch 24/523
Processed batch 25/523
Processed batch 26/523
Processed batch 27/523
Processed batch 28/523
Processed batch 29/523
Processed batch 30/523
Processed batch 31/523
Processed batch 32/523
Processed batch 33/523
Processed batch 34/523
Processed batch 35/523
Processed batch 36/523
Processed batch 37/523
Processed batch 38/523
Processed batch 39/523
Pr

In [49]:
# Initialize FAISS index with GPU acceleration using Inner Product (for cosine similarity)
gpu_resources = faiss.StandardGpuResources()
index = faiss.GpuIndexFlatIP(gpu_resources, tfidf_numpy.shape[1])

# Add vectors to FAISS index
index.add(tfidf_numpy)

print(f"FAISS index built with {index.ntotal} recipes using cosine similarity.")

FAISS index built with 522517 recipes using cosine similarity.


In [51]:
# Process the missing images without parallelization
# This avoids the pickling error with the FAISS index
def find_best_image(idx, tf_matrix, faiss_index, dataframe):
    """Finds the closest matching recipe with an existing image using cosine similarity."""
    # Prepare the query vector (using NumPy array)
    query_vector = tf_matrix[idx].reshape(1, -1)
    
    # For cosine similarity, higher values are better (more similar)
    # Get top 10 matches to find suitable recipes with images
    k = 10  # Look for more candidates to ensure we find ones with images
    scores, closest_idx = faiss_index.search(query_vector, k)
    
    # Find the first match that has an image and is not the current recipe
    for i in range(len(closest_idx[0])):
        candidate_idx = closest_idx[0][i]
        # Skip if it's the same recipe or if it doesn't have an image
        if candidate_idx == idx or dataframe.iloc[candidate_idx]["image_link"] == "":
            continue
        return dataframe.iloc[candidate_idx]["image_link"]
    
    # If no suitable match was found among the top k, use the first recipe with an image
    first_with_image = dataframe[dataframe["image_link"] != ""].index[0]
    return dataframe.iloc[first_with_image]["image_link"]

# Process each missing image sequentially (no parallelization)
total_missing = len(df_missing_images)
print(f"Processing {total_missing} missing images...")

# Update missing images one by one with progress tracking
for i, idx in enumerate(df_missing_images.index):
    df.at[idx, "image_link"] = find_best_image(idx, tfidf_numpy, index, df)
    
    # Print progress every image
    print(f"{i+1}/{total_missing} images processed", end="\r")
    
    # Add more detailed progress every 100 images
    if (i+1) % 100 == 0 or i+1 == total_missing:
        percent_done = ((i+1) / total_missing) * 100
        print(f"{i+1}/{total_missing} images processed ({percent_done:.1f}%)")
    
print("\nImage assignment completed.")

Processing 356620 missing images...
100/356620 images processed (0.0%)
200/356620 images processed (0.1%)
300/356620 images processed (0.1%)
400/356620 images processed (0.1%)
500/356620 images processed (0.1%)
600/356620 images processed (0.2%)
700/356620 images processed (0.2%)
800/356620 images processed (0.2%)
900/356620 images processed (0.3%)
1000/356620 images processed (0.3%)
1100/356620 images processed (0.3%)
1200/356620 images processed (0.3%)
1300/356620 images processed (0.4%)
1400/356620 images processed (0.4%)
1500/356620 images processed (0.4%)
1600/356620 images processed (0.4%)
1700/356620 images processed (0.5%)
1800/356620 images processed (0.5%)
1900/356620 images processed (0.5%)
2000/356620 images processed (0.6%)
2100/356620 images processed (0.6%)
2200/356620 images processed (0.6%)
2300/356620 images processed (0.6%)
2400/356620 images processed (0.7%)
2500/356620 images processed (0.7%)
2600/356620 images processed (0.7%)
2700/356620 images processed (0.8%)
2

In [None]:
output_path = "resource/csv/completed_recipes.csv"
df.to_csv(output_path, index=False)

print(f"✅ Dataset saved at {output_path}")
#send_pushover_message("✅ Dataset saved successfully!", title="Jupyter Job Done")

✅ Dataset saved at resource/csv/completed_recipes.csv


: 