### Imports

In [7]:
import re
import pickle
from collections import Counter
from typing import List, Dict, Tuple, Optional, Union

import pandas as pd
import numpy as np

import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm import tqdm 

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

import warnings
warnings.filterwarnings("ignore")

import nltk
nltk.download(["punkt", "stopwords", "wordnet", "averaged_perceptron_tagger", "punkt_tab"])

[nltk_data] Downloading package punkt to /home/amelia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/amelia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/amelia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/amelia/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /home/amelia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Test mode

In [8]:
# Test mode
TEST_MODE = True 
SAMPLE_SIZE = 100000 

### Data preparation

In [None]:
# DATA LOADING
recipes_df = pd.read_csv("RAW_recipes.csv")
interactions_df = pd.read_csv("RAW_interactions.csv")
if TEST_MODE:
    recipes_df = recipes_df.sample(n=min(SAMPLE_SIZE, len(recipes_df)), random_state=5)
    interactions_df = interactions_df[interactions_df['recipe_id'].isin(recipes_df['id'])]
print(f"Recipes: {len(recipes_df):,} | Interactions: {len(interactions_df):,}")


print(f"Recipes dataset: {recipes_df.shape[0]:,} recipes, {recipes_df.shape[1]} features")
print(f"Interactions dataset: {interactions_df.shape[0]:,} interactions, {interactions_df.shape[1]} features")

# Preview recipe structure
print("\nRecipe columns:", list(recipes_df.columns))
print("\nSample recipe entry:")
print(recipes_df.iloc[0])


# DATA MERGING
# Join recipes with user interactions
merged_data = recipes_df.merge(interactions_df, how="inner", left_on="id", right_on="recipe_id")

# Remove duplicate column after merge
if "recipe_id" in merged_data.columns:
    merged_data = merged_data.drop(columns=["recipe_id"])

print(f"Merged dataset size: {merged_data.shape[0]:,} rows, {merged_data.shape[1]} columns")

# Check for missing data
null_summary = merged_data.isnull().sum()
null_present = null_summary[null_summary > 0]
print("\nColumns with missing values:")
if len(null_present) > 0:
    for col, count in null_present.items():
        pct = (count / len(merged_data)) * 100
        print(f"  {col}: {count:,} ({pct:.2f}%)")
else:
    print("none found")


# NUTRITIONAL DATA EXTRACTION
nutrition_columns = ["calories", "total_fat", "sugar", "sodium", "protein", "saturated_fat", "carbohydrates"]

def parse_nutrition_string(nutrition_str: str) -> List[float]:
    try:
        cleaned = str(nutrition_str).strip("[]")
        values = [float(v.strip()) for v in cleaned.split(",")]
        if len(values) != 7:
            return [np.nan] * 7
        return values
    except (ValueError, AttributeError):
        return [np.nan] * 7

# Apply parsing and create new columns
nutrition_values = merged_data["nutrition"].apply(parse_nutrition_string)
nutrition_df = pd.DataFrame(nutrition_values.tolist(), columns=nutrition_columns, index=merged_data.index)

# Merge with dataframe
merged_data = pd.concat([merged_data, nutrition_df], axis=1)

for col in nutrition_columns:
    upper = merged_data[col].quantile(0.99)
    merged_data = merged_data[merged_data[col] <= upper]

print("Nutritional features added:")
print(merged_data[nutrition_columns].describe().round(2)) # statistical info


# RECIPE CHARACTERISTICS ANALYSIS
# Get unique recipes for analysis
unique_recipes = merged_data.drop_duplicates(subset=["id"])
print(f"Unique recipes in dataset: {len(unique_recipes):,}")


# INGREDIENT ANALYSIS
def extract_ingredients_list(ing_string: str) -> List[str]:
    try:
        cleaned = str(ing_string).replace("[", "").replace("]", "").replace("'", "")
        ingredients = [i.strip().lower() for i in cleaned.split(",") if i.strip()]
        return ingredients
    except (AttributeError, TypeError):
        return []

# Extract all ingredients
all_ingredients = []
for ing_str in unique_recipes["ingredients"]:
    all_ingredients.extend(extract_ingredients_list(ing_str))

# Count frequency
ingredient_counts = Counter(all_ingredients)
top_ingredients = pd.DataFrame(ingredient_counts.most_common(20), columns=["ingredient", "count"])

print("\nTop 20 most common ingredients:")
print(top_ingredients.to_string(index=False))


# TAG ANALYSIS
def extract_tags_list(tag_string: str) -> List[str]:
    try:
        cleaned = str(tag_string).replace("[", "").replace("]", "").replace("'", "")
        tags = [t.strip().lower() for t in cleaned.split(",") if t.strip()]
        return tags
    except (AttributeError, TypeError):
        return []

# Extract all tags
all_tags = []
for tag_str in unique_recipes["tags"]:
    all_tags.extend(extract_tags_list(tag_str))

# Count frequency
tag_counts = Counter(all_tags)
top_tags = pd.DataFrame(tag_counts.most_common(30), columns=["tag", "frequency"])

print("\nTop 30 most common tags:")
print(top_tags.to_string(index=False))

# Semantic tags for analysis
semantic_tags = ["comfort-food", "healthy", "quick", "easy", "romantic", "vegetarian", 
                 "low-carb", "dessert", "dinner", "lunch", "breakfast", "italian",
                 "mexican", "asian", "mediterranean", "summer", "winter", "holiday"]

print("\nSemantic tags present in dataset:")
for tag in semantic_tags:
    count = tag_counts.get(tag, 0)
    if count > 0:
        print(f"{tag}: {count:,} recipes")


# CORPUS CREATION
# Create corpus dataframe with unique recipes
corpus = unique_recipes[["id", "name", "tags", "description", "ingredients", "steps"]].copy()

# Clean each text field
text_columns = ["name", "tags", "description", "ingredients", "steps"]

for col in text_columns:
    corpus[col + "_clean"] = (
        corpus[col]
        .fillna("") # removing NaN
        .astype(str)
        .str.replace("[", "", regex=False)
        .str.replace("]", "", regex=False)
        .str.replace("'", "", regex=False)
        .str.replace('"', "", regex=False)
        .str.lower()
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
    print(f"  Cleaned: {col}")

# Combine all text into single document per recipe
corpus["document"] = (
    corpus["name_clean"] + " " +
    corpus["tags_clean"] + " " +
    corpus["description_clean"] + " " +
    corpus["ingredients_clean"] + " " +
    corpus["steps_clean"]
)

# Final cleaning
corpus["document"] = (
    corpus["document"]
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# Add word count
corpus["word_count"] = corpus["document"].str.split().str.len()

# Remove recipes with empty documents
initial_count = len(corpus)
corpus = corpus[corpus["document"].str.len() > 10].copy()
print(f"\nRemoved {initial_count - len(corpus)} recipes with insuffcient text")
print(f"Final corpus size: {len(corpus):,} recipes")


# EXPORT CORPUS
# Save full corpus with metadata
corpus_export = corpus[["id", "name", "tags_clean", "document", "word_count"]].copy()
corpus_export.columns = ["recipe_id", "recipe_name", "tags", "document", "word_count"]
corpus_export.to_csv("search_corpus.csv", index=False)

# Save recipe metadata
metadata = unique_recipes[["id", "name", "minutes", "n_ingredients", "n_steps", "description"]].copy()
metadata.columns = ["recipe_id", "recipe_name", "cooking_time", "num_ingredients", "num_steps", "description"]
metadata.to_csv("recipe_metadata.csv", index=False)


# CORPUS QUALITY CHECK
test_queries = [
    "comfort food",
    "healthy dinner",
    "quick breakfast",
    "romantic dinner",
    "vegetarian lunch",
    "low carb",
    "summer dessert",
    "holiday cookies"
]

for query in test_queries:
    query_terms = query.lower().split()
    mask = pd.Series([True] * len(corpus), index=corpus.index)
    for term in query_terms:
        mask = mask & corpus["document"].str.contains(term, regex=False)
    matches = mask.sum()
    print(f"  '{query}': {matches:,} potential matches")


# LOAD CORPUS FOR SEARCH ENGINES
corpus_df = pd.read_csv("search_corpus.csv")
metadata_df = pd.read_csv("recipe_metadata.csv")

print(f"Loaded {len(corpus_df):,} recipes")
print(f"Average document length: {corpus_df['word_count'].mean():.0f} words")

Recipes: 100,000 | Interactions: 488,928
Recipes dataset: 100,000 recipes, 12 features
Interactions dataset: 488,928 interactions, 5 features

Recipe columns: ['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags', 'nutrition', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients']

Sample recipe entry:
name                                      best ever chicken alfredo
id                                                           131473
minutes                                                          30
contributor_id                                               204199
submitted                                                2005-07-28
tags              ['30-minutes-or-less', 'time-to-make', 'course...
nutrition                [593.8, 28.0, 9.0, 42.0, 62.0, 22.0, 24.0]
n_steps                                                           9
steps             ['in large skillet , over medium-high heat , h...
description       a quick and easy pasta dish made with mushro

### TF-IDF search engine

In [None]:
# TEXT PREPROCESSING CLASS
class TextPreprocessor:
    def __init__(self, 
                 remove_stopwords: bool = True, 
                 use_lemmatization: bool = True,
                 use_stemming: bool = False,
                 min_word_length: int = 2,
                 custom_stopwords: Optional[set] = None):
        self.remove_stopwords = remove_stopwords
        self.use_lemmatization = use_lemmatization
        self.use_stemming = use_stemming
        self.min_word_length = min_word_length
        
        # Initialize NLTK tools
        self.lemmatizer = WordNetLemmatizer() if use_lemmatization else None
        self.stemmer = PorterStemmer() if use_stemming else None
        
        # Build stopword set
        self.stopwords = set(stopwords.words('english')) if remove_stopwords else set()
        
        # Add recipe-specific stopwords
        recipe_stopwords = {
            'cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons',
            'tbsp', 'tsp', 'oz', 'ounce', 'ounces', 'pound', 'pounds', 'lb', 'lbs',
            'inch', 'inches', 'minute', 'minutes', 'hour', 'hours',
            'medium', 'large', 'small', 'fresh', 'chopped', 'minced', 'diced',
            'add', 'place', 'put', 'make', 'use', 'take', 'get', 'set',
            'recipe', 'recipes', 'ingredient', 'ingredients', 'step', 'steps',
            'one', 'two', 'three', 'four', 'five', 'six', 'time', 'preparation',
            'optional', 'needed', 'taste', 'degree', 'degrees'
        }
        self.stopwords.update(recipe_stopwords)
        
        if custom_stopwords:
            self.stopwords.update(custom_stopwords)
    
    def preprocess(self, text: str) -> str:
        
        if pd.isna(text) or not isinstance(text, str) or not text.strip():
            return ""
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        
        # Tokenize
        try:
            tokens = word_tokenize(text)
        except Exception:
            tokens = text.split()
        
        # Process tokens
        processed_tokens = []
        for token in tokens:
            if len(token) < self.min_word_length:
                continue
            
            if self.remove_stopwords and token in self.stopwords:
                continue
            
            if self.use_lemmatization and self.lemmatizer:
                token = self.lemmatizer.lemmatize(token, pos='v')
                token = self.lemmatizer.lemmatize(token, pos='n')
            elif self.use_stemming and self.stemmer:
                token = self.stemmer.stem(token)
            
            processed_tokens.append(token)
        return ' '.join(processed_tokens)
    
    def preprocess_batch(self, texts: Union[List[str], pd.Series], show_progress: bool = True) -> List[str]:
        
        if isinstance(texts, pd.Series): # convert to list so that len() works
            texts = texts.tolist()
        else:
            texts = list(texts)
        processed = []
        
        iterator = tqdm(texts, desc="Preprocessing", disable=not show_progress)
        for text in iterator:
            processed.append(self.preprocess(text))
        return processed

# TF-IDF SEARCH ENGINE CLASS
class TFIDFSearchEngine: 
    def __init__(self, 
                 ngram_range: Tuple[int, int] = (1, 2),
                 max_features: int = 50000, # limit for memory reasons
                 min_df: int = 2,
                 max_df: float = 0.95,
                 sublinear_tf: bool = True): # 1 + log(tf)
        self.ngram_range = ngram_range
        self.max_features = max_features
        
        self.vectorizer = TfidfVectorizer( # Converts a collection of raw documents to a matrix of TF-IDF features
            ngram_range=ngram_range,
            max_features=max_features,
            min_df=min_df,
            max_df=max_df,
            sublinear_tf=sublinear_tf,
            dtype=np.float32
        )
        
        self.preprocessor = TextPreprocessor(
            remove_stopwords=True,
            use_lemmatization=True,
            use_stemming=False
        )
        
        self.tfidf_matrix = None
        self.document_ids = None
        self.id_to_index: Dict[int, int] = {}  
        self.is_fitted = False
        
    def fit(self, documents: Union[List[str], pd.Series], 
            document_ids: Optional[List] = None, 
            preprocess: bool = True) -> None:
        
        print("Fiting TF-IDF model")
        
        # Convert to list
        if isinstance(documents, pd.Series):
            documents = documents.tolist()
        else:
            documents = list(documents)
        
        # Store document IDs
        if document_ids is not None:
            self.document_ids = list(document_ids)
        else:
            self.document_ids = list(range(len(documents)))
        
        # Build fast lookup dictionary
        self.id_to_index = {doc_id: idx for idx, doc_id in enumerate(self.document_ids)}
        
        # Preprocess documents
        if preprocess:
            print("Preprocessing documents")
            processed_docs = self.preprocessor.preprocess_batch(documents, show_progress=True)
        else:
            processed_docs = documents
        
        # Fit and transform TF-IDF
        print("Computing TF-IDF matrix")
        self.tfidf_matrix = self.vectorizer.fit_transform(processed_docs)
        self.is_fitted = True
        
        # statistics
        vocab_size = len(self.vectorizer.vocabulary_)
        n_docs = self.tfidf_matrix.shape[0]
        sparsity = 1.0 - (self.tfidf_matrix.nnz / (n_docs * vocab_size)) 
        
        print(f"\nTF-IDF Model Statistics:")
        print(f"Documents: {n_docs:,}")
        print(f"Vocabulary size: {vocab_size:,}")
        print(f"N-gram range: {self.ngram_range}")
        print(f"Matrix shape: {self.tfidf_matrix.shape}")
        print(f"Matrix sparsity: {sparsity:.2%}")
        print(f"Non-zero elements: {self.tfidf_matrix.nnz:,}")
        
    def search(self, query: str, top_k: int = 10, preprocess: bool = True) -> List[Tuple[int, float]]:
        if not self.is_fitted:
            raise ValueError("Model not fitted fit() first")
        
        # Validate query
        if not query or not query.strip():
            return []
        
        # Preprocess query
        if preprocess:
            processed_query = self.preprocessor.preprocess(query)
        else:
            processed_query = query
        
        # Check if query has any valid terms after preprocessing
        if not processed_query.strip():
            return []
        
        # Transform query to TF-IDF vector
        query_vector = self.vectorizer.transform([processed_query])
        
        # Compute cosine similarities
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        
        # Get top-k results
        top_indices = np.argsort(similarities)[::-1][:top_k]
        
        results = []
        for idx in top_indices:
            doc_id = self.document_ids[idx]
            score = float(similarities[idx])
            if score > 0:  # Only include results with positive similarity
                results.append((doc_id, score))
        
        return results
    
    def get_query_terms(self, query: str, preprocess: bool = True) -> Dict:
        if preprocess:
            processed_query = self.preprocessor.preprocess(query)
        else:
            processed_query = query
        
        query_terms = processed_query.split()
        vocabulary = set(self.vectorizer.vocabulary_.keys())
        
        matched = [t for t in query_terms if t in vocabulary]
        unmatched = [t for t in query_terms if t not in vocabulary]
        
        matched_ngrams = []
        for n in range(2, self.ngram_range[1] + 1):
            for i in range(len(query_terms) - n + 1):
                ngram = ' '.join(query_terms[i:i+n])
                if ngram in vocabulary:
                    matched_ngrams.append(ngram)
        
        return {
            'original_query': query,
            'processed_query': processed_query,
            'matched_terms': matched,
            'matched_ngrams': matched_ngrams,
            'unmatched_terms': unmatched
        }
    
    def get_top_terms_for_document(self, doc_id: int, top_k: int = 10) -> List[Tuple[str, float]]:
        if doc_id not in self.id_to_index:
            raise ValueError(f"document id {doc_id} not found")
        
        idx = self.id_to_index[doc_id]
        feature_names = self.vectorizer.get_feature_names_out()
        doc_vector = self.tfidf_matrix[idx].toarray().flatten()
        
        top_indices = np.argsort(doc_vector)[::-1][:top_k]
        
        return [(feature_names[i], float(doc_vector[i])) for i in top_indices if doc_vector[i] > 0]
    
    def save(self, filepath: str) -> None:
        with open(filepath, 'wb') as f:
            pickle.dump({
                'vectorizer': self.vectorizer,
                'tfidf_matrix': self.tfidf_matrix,
                'document_ids': self.document_ids,
                'id_to_index': self.id_to_index,
                'preprocessor': self.preprocessor,
                'ngram_range': self.ngram_range,
                'max_features': self.max_features
            }, f)
    
    def load(self, filepath: str) -> None: # loading fitted model previously saved
        with open(filepath, 'rb') as f:
            data = pickle.load(f)
        
        self.vectorizer = data['vectorizer']
        self.tfidf_matrix = data['tfidf_matrix']
        self.document_ids = data['document_ids']
        self.id_to_index = data.get('id_to_index', {doc_id: idx for idx, doc_id in enumerate(self.document_ids)})
        self.preprocessor = data['preprocessor']
        self.ngram_range = data['ngram_range']
        self.max_features = data['max_features']
        self.is_fitted = True

# UTILITY FUNCTIONS
def display_search_results(results, metadata_df, corpus_df, query, show_snippet=True):
    if not results:
        print("No results found")
        return
    
    for rank, (recipe_id, score) in enumerate(results, 1):
        meta_row = metadata_df[metadata_df['recipe_id'] == recipe_id]
        corpus_row = corpus_df[corpus_df['recipe_id'] == recipe_id]
        
        if len(meta_row) == 0:
            continue
            
        meta = meta_row.iloc[0]
        
        print(f"\n[{rank}] {meta['recipe_name']}")
        print(f"Score: {score:.4f}")
        print(f"Cooking Time: {meta['cooking_time']} min | "
              f"Ingredients: {meta['num_ingredients']} | "
              f"Steps: {meta['num_steps']}")
        
        if show_snippet and len(corpus_row) > 0:
            doc = corpus_row.iloc[0]['document']
            snippet = doc[:200] + "..." if len(doc) > 200 else doc
            print(f"Preview: {snippet}")
        
        if pd.notna(meta['description']) and str(meta['description']) != 'nan':
            desc = str(meta['description'])[:150]
            print(f"Description: {desc}...")


def analyze_query_matching(search_engine: TFIDFSearchEngine, query: str) -> None:
    
    analysis = search_engine.get_query_terms(query)
    
    print(f"\n--- Query Analysis for: '{query}' ---")
    print(f"Processed query: '{analysis['processed_query']}'")
    print(f"Matched unigrams: {analysis['matched_terms']}")
    print(f"Matched n-grams: {analysis['matched_ngrams']}")
    print(f"Unmatched terms: {analysis['unmatched_terms']}")



# BUILD TF-IDF SEARCH ENGINE
tfidf_engine = TFIDFSearchEngine(
    ngram_range=(1, 2),
    max_features=50000,
    min_df=3, 
    max_df=0.90,
    sublinear_tf=True
)

tfidf_engine.fit(
    documents=corpus_df['document'],
    document_ids=corpus_df['recipe_id'].tolist(),
    preprocess=True
)

tfidf_engine.save("tfidf_search_engine.pkl")


# TEST TF-IDF ENGINE
test_queries = [
    "chocolate cake",
    "",
    "chicken soup",
    "comfort food for a rainy day",
    "healthy dinner after gym",
    "quick and easy breakfast",
    "romantic dinner for two",
    "light summer salad",
]

for query in test_queries[:4]:  # test first 4 queries
    analyze_query_matching(tfidf_engine, query)
    results = tfidf_engine.search(query, top_k=3)
    display_search_results(results, metadata_df, corpus_df, query, show_snippet=False)

Fiting TF-IDF model
Preprocessing documents


Preprocessing:   0%|          | 401/93410 [00:00<01:09, 1335.33it/s]

Preprocessing: 100%|██████████| 93410/93410 [01:07<00:00, 1383.16it/s]


Computing TF-IDF matrix

TF-IDF Model Statistics:
Documents: 93,410
Vocabulary size: 50,000
N-gram range: (1, 2)
Matrix shape: (93410, 50000)
Matrix sparsity: 99.72%
Non-zero elements: 13,038,065

--- Query Analysis for: 'chocolate cake' ---
Processed query: 'chocolate cake'
Matched unigrams: ['chocolate', 'cake']
Matched n-grams: ['chocolate cake']
Unmatched terms: []

[1] store bought chocolate cake and milk
Score: 0.3790
Cooking Time: 5 min | Ingredients: 2 | Steps: 4
Description: this is a recipe that my friend's dad would always make when we were in elementary school. they bought a chocolate cake with vanilla icing. sliced it ...

[2] cake mix chocolate cookies
Score: 0.3639
Cooking Time: 35 min | Ingredients: 5 | Steps: 4
Description: here is a real easy one.  just use a chocolate cake mix.  i add a hand full of chocolate chips to get a double chocolaute fix....

[3] miracle whip chocolate cake
Score: 0.3487
Cooking Time: 55 min | Ingredients: 8 | Steps: 3
Description: this is a 

### Neural embeddings search engine

In [None]:
# Check for GPU (shorter run time needed)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")


# DOCUMENT PREPARER
class DocumentPreparer: 
    def __init__(self, max_length_words: int = None):
        self.max_length_words = max_length_words 
        # turns recipe dataframe rows into optimised strings for embeddings (name, tag, content)
    
    def prepare_document(self, row: pd.Series) -> str: 
        # processes a sigle string by extracting and formatting relevant fields
        parts = []
        
        # Recipe name
        recipe_name = row.get('recipe_name', '')
        if pd.notna(recipe_name) and str(recipe_name).strip():
            name = str(recipe_name).strip()
            parts.append(f"Recipe: {name}")
        
        # Tags (necessary for semantic matching)
        tags = row.get('tags', '')
        if pd.notna(tags) and str(tags).strip():
            tags_clean = str(tags).strip().replace(',', ', ')
            parts.append(f"Tags: {tags_clean}")
        
        # Full document content
        document = row.get('document', '')
        if pd.notna(document) and str(document).strip():
            doc = str(document).strip()
            if self.max_length_words is not None:
                words = doc.split()
                if len(words) > self.max_length_words:
                    doc = ' '.join(words[:self.max_length_words])
            parts.append(doc)
        
        return ' '.join(parts) # returns one string
    
    def prepare_batch(self, df: pd.DataFrame, show_progress: bool = True) -> List[str]: 
        # elaborates the entire dataframe iterating row by row and for each it gives a formatted string ready for embedding
        prepared = []
        iterator = tqdm(df.iterrows(), total=len(df), desc="Preparing documents", disable=not show_progress)
        
        for _, row in iterator:
            prepared.append(self.prepare_document(row))
        
        return prepared 



# EMBEDDINGS SEARCH ENGINE 

class EmbeddingsSearchEngine: 
    # converts documents and queries into dense vector representations (embeddings) using a SentenceTransformer model
    
    def __init__(self, model_name: str = "all-MiniLM-L6-v2", batch_size: int = 64, max_seq_length: int = 384):
        self.model_path = model_name
        self.batch_size = batch_size
        self.model = SentenceTransformer(self.model_path, device=device)

        # Set the maximum sequence length (tokens) used by the model tokenizer for truncation
        self.model.max_seq_length = max_seq_length
        print(f"Max sequence length (tokens): {self.model.max_seq_length}")

        # embedding dimension
        self.embedding_dim = self.model.get_sentence_embedding_dimension()
        print(f"Embedding dimension: {self.embedding_dim}")

        self.embeddings: Optional[np.ndarray] = None  
        self.document_ids: Optional[List] = None
        self.id_to_index: Dict[int, int] = {}
        self.is_fitted = False

        self.doc_preparer = DocumentPreparer(max_length_words=None)  

    def fit(self, documents: Union[pd.DataFrame, List[str]], document_ids: Optional[List] = None, show_progress: bool = True) -> None:
        # calculates embeddings for all documents in the corpus

        if isinstance(documents, pd.DataFrame):
            doc_list = self.doc_preparer.prepare_batch(documents, show_progress=show_progress)
            if document_ids is None and 'recipe_id' in documents.columns:
                document_ids = documents['recipe_id'].tolist()
        elif isinstance(documents, pd.Series):
            doc_list = documents.tolist()
        else:
            doc_list = list(documents)

        # Store document IDs
        if document_ids is not None:
            self.document_ids = list(document_ids)
        else:
            self.document_ids = list(range(len(doc_list)))

        # O(1) lookup
        self.id_to_index = {doc_id: idx for idx, doc_id in enumerate(self.document_ids)}

        # Compute embeddings in batches
        self.embeddings = self.model.encode(
            doc_list,
            batch_size=self.batch_size,
            show_progress_bar=show_progress,
            convert_to_numpy=True,
            normalize_embeddings=True  # important for cosine via dot-product
        ).astype(np.float32)

        print(f"Embeddings shape: {self.embeddings.shape}")

        self.is_fitted = True

        print(f"\nEmbeddings search engine statistics:")
        print(f"Documents: {len(self.document_ids):,}")
        print(f"Embedding dimension: {self.embedding_dim}")

    def encode_query(self, query: str) -> np.ndarray:
        embedding = self.model.encode(query, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
        return embedding

    def _top_k_from_scores(self, scores: np.ndarray, top_k: int) -> List[int]:
        n = scores.shape[0]
        if n == 0:
            return []
        k = min(top_k, n)

        # argpartition for speed, then sort those k
        idx_part = np.argpartition(scores, -k)[-k:]
        idx_sorted = idx_part[np.argsort(scores[idx_part])[::-1]]
        return idx_sorted.tolist()

    def search(self, query: str, top_k: int = 10) -> List[Tuple[int, float]]:
        # finds recipes similar to a query by calculating similarity score between embeddings and wueries
        if not self.is_fitted:
            raise ValueError("Model not fitted, do fit() first") # important

        if not query or not query.strip():
            return []

        q = self.encode_query(query)
        # cosine similarity because both sides are normalized
        scores = self.embeddings @ q 

        top_indices = self._top_k_from_scores(scores, top_k)

        results = []
        for idx in top_indices:
            doc_id = self.document_ids[idx]
            results.append((doc_id, float(scores[idx])))
        return results

    def search_batch(self, queries: List[str], top_k: int = 10) -> Dict[str, List[Tuple[int, float]]]:
        # calculates all similarities between queries and documents with one matrix product
        if not self.is_fitted:
            raise ValueError("Model not fitted, fit() first")

        valid_queries = [q for q in queries if q and q.strip()]
        if not valid_queries:
            return {q: [] for q in queries}

        query_embeddings = self.model.encode(valid_queries, batch_size=self.batch_size, convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)  # (Q, D)

        # (Q, N) cosine similarities via dot-product
        score_matrix = query_embeddings @ self.embeddings.T

        results: Dict[str, List[Tuple[int, float]]] = {}
        for i, query in enumerate(valid_queries):
            scores = score_matrix[i]
            top_indices = self._top_k_from_scores(scores, top_k)
            results[query] = [(self.document_ids[idx], float(scores[idx])) for idx in top_indices]

        for query in queries:
            if query not in results:
                results[query] = []

        return results

    def get_similar_recipes(self, recipe_id: int, top_k: int = 10) -> List[Tuple[int, float]]:
        # gets recipes similar ro a recipe, not a query
        if not self.is_fitted:
            raise ValueError("Model not fitted do fit() first")

        if recipe_id not in self.id_to_index:
            raise ValueError(f"Recipe ID {recipe_id} not found in index")

        idx = self.id_to_index[recipe_id]
        v = self.embeddings[idx] 

        scores = self.embeddings @ v 
        scores[idx] = -np.inf  # exclude self by setting its score to -infinity

        top_indices = self._top_k_from_scores(scores, top_k)

        return [(self.document_ids[i], float(scores[i])) for i in top_indices]
    

    def save(self, filepath: str) -> None:
        # saves embeddings 
        with open(filepath, 'wb') as f:
            pickle.dump({
                'embeddings': self.embeddings,
                'document_ids': self.document_ids,
                'id_to_index': self.id_to_index,
                'model_path': self.model_path,
                'embedding_dim': self.embedding_dim
            }, f)

    def load(self, filepath: str) -> None:
        # loads embeddings
        with open(filepath, 'rb') as f:
            data = pickle.load(f)

        self.embeddings = data['embeddings'].astype(np.float32)
        self.document_ids = data['document_ids']
        self.id_to_index = data.get(
            'id_to_index',
            {doc_id: idx for idx, doc_id in enumerate(self.document_ids)}
        )
        self.embedding_dim = data['embedding_dim']

        self.is_fitted = True

# UTILITY FUNCTIONS 
def display_search_results_embeddings(results: List[Tuple[int, float]], 
                                      metadata_df: pd.DataFrame, 
                                      corpus_df: pd.DataFrame, 
                                      query: str, 
                                      engine_name: str = "EMBEDDINGS") -> None:
    
    print(f"\n{engine_name} RESULTS FOR: '{query}'")
    
    if not results:
        print("No results found")
        return
    
    for rank, (recipe_id, score) in enumerate(results, 1):
        meta_row = metadata_df[metadata_df['recipe_id'] == recipe_id]
        
        if len(meta_row) == 0:
            continue
            
        meta = meta_row.iloc[0]
        
        print(f"\n[{rank}] {meta['recipe_name']}")
        print(f"Similarity Score: {score:.4f}")
        print(f"Cooking Time: {meta['cooking_time']} min | "
              f"Ingredients: {meta['num_ingredients']} | "
              f"Steps: {meta['num_steps']}")
        
        # Show tags if available
        corpus_row = corpus_df[corpus_df['recipe_id'] == recipe_id]
        if len(corpus_row) > 0:
            tags = corpus_row.iloc[0].get('tags', '')
            if pd.notna(tags) and str(tags) != 'nan':
                tags_preview = str(tags)[:100]
                print(f"Tags: {tags_preview}...")
        
        # Show description
        if pd.notna(meta['description']) and str(meta['description']) != 'nan':
            desc = str(meta['description'])[:150]
            print(f"Description: {desc}...")



# BUILD EMBEDDINGS SEARCH ENGINE
embeddings_engine = EmbeddingsSearchEngine(model_name='all-MiniLM-L6-v2', batch_size=64)
embeddings_engine.fit(documents=corpus_df, document_ids=corpus_df['recipe_id'].tolist(), show_progress=True)
embeddings_engine.save("embeddings_search_engine.pkl")


# TEST EMBEDDINGS ENGINE
test_queries_embeddings = [
    # Simple keyword queries
    "chocolate cake",
    "pasta carbonara",
    "chicken soup",
    
    # Semantic/high-level queries
    "comfort food for a rainy day",
    "healthy dinner after gym",
    "quick and easy breakfast",
    "romantic dinner for two",
    "light summer salad",
    "warm winter soup",
    "kid friendly lunch",
    "low carb vegetarian",
    
    # Abstract/mood-based queries
    "something sweet and indulgent",
    "meal prep for the week",
    "impressive dish for guests",
    "nostalgic childhood favorite"
]

# Test first 5 queries
for query in test_queries_embeddings[:5]:
    results = embeddings_engine.search(query, top_k=3) # returns 3 most similar recipes to query
    display_search_results_embeddings(results, metadata_df, corpus_df, query)

Using device: cpu


Max sequence length (tokens): 384
Embedding dimension: 384


Preparing documents: 100%|██████████| 93410/93410 [00:01<00:00, 52605.93it/s]
Batches: 100%|██████████| 1460/1460 [57:44<00:00,  2.37s/it]


Embeddings shape: (93410, 384)

Embeddings search engine statistics:
Documents: 93,410
Embedding dimension: 384

EMBEDDINGS RESULTS FOR: 'chocolate cake'

[1] the ultimate fudgy chocolate cake of love
Similarity Score: 0.6525
Cooking Time: 70 min | Ingredients: 8 | Steps: 5
Tags: time-to-make, course, preparation, occasion, desserts, easy, dinner-party, holiday-event, kid-friend...
Description: recently a co-worker had a birthday, and someone made this cake for them.  i was lucky enough to snag a piece, and fell in love.  it is very moist, de...

[2] heavenly chocolate cake
Similarity Score: 0.6173
Cooking Time: 50 min | Ingredients: 8 | Steps: 11
Tags: 60-minutes-or-less, time-to-make, course, preparation, occasion, for-large-groups, desserts, oven, e...
Description: this is a sinfully rich cake that would be wonderful for a birthday party. i found it in our local newspaper years ago and have make it numrous times....

[3] ultra moist chocolate cake
Similarity Score: 0.6097
Cooking Ti

### Evaluation

In [None]:
# QUERIES
evaluation_queries = {
    'keyword': [
        "chocolate cake",
        "chicken soup",
        "pasta carbonara",
        "banana bread",
        "grilled salmon",
        "caesar salad",
        "beef stew",
        "apple pie",
        "garlic bread",
        "fried rice"
    ],
    
    'semantic': [
        "comfort food for a rainy day",
        "healthy dinner after gym",
        "quick weeknight meal",
        "romantic dinner for two",
        "impressive dish for guests",
        "light summer meal",
        "warm cozy winter food",
        "kid friendly lunch",
        "lazy sunday breakfast",
        "something sweet and indulgent"
    ],
    
    'dietary_cuisine': [
        "vegan dessert",
        "gluten free dinner",
        "italian pasta dish",
        "asian stir fry",
        "low carb meal"
    ]
}

# Flatten queries
all_queries = []
query_categories = {}
for category, queries in evaluation_queries.items():
    for q in queries:
        all_queries.append(q)
        query_categories[q] = category

print(f"Total queries: {len(all_queries)}")
for cat, queries in evaluation_queries.items():
    print(f"  {cat}: {len(queries)}")



# RUN BOTH ENGINES AND COLLECT TOP-k RESULTS
TOP_K = 10
results_data = []

for query in tqdm(all_queries, desc="Processing queries"):
    category = query_categories[query]
    
    # Get results from both engines
    tfidf_results = tfidf_engine.search(query, top_k=TOP_K)
    embed_results = embeddings_engine.search(query, top_k=TOP_K)
    
    # Extract IDs and scores
    tfidf_ids = [r[0] for r in tfidf_results]
    tfidf_scores = [r[1] for r in tfidf_results]
    embed_ids = [r[0] for r in embed_results]
    embed_scores = [r[1] for r in embed_results]
    
    # Calculate overlap
    tfidf_set = set(tfidf_ids)
    embed_set = set(embed_ids)
    overlap = tfidf_set.intersection(embed_set)
    overlap_pct = len(overlap) / TOP_K * 100 if TOP_K > 0 else 0
    
    results_data.append({
        'query': query,
        'category': category,
        'tfidf_ids': tfidf_ids,
        'tfidf_scores': tfidf_scores,
        'embed_ids': embed_ids,
        'embed_scores': embed_scores,
        'overlap_count': len(overlap),
        'overlap_pct': overlap_pct
    })

# Convert to DataFrame
results_df = pd.DataFrame(results_data)


# OVERLAP ANALYSIS

# statistics
avg_overlap = results_df['overlap_count'].mean()
avg_overlap_pct = results_df['overlap_pct'].mean()

print(f"\nOverall Statistics (Top-{TOP_K} results):")
print(f"Average overlap: {avg_overlap:.1f} recipes ({avg_overlap_pct:.1f}%)")
print(f"Min overlap: {results_df['overlap_count'].min()}")
print(f"Max overlap: {results_df['overlap_count'].max()}")

# Overlap by category
print(f"\nOverlap by Query Category:")
for category in evaluation_queries.keys():
    cat_data = results_df[results_df['category'] == category]
    cat_avg = cat_data['overlap_pct'].mean()
    print(f"  {category}: {cat_avg:.1f}% average overlap")

# Detailed per-query overlap
print(f"\nPer-Query Overlap:")
print("Query Category Overlap")
for _, row in results_df.iterrows():
    print(f"{row['query']} {row['category']} {row['overlap_count']}/{TOP_K}")



# SAMPLE RESULTS
print("\nSAMPLE RESULTS COMPARISON")
sample_queries = [
    "chocolate cake",           # keyword
    "comfort food for a rainy day",  # semantic
    "vegan dessert"             # dietary
]

def get_recipe_name(recipe_id):
    match = metadata_df[metadata_df['recipe_id'] == recipe_id]
    return match.iloc[0]['recipe_name'] if len(match) > 0 else f"Recipe {recipe_id}"

for query in sample_queries:
    row = results_df[results_df['query'] == query].iloc[0]
    print(f"\nQUERY: '{query}' ({row['category']})")
    print(f"Overlap: {row['overlap_count']}/{TOP_K} recipes in common")
    print("Rank TF-IDF Result Embeddings Result")

    n = min(5, len(row["tfidf_ids"]), len(row["embed_ids"])) # top 5
    for rank in range(n):
        tfidf_name = get_recipe_name(row['tfidf_ids'][rank])[:33]
        embed_name = get_recipe_name(row['embed_ids'][rank])[:33]

        # Mark if same recipe
        same = "same" if row['tfidf_ids'][rank] == row['embed_ids'][rank] else ""

        print(f"{rank+1} {tfidf_name} {embed_name} {same}")



# MANUAL EVALUATION FIlE (creation)
eval_template = []
for query in all_queries:
    row = results_df[results_df['query'] == query].iloc[0]

    n_tfidf = min(5, len(row["tfidf_ids"]))
    for rank in range(n_tfidf):
        eval_template.append({
            'query': query,
            'category': query_categories[query],
            'engine': 'TF-IDF',
            'rank': rank + 1,
            'recipe_id': row['tfidf_ids'][rank],
            'recipe_name': get_recipe_name(row['tfidf_ids'][rank]),
            'score': row['tfidf_scores'][rank],
            'relevant': '' 
        })

    n_embed = min(5, len(row["embed_ids"]))
    for rank in range(n_embed):
        eval_template.append({
            'query': query,
            'category': query_categories[query],
            'engine': 'Embeddings',
            'rank': rank + 1,
            'recipe_id': row['embed_ids'][rank],
            'recipe_name': get_recipe_name(row['embed_ids'][rank]),
            'score': row['embed_scores'][rank],
            'relevant': ''
        })

eval_template_df = pd.DataFrame(eval_template)
eval_template_df.to_csv("manual_evaluation_template.csv", index=False)

print(f"\nTotal judgments needed: {len(eval_template_df)}")
print(f"  - {len(all_queries)} queries × 5 results × 2 engines = {len(all_queries) * 5 * 2}")



# ANALYSIS OF MANUAL EVALUATIONS
def analyze_manual_evaluations(filepath="manual_evaluation_complete.csv"):
    print("\nANALYSIS OF MANUAL EVALUATIONS")
    try:
        eval_df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"\nFile '{filepath}' not found")
        print("Please complete the manual evaluation first.")
        return None

    # Check if evaluations are complete
    if eval_df['relevant'].isna().any() or (eval_df['relevant'] == '').any():
        missing = eval_df['relevant'].isna().sum() + (eval_df['relevant'] == '').sum()
        print(f"\n{missing} judgments are missing")
        print(" complete all evaluations")
        return None

    # Convert to int
    eval_df['relevant'] = eval_df['relevant'].astype(int)

    # Calculate Precision@5 for each query-engine pair
    precision_results = []

    for query in all_queries:
        query_data = eval_df[eval_df['query'] == query]
        category = query_categories[query]

        for engine in ['TF-IDF', 'Embeddings']:
            engine_data = query_data[query_data['engine'] == engine]
            relevant_count = engine_data['relevant'].sum()
            precision = relevant_count / 5

            precision_results.append({
                'query': query,
                'category': category,
                'engine': engine,
                'relevant_in_top5': relevant_count,
                'precision@5': precision
            })

    precision_df = pd.DataFrame(precision_results)

    # Overall comparison
    print("\nOVERALL RESULTS")

    tfidf_precision = precision_df[precision_df['engine'] == 'TF-IDF']['precision@5'].mean()
    embed_precision = precision_df[precision_df['engine'] == 'Embeddings']['precision@5'].mean()

    print("Average Precision@5:")
    print(f"TF-IDF: {tfidf_precision:.3f} ({tfidf_precision*100:.1f}%)")
    print(f"Embeddings: {embed_precision:.3f} ({embed_precision*100:.1f}%)")
    print(f"Difference: {embed_precision - tfidf_precision:+.3f}")

    # Comparison by category
    print("\nRESULTS BY QUERY CATEGORY")
    print("Category TF-IDF P@5 Embed P@5 Diff Winner")

    category_summary = []
    for category in evaluation_queries.keys():
        cat_data = precision_df[precision_df['category'] == category]
        tfidf_p = cat_data[cat_data['engine'] == 'TF-IDF']['precision@5'].mean()
        embed_p = cat_data[cat_data['engine'] == 'Embeddings']['precision@5'].mean()
        diff = embed_p - tfidf_p # calculate differenxe

        if abs(diff) < 0.05:
            winner = "Tie"
        elif diff > 0:
            winner = "Embeddings"
        else:
            winner = "TF-IDF"

        print(f"{category} {tfidf_p:.3f} {embed_p:.3f} {diff:+.3f} {winner}")

        category_summary.append({
            'category': category,
            'tfidf_precision': tfidf_p,
            'embed_precision': embed_p,
            'winner': winner
        })

    # Per-query breakdown (P@5 for queries)
    print("\nPER-QUERY BREAKDOWN")
    print("Query TF-IDF Embed Winner")

    query_winners = {'TF-IDF': 0, 'Embeddings': 0, 'Tie': 0}

    for query in all_queries:
        q_data = precision_df[precision_df['query'] == query]
        tfidf_p = q_data[q_data['engine'] == 'TF-IDF']['precision@5'].values[0]
        embed_p = q_data[q_data['engine'] == 'Embeddings']['precision@5'].values[0]

        if abs(tfidf_p - embed_p) < 0.01:
            winner = "Tie"
        elif tfidf_p > embed_p:
            winner = "TF-IDF"
        else:
            winner = "Embeddings"

        query_winners[winner] += 1
        print(f"{query} {tfidf_p:.2f} {embed_p:.2f} {winner}")

    # Win/Loss summary
    print("\nWIN/LOSS SUMMARY")
    print(f"TF-IDF wins: {query_winners['TF-IDF']} queries")
    print(f"Embeddings wins: {query_winners['Embeddings']} queries")
    print(f"Ties: {query_winners['Tie']} queries")

    # Save results
    precision_df.to_csv("evaluation_results.csv", index=False)

    return {
        'precision_df': precision_df,
        'tfidf_overall': tfidf_precision,
        'embed_overall': embed_precision,
        'category_summary': category_summary,
        'query_winners': query_winners
    }


results = analyze_manual_evaluations("manual_evaluation_complete.csv")  # only run after completing file


# QUICK VISUAL SUMMARY

print("\nDIFFERENCES BETWEEN ENGINES")
print(f"""
1. RESULT OVERLAP:
- Average overlap in top-{TOP_K}: {avg_overlap_pct:.1f}%
- so {100 - avg_overlap_pct:.1f}% of results are different

2. BY QUERY TYPE:""")
for category in evaluation_queries.keys():
    cat_data = results_df[results_df['category'] == category]
    cat_avg = cat_data['overlap_pct'].mean()
    print(f"- {category}: {cat_avg:.1f}% overlap")

Total queries: 25
  keyword: 10
  semantic: 10
  dietary_cuisine: 5


Processing queries:  12%|█▏        | 3/25 [00:00<00:05,  3.70it/s]

Processing queries: 100%|██████████| 25/25 [00:07<00:00,  3.31it/s]


Overall Statistics (Top-10 results):
Average overlap: 0.9 recipes (9.2%)
Min overlap: 0
Max overlap: 3

Overlap by Query Category:
  keyword             : 14.0% average overlap
  semantic            : 5.0% average overlap
  dietary_cuisine     : 8.0% average overlap

Per-Query Overlap:
Query                                    Category           Overlap
chocolate cake                           keyword              1/10
chicken soup                             keyword              2/10
pasta carbonara                          keyword              2/10
banana bread                             keyword              1/10
grilled salmon                           keyword              1/10
caesar salad                             keyword              3/10
beef stew                                keyword              0/10
apple pie                                keyword              2/10
garlic bread                             keyword              2/10
fried rice                               




In [13]:
query = "easy meal for kids"

# TF-IDF Results
tfidf_results = tfidf_engine.search(query, top_k=5)
display_search_results(tfidf_results, metadata_df, corpus_df, query, show_snippet=False)

# Embeddings Results
embed_results = embeddings_engine.search(query, top_k=5)
display_search_results_embeddings(embed_results, metadata_df, corpus_df, query, engine_name="EMBEDDINGS")


[1] cheesy chicken sandwiches
Score: 0.2570
Cooking Time: 20 min | Ingredients: 5 | Steps: 5
Description: a quick and easy meal and a kid pleaser!...

[2] 1 pot  4 item   sausage suprise
Score: 0.2481
Cooking Time: 50 min | Ingredients: 4 | Steps: 6
Description: i needed to make a quick easy meal for my kids one day and all i had was some kielbasa sausage, onions, potatoes and carrots.  therefore, i cut them u...

[3] hamburger tater tots casserole
Score: 0.2337
Cooking Time: 80 min | Ingredients: 8 | Steps: 4
Description: this is a tasty, easy meal that most kids love! i like to serve this with a warm loaf of bread and a salad....

[4] cheap hamburger potato casserole
Score: 0.2271
Cooking Time: 25 min | Ingredients: 3 | Steps: 7
Description: this is a super quick, inexpensive, and easy meal to make and it is kid friendly too!  it's comfort food at it's best!...

[5] grilled pork chops packets
Score: 0.1788
Cooking Time: 24 min | Ingredients: 7 | Steps: 11
Description: a fun and easy