In [1]:
"""
Content-based Recipe Recommender — Colab-ready script

This is content-based filtering: it uses neither ratings nor collaborative signals.
"""
# If running in Colab you can optionally mount Drive:
# from google.colab import drive

import re
import pickle
from typing import List, Optional, Dict
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import os

# -----------------------
# Configuration / Helpers
# -----------------------
# File names of the pickled model artifacts. build_or_load_model joins each
# one with its persist_dir to get the path it loads from / saves to.
TFIDF_PICKLE = "tfidf_matrix.pkl"  # TF-IDF matrix
VECTORIZER_PICKLE = "tfidf_vectorizer.pkl"  # fitted TfidfVectorizer
COSINE_PICKLE = "cosine_sim.pkl"  # pairwise similarity matrix
INDICES_PICKLE = "indices.pkl"  # lowercased title -> row index Series

def clean_text(s: Optional[str]) -> str:
    """
    Normalise a raw field value into lowercase text.

    None and float NaN become the empty string. Any other value is
    stringified and lowercased; every character other than a-z, 0-9, space,
    comma, '#' and '-' is replaced by a space, after which runs of whitespace
    are collapsed and the ends are trimmed.
    """
    is_missing = s is None or (isinstance(s, float) and np.isnan(s))
    if is_missing:
        return ""
    lowered = str(s).lower()
    # keep alphanumerics, commas, hyphens, hashes (for tags), spaces
    filtered = re.sub(r"[^a-z0-9 ,#-]", " ", lowered)
    return re.sub(r"\s+", " ", filtered).strip()

def make_soup(row: pd.Series, fields: List[str] = None) -> str:
    """
    Concatenate the cleaned text of selected fields of one recipe row.

    fields lists the keys to read, in order; None selects
    title, ingredients, tags and cuisine. Keys absent from row are skipped,
    each present value goes through clean_text, and values that clean to an
    empty string are left out of the space-joined result.
    """
    # field order matters if you want to weight them manually later
    selected = ["title", "ingredients", "tags", "cuisine"] if fields is None else fields
    cleaned = (clean_text(row.get(name, "")) for name in selected if name in row)
    return " ".join(text for text in cleaned if text)

# -----------------------
# Core pipeline functions
# -----------------------
def build_or_load_model(df: pd.DataFrame,
                        soup_fields: List[str] = None,
                        force_rebuild: bool = False,
                        persist_dir: Optional[str] = None):
    """
    Build (or load from disk) the TF-IDF matrix, cosine similarity matrix and title index.

    Args:
        df: Recipe table; must contain a 'title' column. The caller's frame
            is not modified.
        soup_fields: Columns concatenated into the per-recipe 'soup' text.
            None lets make_soup use its default field list.
        force_rebuild: When True, ignore persisted artifacts and rebuild.
        persist_dir: Directory holding the pickled artifacts; defaults to the
            current working directory. It is created on save if missing.

    Returns:
        (tfidf_matrix, vectorizer, cosine_sim, indices, df), where the
        returned df is a copy of the input with a fresh 0..n-1 index and an
        added 'soup' column, and indices maps lowercased title -> row
        position in cosine_sim (first occurrence wins for duplicate titles).

    Raises:
        ValueError: if df has no rows.
    """
    if df.empty:
        raise ValueError("Cannot build a recommender from an empty DataFrame.")

    if persist_dir is None:
        persist_dir = os.getcwd()

    tfidf_path = os.path.join(persist_dir, TFIDF_PICKLE)
    vectorizer_path = os.path.join(persist_dir, VECTORIZER_PICKLE)
    cosine_path = os.path.join(persist_dir, COSINE_PICKLE)
    indices_path = os.path.join(persist_dir, INDICES_PICKLE)
    artifact_paths = (tfidf_path, vectorizer_path, cosine_path, indices_path)

    # Work on a copy with a fresh 0..n-1 index: rows of cosine_sim are
    # positional, so row labels must equal row positions for lookups to agree.
    df = df.copy().reset_index(drop=True)
    n_rows = len(df)

    # Create 'soup'
    df['soup'] = df.apply(lambda r: make_soup(r, fields=soup_fields), axis=1)

    # Try loading precomputed artifacts
    if (not force_rebuild) and all(os.path.exists(p) for p in artifact_paths):
        try:
            # NOTE(security): pickle.load executes arbitrary code embedded in
            # the file; only point persist_dir at a location you trust.
            with open(vectorizer_path, "rb") as f:
                vectorizer = pickle.load(f)
            with open(tfidf_path, "rb") as f:
                tfidf_matrix = pickle.load(f)
            with open(cosine_path, "rb") as f:
                cosine_sim = pickle.load(f)
            with open(indices_path, "rb") as f:
                indices = pickle.load(f)
            # Artifacts built for another dataset would silently return wrong
            # neighbours. Only the size is checked; pass force_rebuild=True
            # when the contents changed but the row count did not.
            if tfidf_matrix.shape[0] != n_rows or cosine_sim.shape != (n_rows, n_rows):
                raise ValueError(
                    f"cached artifacts do not match the current dataset ({n_rows} rows)")
        except Exception as e:
            # Best effort: any problem with the cache falls through to a rebuild.
            print("Failed loading artifacts, rebuilding. Reason:", e)
        else:
            print("Loaded precomputed model artifacts from", persist_dir)
            return tfidf_matrix, vectorizer, cosine_sim, indices, df

    # Vectorize soups
    # Using uni- and bi-grams often helps capture short phrases like 'garlic butter'
    vectorizer = TfidfVectorizer(ngram_range=(1,2), stop_words='english', max_features=20000)
    tfidf_matrix = vectorizer.fit_transform(df['soup'].values)
    print("Built TF-IDF matrix:", tfidf_matrix.shape)

    # TF-IDF rows are L2-normalised by default, so the plain dot product
    # computed by linear_kernel equals cosine similarity and is cheaper.
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    print("Computed cosine similarity matrix")

    # Build indices map (lowercased title -> row position). De-duplicate on
    # the title keys, keeping the first occurrence: Series.drop_duplicates()
    # compares the values (row positions), which are always unique, so it
    # would leave duplicate titles in place.
    indices = pd.Series(np.arange(n_rows), index=df['title'].str.lower())
    indices = indices[~indices.index.duplicated(keep='first')]

    # Persist artifacts (best effort: a failure here only costs a rebuild next time)
    try:
        os.makedirs(persist_dir, exist_ok=True)
        for path, artifact in ((vectorizer_path, vectorizer),
                               (tfidf_path, tfidf_matrix),
                               (cosine_path, cosine_sim),
                               (indices_path, indices)):
            with open(path, "wb") as f:
                pickle.dump(artifact, f)
        print("Saved model artifacts to", persist_dir)
    except Exception as e:
        print("Warning: failed to persist artifacts:", e)

    return tfidf_matrix, vectorizer, cosine_sim, indices, df

def get_recommendations(title: str,
                        df: pd.DataFrame,
                        cosine_sim: np.ndarray,
                        indices: pd.Series,
                        topn: int = 5) -> List[Dict]:
    """
    Return the topn recipes most similar to the recipe named title.

    Args:
        title: Recipe title to look up (matched case-insensitively).
        df: Recipe table whose row order matches the rows of cosine_sim.
        cosine_sim: Square matrix of pairwise similarity scores.
        indices: Series mapping lowercased title -> row position in cosine_sim.
        topn: Maximum number of results; values <= 0 yield an empty list.

    Returns:
        List of dicts ordered by descending score (ties keep row order), each
        with keys: index (row position), score, title, plus id and cuisine
        when df has those columns. The queried recipe itself is excluded.

    Raises:
        ValueError: if title is not present in indices.
    """
    title_key = str(title).lower()
    if title_key not in indices:
        # str() guards the join against non-string titles (e.g. NaN)
        sample_titles = ', '.join(str(t) for t in df['title'].head(20).tolist())
        raise ValueError(f"Title '{title}' not found in dataset. Available sample titles:\n{sample_titles}")

    idx = indices[title_key]
    if isinstance(idx, pd.Series):
        # Duplicate titles make the lookup return several rows; use the first.
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()
    # A stable sort on the negated scores ranks high-to-low while keeping
    # equal scores in row order, the same order a stable sorted(reverse=True) gives.
    ranked = np.argsort(-scores, kind="stable")
    ranked = ranked[ranked != idx][:max(int(topn), 0)]

    titles = df['title']
    ids = df['id'] if 'id' in df.columns else None
    cuisines = df['cuisine'] if 'cuisine' in df.columns else None

    results = []
    for pos in ranked:
        pos = int(pos)
        # Positional access: pos is a row number of cosine_sim, not an index label.
        entry = {
            "index": pos,
            "score": float(scores[pos]),
            "title": str(titles.iloc[pos])
        }
        if ids is not None:
            raw_id = ids.iloc[pos]
            try:
                entry['id'] = int(raw_id)
            except (TypeError, ValueError, OverflowError):
                # Non-numeric ids (strings, NaN, ...) are passed through as-is.
                entry['id'] = raw_id
        if cuisines is not None:
            entry['cuisine'] = cuisines.iloc[pos]
        results.append(entry)
    return results

# -----------------------
# Example / CLI usage
# -----------------------
# Built-in demo recipes; load_data_from_csv turns these into a DataFrame when
# no CSV path is given. Each record carries id, title, ingredients, cuisine
# and tags, with ingredients and tags stored as comma-separated strings.
SAMPLE_DATA = [
    {"id": 1, "title": "Classic Margherita Pizza", "ingredients": "flour, water, yeast, tomato, mozzarella, basil, olive oil, salt", "cuisine": "Italian", "tags": "vegetarian, pizza, baked"},
    {"id": 2, "title": "Spicy Chicken Tikka Masala", "ingredients": "chicken, yogurt, garam masala, tomato, ginger, garlic, cream", "cuisine": "Indian", "tags": "spicy, curry, chicken"},
    {"id": 3, "title": "Vegetable Stir Fry", "ingredients": "broccoli, bell pepper, carrot, soy sauce, garlic, sesame oil", "cuisine": "Chinese", "tags": "vegetarian, quick, stir-fry"},
    {"id": 4, "title": "Beef Tacos", "ingredients": "beef, taco shells, lettuce, tomato, cheddar, onion, cumin", "cuisine": "Mexican", "tags": "handheld, beef"},
    {"id": 5, "title": "Penne alla Vodka", "ingredients": "penne, tomato, cream, vodka, parmesan, garlic", "cuisine": "Italian", "tags": "pasta, creamy"},
    {"id": 6, "title": "Pad Thai", "ingredients": "rice noodles, shrimp, tamarind, fish sauce, egg, bean sprouts, peanuts", "cuisine": "Thai", "tags": "noodles, sweet-sour"},
    {"id": 7, "title": "Chana Masala", "ingredients": "chickpeas, tomato, onion, garam masala, cumin, coriander", "cuisine": "Indian", "tags": "vegetarian, protein-rich"},
    {"id": 8, "title": "Grilled Cheese Sandwich", "ingredients": "bread, cheddar, butter", "cuisine": "American", "tags": "quick, comfort"},
    {"id": 9, "title": "Falafel Wrap", "ingredients": "chickpeas, parsley, cumin, garlic, pita, tahini", "cuisine": "Mediterranean", "tags": "vegetarian, street-food"},
    {"id": 10, "title": "Shrimp Scampi", "ingredients": "shrimp, garlic, butter, lemon, parsley, linguine", "cuisine": "Italian", "tags": "seafood, pasta"},
    {"id": 11, "title": "Black Bean Burrito", "ingredients": "black beans, rice, tortilla, avocado, salsa, cheese", "cuisine": "Mexican", "tags": "vegetarian, handheld"},
    {"id": 12, "title": "Green Curry with Tofu", "ingredients": "tofu, coconut milk, green curry paste, basil, eggplant", "cuisine": "Thai", "tags": "vegetarian, spicy"}
]

def load_data_from_csv(csv_path: Optional[str] = None) -> pd.DataFrame:
    """
    Load the recipe table from a CSV file, or from SAMPLE_DATA when no path is given.

    The result always has 'ingredients', 'tags' and 'cuisine' columns (created
    empty when absent, otherwise with missing values replaced by "") and an
    'id' column (numbered 1..n when the data has none).

    Raises:
        ValueError: if the data has no 'title' column.
    """
    if not csv_path:
        df = pd.DataFrame(SAMPLE_DATA)
        print("Using sample dataset, shape:", df.shape)
    else:
        df = pd.read_csv(csv_path)
        print(f"Loaded CSV from {csv_path}, shape: {df.shape}")

    # 'title' is the only mandatory column
    if 'title' not in df.columns:
        raise ValueError("Data must contain a 'title' column.")

    # blank out missing text so downstream string handling never sees NaN
    for text_col in ('ingredients', 'tags', 'cuisine'):
        df[text_col] = df[text_col].fillna("") if text_col in df.columns else ""

    # synthesise a 1-based numeric id when the data has none
    if 'id' not in df.columns:
        df['id'] = range(1, len(df) + 1)
    return df

if __name__ == "__main__":
    # Demo run: load data, build the model, print neighbours for a few
    # recipes, then write each recipe's single best match to a CSV file.

    # Example: in Colab set CSV_PATH to your file or leave None to use sample
    CSV_PATH = None  # e.g. '/content/drive/MyDrive/recipes.csv'
    df = load_data_from_csv(CSV_PATH)

    # /content is Colab's working directory. Elsewhere it normally does not
    # exist, and writing the summary there would crash, so fall back to the
    # current directory.
    OUTPUT_DIR = "/content" if os.path.isdir("/content") else os.getcwd()

    # Build model (set force_rebuild=True to skip loading any persisted artifacts)
    tfidf_matrix, vectorizer, cosine_sim, indices, df = build_or_load_model(df, soup_fields=None, force_rebuild=True, persist_dir=OUTPUT_DIR)

    # Example queries
    queries = [
        "Classic Margherita Pizza",
        "Chana Masala",
        "Penne alla Vodka",
        "Pad Thai",
    ]

    for q in queries:
        print("\nQuery:", q)
        try:
            recs = get_recommendations(q, df=df, cosine_sim=cosine_sim, indices=indices, topn=5)
            for r in recs:
                cuisine = r.get('cuisine', 'N/A')
                print(f" - {r['title']} (cuisine: {cuisine}) — score: {r['score']:.3f}")
        except Exception as e:
            # Keep the demo going; report the failing query instead of aborting.
            print("  Error:", e)

    # Save a small CSV of the top recommendation for each item (example)
    summary_rows = []
    for _, row in df.iterrows():
        try:
            recs = get_recommendations(row['title'], df=df, cosine_sim=cosine_sim, indices=indices, topn=1)
        except Exception as e:
            # Report the skipped row rather than dropping it silently.
            print(f"  Skipping '{row['title']}' in summary:", e)
            continue
        best = recs[0] if recs else {}
        summary_rows.append({
            "source_id": row['id'],
            "source_title": row['title'],
            "rec_title": best.get('title', ''),
            "rec_id": best.get('id', ''),
            "score": best.get('score', 0.0)
        })
    summary_df = pd.DataFrame(summary_rows)
    outpath = os.path.join(OUTPUT_DIR, "recommendation_summary.csv")
    summary_df.to_csv(outpath, index=False)
    print("\nSaved sample recommendation summary to", outpath)

Using sample dataset, shape: (12, 5)
Built TF-IDF matrix: (12, 250)
Computed cosine similarity matrix
Saved model artifacts to /content

Query: Classic Margherita Pizza
 - Penne alla Vodka (cuisine: Italian) — score: 0.035
 - Green Curry with Tofu (cuisine: Thai) — score: 0.033
 - Vegetable Stir Fry (cuisine: Chinese) — score: 0.031
 - Chana Masala (cuisine: Indian) — score: 0.026
 - Shrimp Scampi (cuisine: Italian) — score: 0.023

Query: Chana Masala
 - Spicy Chicken Tikka Masala (cuisine: Indian) — score: 0.192
 - Falafel Wrap (cuisine: Mediterranean) — score: 0.075
 - Beef Tacos (cuisine: Mexican) — score: 0.066
 - Classic Margherita Pizza (cuisine: Italian) — score: 0.026
 - Penne alla Vodka (cuisine: Italian) — score: 0.016

Query: Penne alla Vodka
 - Shrimp Scampi (cuisine: Italian) — score: 0.076
 - Spicy Chicken Tikka Masala (cuisine: Indian) — score: 0.050
 - Classic Margherita Pizza (cuisine: Italian) — score: 0.035
 - Falafel Wrap (cuisine: Mediterranean) — score: 0.017
 - C