# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib  # Used for saving and loading scikit-learn models
import os

# --- 1. Define File Names ---
# All three paths are relative, so they resolve against the current
# working directory of the process.

# Raw input CSV read by train_model().
DATA_FILE = 'Cleaned_Indian_Food_Dataset.csv'
# Fitted TF-IDF vectorizer: written by train_model() with joblib.dump and
# read back by get_recommendations() with joblib.load.
VECTORIZER_FILE = 'tfidf_vectorizer.joblib'
# Processed dish table: written by train_model() and read by
# get_recommendations() to map similarity scores back to dishes.
DATA_OUTPUT_FILE = 'processed_dishes.csv'

def train_model():
    """
    Fit a TF-IDF vectorizer on the dish ingredients and save the artifacts.

    Steps:
      1. Read DATA_FILE and keep only the columns used for recommendations.
      2. Drop rows whose 'Cleaned-Ingredients' value is missing and reset
         the index so row positions line up with TF-IDF matrix rows.
      3. Fit a TfidfVectorizer on the remaining ingredient strings.
      4. Write the fitted vectorizer to VECTORIZER_FILE (joblib) and the
         processed rows to DATA_OUTPUT_FILE (CSV, without the index).

    Input problems (data file missing, required columns missing, no rows
    with ingredients) are reported with an "Error: ..." message and the
    function returns early without writing any artifact.

    Returns:
        None
    """
    # Columns kept in the processed table. URL and the other descriptive
    # fields are carried along so they can be shown in the recommendation.
    required_columns = [
        'TranslatedRecipeName',
        'Cleaned-Ingredients',
        'Cuisine',
        'TotalTimeInMins',
        'URL',
    ]

    print("--- Starting Model Training ---")

    # --- 2. Load and Prepare Data ---
    if not os.path.exists(DATA_FILE):
        print(f"Error: Data file not found at {DATA_FILE}")
        return

    print(f"Loading data from {DATA_FILE}...")
    df = pd.read_csv(DATA_FILE)

    # Report a schema mismatch in the same style as the other input errors
    # instead of letting the column selection below raise a KeyError.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Data file is missing required columns: {missing_columns}")
        return

    df_processed = df[required_columns].copy()

    # Critically, drop any rows where ingredients are missing
    df_processed = df_processed.dropna(subset=['Cleaned-Ingredients'])

    # Reset index to ensure positional (.iloc) matching against the TF-IDF
    # matrix rows works correctly
    df_processed = df_processed.reset_index(drop=True)

    # With zero documents there is nothing to learn a vocabulary from, and
    # fitting would fail; stop with a clear message instead.
    if df_processed.empty:
        print("Error: No dishes with ingredients found; nothing to train on.")
        return

    print(f"Data loaded and processed. Total dishes: {len(df_processed)}")

    # --- 3. "Train" the TF-IDF Vectorizer ---
    print("Training TF-IDF vectorizer...")

    # Initialize the vectorizer. You could tune this, e.g.:
    # max_features=5000 (to limit vocab size)
    # min_df=2 (to ignore ingredients that appear in only 1 dish)
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the ingredients column.
    # This step "learns" the vocabulary and IDF weights. The matrix itself
    # is only used for the shape report below; it is not saved.
    tfidf_matrix = tfidf_vectorizer.fit_transform(df_processed['Cleaned-Ingredients'])

    print(f"Vectorizer trained. TF-IDF matrix shape: {tfidf_matrix.shape}")

    # --- 4. Save the "Model" Artifacts ---
    # We save the vectorizer so we can use the *exact same* vocabulary
    # and weights to transform user input later.
    print(f"Saving vectorizer to {VECTORIZER_FILE}...")
    joblib.dump(tfidf_vectorizer, VECTORIZER_FILE)

    # We save the processed data to match matrix indices to dish names
    print(f"Saving processed data to {DATA_OUTPUT_FILE}...")
    df_processed.to_csv(DATA_OUTPUT_FILE, index=False)

    print("--- Model training finished. Artifacts saved. ---")


# --- 5. Define the Recommendation Function ---
# This function would typically be in your main application (e.g., Flask app),
# not in the training script. We include it here for a complete example.
# It shows how to LOAD the artifacts you just saved.

def get_recommendations(user_ingredients, top_n=5):
    """
    Loads the trained model artifacts and returns dish recommendations.

    Dishes are ranked by cosine similarity between the TF-IDF vector of the
    user's ingredients and the TF-IDF vector of each dish's
    'Cleaned-Ingredients' text. Dishes with zero similarity are never
    returned, so fewer than ``top_n`` rows may come back.

    Args:
        user_ingredients (list | str | None): Ingredient strings. A single
            string is treated as one ingredient entry (it is not split into
            characters). ``None`` and blank entries are ignored.
        top_n (int | None): Maximum number of recommendations to return.
            Zero or a negative value yields no recommendations; ``None``
            places no limit on the number of matches.

    Returns:
        pandas.DataFrame: Up to ``top_n`` recommended dishes, best match
        first, with the columns 'TranslatedRecipeName', 'Cuisine',
        'TotalTimeInMins', 'similarity_score' and 'URL'. An empty frame
        with these same columns is returned when there is nothing to
        recommend or when the model files are missing.
    """
    result_columns = ['TranslatedRecipeName', 'Cuisine', 'TotalTimeInMins', 'similarity_score', 'URL']

    print(f"\n--- Getting Recommendations for: {user_ingredients} ---")

    # --- 1. Process User Input ---
    # A bare string would otherwise be joined character by character
    # ("rice" -> "r i c e"), which produces no meaningful tokens.
    if user_ingredients is None:
        user_ingredients = []
    elif isinstance(user_ingredients, str):
        user_ingredients = [user_ingredients]

    cleaned_terms = [str(item).strip() for item in user_ingredients if item is not None]
    input_string = " ".join(term for term in cleaned_terms if term)

    # Nothing requested or nothing to match on: skip loading the artifacts.
    # A negative top_n must not fall through to the slice below, where it
    # would mean "all but the last n" rather than "none".
    if (top_n is not None and top_n <= 0) or not input_string:
        return pd.DataFrame(columns=result_columns)

    # --- 2. Load Model Artifacts ---
    # In a real app, you'd load these once when the app starts.
    try:
        vectorizer = joblib.load(VECTORIZER_FILE)
        data = pd.read_csv(DATA_OUTPUT_FILE)
    except FileNotFoundError:
        print("Error: Model files not found. Please run the train_model() function first.")
        return pd.DataFrame(columns=result_columns)

    # --- 3. Re-create the TF-IDF Matrix ---
    # We don't need to save the giant matrix. We can recreate it
    # on the fly using the saved vectorizer and data.
    # Use .transform() as the vectorizer is already fitted.
    # Missing values (possible after the CSV round trip) are not valid
    # documents, so they are replaced by empty strings, which simply score 0.
    documents = data['Cleaned-Ingredients'].fillna('').astype(str)
    tfidf_matrix = vectorizer.transform(documents)

    # Transform the user's input using the FITTED vectorizer
    input_vector = vectorizer.transform([input_string])

    # --- 4. Compute Cosine Similarity ---
    # cosine_similarity returns a (1, n_dishes) array; flatten to 1D scores
    scores = cosine_similarity(input_vector, tfidf_matrix).flatten()

    # --- 5. Get Top N Matches ---
    # argsort is ascending, so reverse it to get the best matches first
    top_indices = scores.argsort()[::-1][:top_n]

    # --- 6. Format and Return Results ---
    recommendations = data.iloc[top_indices].copy()
    recommendations['similarity_score'] = scores[top_indices]

    # Filter out results with 0 similarity
    recommendations = recommendations[recommendations['similarity_score'] > 0]

    return recommendations[result_columns]

# --- Main execution block ---
if __name__ == "__main__":

    # Build the artifacts first. This only has to happen once, or again
    # whenever the underlying data changes.
    train_model()

    # The calls below mirror what an application would do at request time.
    # Every example is framed by the same 50-character rule.
    banner = "=" * 50

    # Example 1: default number of recommendations
    test_ingredients_1 = ['rice', 'milk', 'sugar', 'cardamom']
    print("\n" + banner)
    print("Example 1: 'Rice', 'Milk', 'Sugar'")
    recs1 = get_recommendations(test_ingredients_1)
    print(recs1)
    print(banner)

    # Example 2: limit the output to the three best matches
    test_ingredients_2 = ['chicken', 'onion', 'tomato', 'ginger', 'garlic']
    print("\n" + banner)
    print("Example 2: 'Chicken', 'Onion', 'Tomato'")
    recs2 = get_recommendations(test_ingredients_2, top_n=3)
    print(recs2)
    print(banner)

    # Example 3: default number of recommendations
    test_ingredients_3 = ['spinach', 'paneer', 'cream', 'ghee']
    print("\n" + banner)
    print("Example 3: 'Spinach', 'Paneer', 'Cream'")
    recs3 = get_recommendations(test_ingredients_3)
    print(recs3)
    print(banner)