In [5]:
# --- 1. Import Necessary Libraries ---
import pandas as pd
import ast
import re
import nltk
import pickle # Used to save our Python objects

from sklearn.feature_extraction.text import TfidfVectorizer

# --- 2. Download NLTK Data (if not already downloaded) ---
# Maps each NLTK package id to the resource path used to detect it locally.
# 'punkt_tab' is listed alongside 'punkt' because recent NLTK releases load
# the tokenizer tables used by word_tokenize from that package.
required_nltk_resources = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'wordnet': 'corpora/wordnet',
    'stopwords': 'corpora/stopwords',
}

# Check every resource on its own so that one missing package does not
# trigger a re-download of the packages that are already installed.
missing_nltk_packages = []
for package_id, resource_path in required_nltk_resources.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        missing_nltk_packages.append(package_id)

if missing_nltk_packages:
    print("Downloading NLTK data...")
    for package_id in missing_nltk_packages:
        nltk.download(package_id, quiet=True)
    print("NLTK data downloaded.")

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# --- 3. Define the Preprocessing Function (exactly as before) ---
# Recipe-specific noise words that are dropped in addition to the generic
# English stop words. Written as a set literal, grouped by category.
remove_words = {
    # Measurement units and counting words
    'cup', 'cups', 'teaspoon', 'teaspoons', 'tablespoon', 'tablespoons',
    'g', 'kg', 'oz', 'ounce', 'ounces', 'pound', 'pounds', 'lb', 'lbs',
    'clove', 'cloves', 'pinch', 'dash',
    # Preparation and freshness descriptors
    'fresh', 'chopped', 'sliced', 'diced', 'minced',
    # Size descriptors
    'large', 'small', 'medium',
    # Packaging
    'package', 'can', 'jar', 'bottle',
    # Filler words (e.g. "salt to taste", "butter or margarine")
    'to', 'taste', 'or',
}

# Generic English stop words from the NLTK corpus, as a set for fast lookup.
stop_words = set(stopwords.words('english'))

# Shared lemmatizer instance used by the preprocessing function.
lemmatizer = WordNetLemmatizer()

# Compiled once so the per-row work in the function is only substitution.
_QUANTITY_PATTERN = re.compile(r'\d+/\d+|\d+')
_PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')


def clean_and_process_ingredients(ingredient_list_str):
    """Turn a stringified list of ingredient lines into one cleaned string.

    The input is parsed with ``ast.literal_eval``. Each string item is
    lower-cased, stripped of digits/fractions and punctuation, tokenized,
    and lemmatized. Tokens that are English stop words or listed in
    ``remove_words`` are discarded.

    Args:
        ingredient_list_str: Text of a Python literal, normally a list of
            ingredient strings such as ``"['2 cups flour', '1 egg']"``.

    Returns:
        The surviving lemmatized tokens joined by single spaces. An empty
        string is returned when the input cannot be parsed as a literal or
        does not evaluate to an iterable value.
    """
    try:
        parsed = ast.literal_eval(ingredient_list_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # All of these are documented failure modes of literal_eval on
        # malformed input; treat any of them as "no usable ingredients".
        return ""

    # A bare string literal is a single ingredient; iterating it directly
    # would process it one character at a time.
    if isinstance(parsed, str):
        parsed = [parsed]

    try:
        items = iter(parsed)
    except TypeError:
        # Scalars such as numbers or None carry no ingredient text.
        return ""

    cleaned_ingredients = []
    for ingredient_text in items:
        # Skip non-text entries instead of failing on .lower(), so one odd
        # row cannot abort processing of the whole dataset.
        if not isinstance(ingredient_text, str):
            continue

        ingredient_text = ingredient_text.lower()
        ingredient_text = _QUANTITY_PATTERN.sub('', ingredient_text)
        ingredient_text = _PUNCTUATION_PATTERN.sub('', ingredient_text)

        for word in word_tokenize(ingredient_text):
            lemmatized_word = lemmatizer.lemmatize(word)
            # Filtering happens after lemmatization, so the lemma is what
            # gets compared against both word lists.
            if (
                lemmatized_word
                and lemmatized_word not in stop_words
                and lemmatized_word not in remove_words
            ):
                cleaned_ingredients.append(lemmatized_word)

    return ' '.join(cleaned_ingredients)

# --- 4. Load, Process, and Save the Data ---
# Single source of truth for the input path, so the load call and the
# "not found" message below always name the same file.
DATASET_PATH = 'Recipe_dataset.csv'

print("Starting the model building process...")

try:
    # Load the dataset
    print(f"Loading dataset: {DATASET_PATH}")
    df = pd.read_csv(DATASET_PATH)

    # Clean the ingredients (one cleaned string per recipe row)
    print("Processing ingredients for all recipes... (This might take a few minutes)")
    df['cleaned_ingredients'] = df['ingredients'].apply(clean_and_process_ingredients)

    # Create and fit the TF-IDF Vectorizer.
    # min_df=2 ignores terms that appear in fewer than two recipes.
    print("Building TF-IDF matrix...")
    tfidf = TfidfVectorizer(stop_words='english', min_df=2)
    tfidf_matrix = tfidf.fit_transform(df['cleaned_ingredients'])

    # --- 5. Save the Necessary Objects to .pkl Files ---
    print("Saving objects to .pkl files...")

    # The DataFrame is saved so recipe details can be looked up later;
    # the cleaned column is dropped to save space.
    df_to_save = df.drop(columns=['cleaned_ingredients'])

    # Output file name -> object to pickle, in the order they are reported.
    artifacts = {
        'tfidf_vectorizer.pkl': tfidf,
        'tfidf_matrix.pkl': tfidf_matrix,
        'recipe_df.pkl': df_to_save,
    }
    for artifact_path, artifact in artifacts.items():
        with open(artifact_path, 'wb') as f:
            pickle.dump(artifact, f)

    print("\nProcess complete! The following files have been created:")
    for artifact_path in artifacts:
        print(f"- {artifact_path}")
    print("\nYou are now ready to run the Streamlit app.")

except FileNotFoundError:
    # Report the file that was actually requested above.
    print(f"Error: '{DATASET_PATH}' not found. Please place it in the same directory.")
except Exception as e:
    # Top-level boundary of the script: report the failure instead of a traceback.
    print(f"An error occurred: {e}")


Downloading NLTK data...
NLTK data downloaded.
Starting the model building process...
Loading dataset: Recipe_dataset.csv
Processing ingredients for all recipes... (This might take a few minutes)
Building TF-IDF matrix...
Saving objects to .pkl files...

Process complete! The following files have been created:
- tfidf_vectorizer.pkl
- tfidf_matrix.pkl
- recipe_df.pkl

You are now ready to run the Streamlit app.
