In [11]:
##############################################
# 1. Upload 'archive.zip' from Your Local Machine
##############################################
from google.colab import files

# Opens a file picker in the browser and blocks until the upload finishes.
uploaded = files.upload()

# 'uploaded' is a dictionary: { filename: file_bytes }
# NOTE(review): the recorded output of this cell reads
# "Saving archive.zip to archive (1).zip", i.e. when a file with the same name
# already exists in the working directory the upload is stored under a new
# name instead of replacing it. Any later step that opens 'archive.zip' by its
# fixed name may therefore read an older copy -- confirm which file is used.


Saving archive.zip to archive (1).zip


In [12]:
##############################################
# 2. Verify File and Unzip
##############################################
# List the files in the current working directory to ensure 'archive.zip' is there.
!ls

# Unzip the file. The '-o' flag overwrites existing files if necessary.
!unzip -o archive.zip


'archive (1).zip'   interactions_test.csv	  PP_recipes.csv	 RAW_recipes.csv
 archive.zip	    interactions_train.csv	  PP_users.csv		 sample_data
 ingr_map.pkl	    interactions_validation.csv   RAW_interactions.csv
Archive:  archive.zip
  inflating: PP_recipes.csv          
  inflating: PP_users.csv            
  inflating: RAW_interactions.csv    
  inflating: RAW_recipes.csv         
  inflating: ingr_map.pkl            
  inflating: interactions_test.csv   
  inflating: interactions_train.csv  
  inflating: interactions_validation.csv  


In [13]:
##############################################
# 3. Install & Import Dependencies (If Needed)
##############################################
# Generally, pandas, numpy, sklearn come pre-installed in Colab.
# But if you need something else, you can install it here. For example:
# !pip install pandas --quiet
# !pip install scikit-learn --quiet

import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [14]:
##############################################
# 4. Load & Inspect the Dataset
##############################################
# The unzip step should have left 'RAW_recipes.csv' in the working directory.
# Point csv_file_path somewhere else if your CSV is named differently.

csv_file_path = 'RAW_recipes.csv'
recipes_df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Show the first few rows so the column contents can be eyeballed.
print("Preview of RAW_recipes.csv:")
display(recipes_df.head())

# Print the header line followed by the column names as a plain Python list.
print("\nColumns in the dataset:", recipes_df.columns.tolist(), sep="\n")
# For this archive the list is:
# ['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
#  'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
#  'n_ingredients']


Preview of RAW_recipes.csv:


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8



Columns in the dataset:
['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags', 'nutrition', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients']


In [15]:
##############################################
# 5. Basic Data Cleaning & Preprocessing
##############################################
# 'ingredients' and 'tags' are stored as stringified lists, like "['salt', 'pepper']".
# They are parsed row by row, so one malformed or missing cell can no longer
# abort the conversion of the whole column and leave raw strings behind.

def _parse_list_cell(value):
    """Convert one stringified-list cell into a Python list.

    :param value: A cell value; normally a string such as "['salt', 'pepper']".
    :return: The parsed list. A value that is already a list is returned as-is
             (so re-running this cell is harmless). Missing values, non-string
             values and strings that do not evaluate to a list/tuple give None.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return None  # NaN or another unexpected type
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # The exceptions ast.literal_eval is documented to raise on bad input.
        return None
    return list(parsed) if isinstance(parsed, (list, tuple)) else None


# Work on a copy so the assignments below never write into a slice of another frame.
recipes_df = recipes_df.copy()

# --- ingredients (required): parse, then drop rows that are missing/empty/unparseable
ingredients_parsed = recipes_df['ingredients'].map(_parse_list_cell)
n_bad_ingredients = int((ingredients_parsed.isna() & recipes_df['ingredients'].notna()).sum())
if n_bad_ingredients:
    print(f"Could not parse 'ingredients' as a list for {n_bad_ingredients} row(s); "
          "dropping them. Check your CSV format.")
has_ingredients = ingredients_parsed.map(
    lambda items: isinstance(items, list) and len(items) > 0
).astype(bool)
recipes_df['ingredients'] = ingredients_parsed

# --- tags (optional): parse, and use an empty list where a row has no usable tags
if 'tags' in recipes_df.columns:
    tags_parsed = recipes_df['tags'].map(_parse_list_cell)
    n_bad_tags = int((tags_parsed.isna() & recipes_df['tags'].notna()).sum())
    if n_bad_tags:
        print(f"Could not parse 'tags' as a list for {n_bad_tags} row(s); "
              "using empty tag lists for them. Check your CSV format.")
    recipes_df['tags'] = tags_parsed.map(lambda tags: tags if isinstance(tags, list) else [])
else:
    recipes_df['tags'] = [[] for _ in range(len(recipes_df))]

# Keep only recipes with at least one ingredient. Resetting the index keeps row
# labels equal to row positions, which later positional lookups rely on.
recipes_df = recipes_df[has_ingredients].reset_index(drop=True)

# Fill missing recipe names/description with empty strings
recipes_df['name'] = recipes_df['name'].fillna('')
if 'description' in recipes_df.columns:
    recipes_df['description'] = recipes_df['description'].fillna('')
else:
    recipes_df['description'] = ''

print("\nData after cleaning:")
display(recipes_df.head())
print(f"Number of recipes after cleaning: {len(recipes_df)}")



Data after cleaning:


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"[weeknight, time-to-make, course, main-ingredi...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8


Number of recipes after cleaning: 231637


In [16]:
##############################################
# 6. Create a Combined Text Field
##############################################
# Merge the recipe's name, description, ingredients, and tags
# into a single 'combined_text' column to feed into TF-IDF.

def list_to_string(lst):
    """Join the items of ``lst`` into one string separated by single spaces.

    Every item is passed through ``str`` first, so non-string items are fine.
    An empty ``lst`` gives an empty string.
    """
    items_as_text = map(str, lst)
    return ' '.join(items_as_text)

# Flatten the two list-valued columns into plain space-separated text.
for list_column, text_column in (('ingredients', 'ingredients_text'), ('tags', 'tags_text')):
    recipes_df[text_column] = recipes_df[list_column].apply(list_to_string)

# Concatenate name, description, ingredients and tags (in that order),
# with a single space between consecutive parts.
combined = recipes_df['name']
for part in ('description', 'ingredients_text', 'tags_text'):
    combined = combined + ' ' + recipes_df[part]
recipes_df['combined_text'] = combined

print("\nSample combined_text:")
display(recipes_df[['name', 'combined_text']].head(3))



Sample combined_text:


Unnamed: 0,name,combined_text
0,arriba baked winter squash mexican style,arriba baked winter squash mexican style aut...
1,a bit different breakfast pizza,a bit different breakfast pizza this recipe c...
2,all in the kitchen chili,all in the kitchen chili this modified versio...


In [17]:
##############################################
# 7. TF-IDF Vectorization
##############################################
# Learn the vocabulary from the combined recipe text and turn every recipe
# into one row of a TF-IDF matrix. English stop words are ignored and the
# text is lower-cased before tokenizing.
tfidf = TfidfVectorizer(lowercase=True, stop_words='english')
recipe_tfidf = tfidf.fit_transform(raw_documents=recipes_df['combined_text'])

# Shape is (# of recipes, # of unique terms).
print("TF-IDF matrix shape:", recipe_tfidf.shape)


TF-IDF matrix shape: (231637, 79562)


In [18]:
##############################################
# 8. Recommendation Function
##############################################
def recommend_meals(user_preference, top_n=5):
    """
    Recommend top_n recipes based on a textual user_preference.

    The preference is vectorized with the already-fitted module-level ``tfidf``
    and compared against every row of ``recipe_tfidf`` by cosine similarity.
    Rows of ``recipe_tfidf`` are matched to rows of ``recipes_df`` by position.

    :param user_preference: (str) e.g., "vegan low-calorie dinner with tofu"
    :param top_n: (int) Maximum number of recipes to return. Values of 0 or
                  below return an empty DataFrame; values larger than the number
                  of recipes return all recipes.
    :return: A DataFrame with up to top_n rows of recipes_df plus a
             'similarity_score' column, ordered from most to least similar.
             A score of 0.0 means the recipe shares no vocabulary term with
             the preference text.
    """
    # Clamp first: with top_n == 0 a slice like [-0:] selects the WHOLE array,
    # and a negative top_n would select an unrelated slice.
    top_n = max(int(top_n), 0)

    # Convert user preference to TF-IDF
    user_vec = tfidf.transform([user_preference])

    # Calculate cosine similarity between user_vec and all recipes
    cos_sim = cosine_similarity(user_vec, recipe_tfidf).flatten()

    # Positions of the best matches, highest score first. A stable sort on the
    # negated scores keeps tied recipes in their original order, so results
    # are deterministic and no second sort is needed.
    top_indices = (-cos_sim).argsort(kind='stable')[:top_n]

    # Build result DataFrame
    results = recipes_df.iloc[top_indices].copy()
    results['similarity_score'] = cos_sim[top_indices]
    return results


In [21]:
##############################################
# 9. Test the Recommendation System
##############################################
# A handful of free-text preferences used to eyeball the recommender's output.
test_preferences = [
    "gluten-free dairy-free breakfast",
    "gluten-free dairy-free dinner with chicken",
    "gluten-free dairy-free dessert with chocolate",
    "high protein chicken salad",
    "gluten-free dairy-free quick dinner"
]

for pref in test_preferences:
    # Banner with the query being tested.
    print(
        "=================================================",
        f"User Preference: {pref}",
        "=================================================",
        sep="\n",
    )
    recommendations = recommend_meals(pref, top_n=3)

    # One entry per recommended recipe: title, score, separator line.
    # Other columns (e.g. 'ingredients', 'tags') are available on
    # `recommendations` if more detail is wanted.
    titles_and_scores = zip(recommendations['name'], recommendations['similarity_score'])
    for title, score in titles_and_scores:
        print(f"Title: {title}")
        print(f"Similarity Score: {score:.4f}")
        print("-------------------------------------------------")
    print("\n")


User Preference: gluten-free dairy-free breakfast
Title: gfcf breakfast bar
Similarity Score: 0.7143
-------------------------------------------------
Title: quinoa rice breakfast  gluten free  dairy free
Similarity Score: 0.7089
-------------------------------------------------
Title: dairy free cream soup   white sauce
Similarity Score: 0.6796
-------------------------------------------------


User Preference: gluten-free dairy-free dinner with chicken
Title: gluten free fried chicken
Similarity Score: 0.7006
-------------------------------------------------
Title: dairy free cream soup   white sauce
Similarity Score: 0.6990
-------------------------------------------------
Title: sweet and sour pineapple chicken
Similarity Score: 0.6888
-------------------------------------------------


User Preference: gluten-free dairy-free dessert with chocolate
Title: fudgy gluten free chocolate cake
Similarity Score: 0.7141
-------------------------------------------------
Title: gluten free 

In [20]:
##############################################
# 10. Next Steps (Optional)
##############################################
# 1. Incorporate the 'nutrition' column to filter or sort by calories, protein, etc.
# 2. If you have 'RAW_interactions.csv', combine it for user ratings (collaborative filtering).
# 3. Deploy via a web framework (Flask, FastAPI, or Streamlit) for easy user interaction.
#
# Enjoy experimenting!
