In [1]:
# Common imports
import numpy as np
import pandas as pd
import zipfile as zp
import spacy
import os

# Render matplotlib figures inline in the notebook output and ask the inline
# backend for high-resolution ('retina') images.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Mount Google Drive in the Colab runtime and make the project folder the
# working directory, so the relative CSV paths used below resolve against it.
from google.colab import drive
drive.mount('/content/drive')

# Alternative project location, kept for reference (commented out, not used):
# folder_path= ('/content/drive/Othercomputers/Asus Zenbook 14/Degree/Y3 S2/ML Applications/Project')
folder_path = "/content/drive/My Drive/MLA Project"
os.chdir(folder_path)

Mounted at /content/drive


In [3]:
# Load the recipe/cuisine link table (relative to the project folder).
recipe_cuisine_df = pd.read_csv(filepath_or_buffer='recipes/recipe_links.csv')

In [4]:
# Inspect which columns the recipe/cuisine table provides.
recipe_cuisine_df.columns

Index(['Cuisine', 'Title', 'Link'], dtype='object')

In [5]:
# Load the per-recipe metadata table (relative to the project folder).
recipe_meta_df = pd.read_csv(filepath_or_buffer='recipes/recipe_metadata.csv')

In [8]:
# Preview the first five metadata rows.
recipe_meta_df.head(n=5)

Unnamed: 0,recipe_id,recipe,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
0,1,Cheesy Amish Breakfast Casserole,10 mins,55 mins,1 hr 15 mins,12,314.0,23g,12g,22g
1,2,Best Vinegar Coleslaw,15 mins,5 mins,20 mins,8,224.0,9g,35g,2g
2,3,Pennsylvania-Dutch Pickled Beets and Eggs,15 mins,30 mins,2 days 45 mins,8,252.0,5g,45g,7g
3,4,Amish Macaroni Salad,15 mins,10 mins,1 hr 25 mins,6,532.0,25g,66g,9g
4,5,Amish Friendship Bread Starter,30 mins,,10 days 40 mins,120,34.0,0g,8g,1g


In [6]:
# Number of rows in the metadata table.
recipe_meta_df.shape[0]

887

In [7]:
# Number of distinct recipe names; a value below the row count means at least
# one name occurs more than once.
recipe_meta_df['recipe'].nunique(dropna=True)

886

In [8]:
# Show the metadata rows that share the duplicated recipe name.
recipe_meta_df.loc[recipe_meta_df['recipe'].eq('Speculaas Cookies or Spicy Sinterklass Cakes')]

Unnamed: 0,recipe_id,recipe,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
552,553,Speculaas Cookies or Spicy Sinterklass Cakes,15 mins,15 mins,1 hr 30 mins,30,97.0,5g,11g,1g
573,574,Speculaas Cookies or Spicy Sinterklass Cakes,15 mins,15 mins,1 hr 30 mins,30,97.0,5g,11g,1g


In [9]:
# Drop the 'Link' column, which the merge below does not use.
# errors='ignore' keeps this cell re-runnable: recipe_cuisine_df is reassigned
# here, so on a second run 'Link' is already gone and a plain drop would raise
# KeyError.
recipe_cuisine_df = recipe_cuisine_df.drop(columns=['Link'], errors='ignore')

# Merge the DataFrames on the recipe name ('Title' on the left, 'recipe' on the
# right). The inner join keeps only names present in both tables, and a name
# that is repeated on either side yields one output row per matching pair, so
# the result can have more rows than either input.
recipe_df = pd.merge(
    recipe_cuisine_df,
    recipe_meta_df,
    left_on='Title',
    right_on='recipe',
    how='inner',
)

recipe_df.head()

Unnamed: 0,Cuisine,Title,recipe_id,recipe,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
0,Amish and Mennonite,Cheesy Amish Breakfast Casserole,1,Cheesy Amish Breakfast Casserole,10 mins,55 mins,1 hr 15 mins,12,314.0,23g,12g,22g
1,Amish and Mennonite,Best Vinegar Coleslaw,2,Best Vinegar Coleslaw,15 mins,5 mins,20 mins,8,224.0,9g,35g,2g
2,Amish and Mennonite,Pennsylvania-Dutch Pickled Beets and Eggs,3,Pennsylvania-Dutch Pickled Beets and Eggs,15 mins,30 mins,2 days 45 mins,8,252.0,5g,45g,7g
3,Amish and Mennonite,Amish Macaroni Salad,4,Amish Macaroni Salad,15 mins,10 mins,1 hr 25 mins,6,532.0,25g,66g,9g
4,Amish and Mennonite,Amish Friendship Bread Starter,5,Amish Friendship Bread Starter,30 mins,,10 days 40 mins,120,34.0,0g,8g,1g


In [10]:
# Row count of the merged table.
recipe_df.shape[0]

920

In [11]:
# Look for repeated values of the join key ('Title') in recipe_cuisine_df:
# the ten most frequent titles and how often each occurs.
recipe_cuisine_df['Title'].value_counts().iloc[:10]

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Spicy Potato Noodles (Bataka Sev),3
Torsk (Scandinavian Cod),3
Swedish Cream Wafers,2
Kroppkakor - Swedish Potato Dumplings,2
Sweet and Sour Red Cabbage,2
Swedish Meatballs with Creamy Dill Sauce,2
Healthier Swedish Meatballs,2
Swedish Meatballs I,2
Swedish Chocolate Balls (Chokladbollar),2
Swedish Sticky Chocolate Cake (Kladdkaka),2


In [12]:
# Columns that should lead the table, in this order
first_cols = ['recipe_id', 'Title', 'Cuisine']

# Every other column, kept in its current relative order
rest_cols = list(filter(lambda name: name not in first_cols, recipe_df.columns))

# Apply the new column order
recipe_df = recipe_df.loc[:, first_cols + rest_cols]

In [13]:
# Preview the first five rows after reordering the columns.
recipe_df.head(n=5)

Unnamed: 0,recipe_id,Title,Cuisine,recipe,prep_time,cook_time,total_time,servings,calories,fat,carbs,protein
0,1,Cheesy Amish Breakfast Casserole,Amish and Mennonite,Cheesy Amish Breakfast Casserole,10 mins,55 mins,1 hr 15 mins,12,314.0,23g,12g,22g
1,2,Best Vinegar Coleslaw,Amish and Mennonite,Best Vinegar Coleslaw,15 mins,5 mins,20 mins,8,224.0,9g,35g,2g
2,3,Pennsylvania-Dutch Pickled Beets and Eggs,Amish and Mennonite,Pennsylvania-Dutch Pickled Beets and Eggs,15 mins,30 mins,2 days 45 mins,8,252.0,5g,45g,7g
3,4,Amish Macaroni Salad,Amish and Mennonite,Amish Macaroni Salad,15 mins,10 mins,1 hr 25 mins,6,532.0,25g,66g,9g
4,5,Amish Friendship Bread Starter,Amish and Mennonite,Amish Friendship Bread Starter,30 mins,,10 days 40 mins,120,34.0,0g,8g,1g


In [14]:
# Number of merged rows per cuisine.
recipe_df['Cuisine'].value_counts()

Unnamed: 0_level_0,count
Cuisine,Unnamed: 1_level_1
Brazilian,64
Chinese,64
Canadian,64
Cuban,64
German,64
French,64
Filipino,64
Greek,64
Cajun and Creole,63
Australian and New Zealander,62


In [15]:
# Titles that occur in more than one row of the merged table
duplicated_titles = recipe_df['Title'].value_counts().loc[lambda counts: counts > 1].index

# For those titles, list title, cuisine and recipe_id side by side (sorted by
# title) so the cuisines sharing a recipe can be compared
dup_df = recipe_df.loc[
    recipe_df['Title'].isin(duplicated_titles),
    ['Title', 'Cuisine', 'recipe_id'],
].sort_values(by='Title')
dup_df

Unnamed: 0,Title,Cuisine,recipe_id
85,"After Traveling Around the World, These Are 10...",Argentinian,86
897,"After Traveling Around the World, These Are 10...",Peruvian,86
914,Authentic Louisiana Red Beans and Rice,Southern Recipes,258
257,Authentic Louisiana Red Beans and Rice,Cajun and Creole,258
913,Best Jambalaya,Southern Recipes,257
...,...,...,...
895,Spicy Potato Noodles (Bataka Sev),Pakistani,179
912,Spicy Potato Noodles (Bataka Sev),South African,179
908,Torsk (Scandinavian Cod),Scandinavian,532
532,Torsk (Scandinavian Cod),Danish,532


In [16]:
# Inspect every row carrying recipe_id 329.
recipe_df.loc[recipe_df['recipe_id'].eq(329), ['Title', 'Cuisine', 'prep_time']]

Unnamed: 0,Title,Cuisine,prep_time
328,Crispy Ginger Beef,Canadian,25 mins
436,Crispy Ginger Beef,Chinese,25 mins


In [None]:
# Inspect the rows for one specific title.
recipe_df.loc[recipe_df['Title'].eq('Chicken Tikka Masala'), ['Title', 'Cuisine', 'recipe_id']]

Unnamed: 0,Title,Cuisine,recipe_id
888,Chicken Tikka Masala,Indian,886


In [None]:
# Remove the 'Danish' row of a recipe that is listed under two cuisines.
# Repeat this filter for the other recipes that have been included twice.
# A row is kept unless it is BOTH 'Danish' AND this title.
recipe_df = recipe_df.loc[
    recipe_df['Cuisine'].ne('Danish')
    | recipe_df['Title'].ne('Speculaas Cookies or Spicy Sinterklass Cakes')
]


In [None]:
# Keep only rows whose cuisine is not 'Scandinavian'.
# NOTE(review): this removes every row of that cuisine, not only rows that
# duplicate a recipe listed under another cuisine - confirm this is intended.
recipe_df = recipe_df.loc[recipe_df['Cuisine'].ne('Scandinavian')]

In [None]:
# Remove the rows with recipe_id 574 (the second metadata entry of the
# duplicated 'Speculaas Cookies or Spicy Sinterklass Cakes' recipe).
recipe_df = recipe_df.loc[recipe_df['recipe_id'].ne(574)]

In [None]:
# Row count after the manual duplicate removal.
recipe_df.shape[0]

889

In [None]:
# Drop the 'recipe' column: recipe_df comes from an inner merge on
# Title == recipe, so it only repeats 'Title'.
# errors='ignore' keeps this cell re-runnable: recipe_df is reassigned here, so
# on a second run the column is already gone and a plain drop would raise
# KeyError.
recipe_df = recipe_df.drop(columns=['recipe'], errors='ignore')

In [None]:
# Persist the cleaned recipe table (duplicates removed) without the row index.
recipe_df.to_csv(path_or_buf='recipe_df.csv', index=False)

In [17]:
# Reload the saved recipe table. The following cells eliminate recipes that
# have no reviews, to maintain consistency with the corpus generated in NLP.
recipes_df = pd.read_csv(filepath_or_buffer='recipe_df.csv')

In [18]:
# Number of recipes in the reloaded table.
recipes_df.shape[0]

886

In [19]:
# Reload the reviews table (with its lemma column) and report its row count;
# the lemmas are grouped per recipe further below.
reviews_all_df = pd.read_csv(filepath_or_buffer='all_reviews_lemmas_2.csv')
print(reviews_all_df.shape[0])

111613


In [23]:
# Discard reviews with a missing 'nltk_lemmas' value (reviews that only have
# stars, not text); work on an independent copy of the remaining rows.
reviews_df = reviews_all_df.dropna(subset=['nltk_lemmas']).copy()

In [24]:
# Number of reviews that remain after the filter.
reviews_df.shape[0]

109550

In [22]:
# Recalculate the lemmas per recipe from reviews_df before removing items from reviews_df
import ast

# When loaded from CSV, each lemma list is read as a string and must be parsed
# back into a Python list. Values that are already lists are left untouched, so
# re-running this cell does not raise (literal_eval rejects non-string lists).
reviews_df['nltk_lemmas'] = reviews_df['nltk_lemmas'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

# Group by 'recipe_id' and 'recipe' and concatenate all 'nltk_lemmas' lists of
# a group into one flat list. The comprehension flattens in a single pass;
# sum(lists, []) would copy the growing result once per review.
grouped_lemmas = reviews_df.groupby(['recipe_id', 'recipe'])['nltk_lemmas'].apply(
    lambda lemmas: [lemma for review_lemmas in lemmas for lemma in review_lemmas]
)

# One row per (recipe_id, recipe) with its combined lemma list
recipes_corpus_df = grouped_lemmas.reset_index()

In [25]:
# Number of (recipe_id, recipe) groups that have at least one review with lemmas.
recipes_corpus_df.shape[0]

824

In [26]:
# To explain the length difference between recipes_df and recipes_corpus_df,
# collect the recipe ids on each side: all ids in the recipe table, and the ids
# that appear in the grouped review lemmas.
recipes_ids_all = set(recipes_df['recipe_id'].tolist())
recipes_ids_with_reviews = set(grouped_lemmas.index.get_level_values('recipe_id').tolist())

In [27]:
# Ids present in the recipe table but absent from the grouped reviews, and the
# recipe rows they belong to.
missing_ids = recipes_ids_all.difference(recipes_ids_with_reviews)
recipes_missing_reviews = recipes_df.loc[recipes_df['recipe_id'].isin(missing_ids)]

In [28]:
# Keep only the recipes whose id is not in missing_ids, i.e. drop the recipes
# without reviews from recipes_df.
recipes_with_reviews_df = recipes_df.loc[lambda frame: ~frame['recipe_id'].isin(missing_ids)]

In [29]:
# Number of recipes that have reviews.
recipes_with_reviews_df.shape[0]

823

In [30]:
# Persist only the recipes that have reviews, without the row index.
recipes_with_reviews_df.to_csv(path_or_buf='recipes_with_reviews.csv', index=False)