# Recipes dataset EDA and Cleaning

Link to dataset: https://www.kaggle.com/datasets/irkaal/foodcom-recipes-and-reviews

In [1]:
import pandas as pd
import numpy as np
import gensim.downloader
import re
from gensim.models import word2vec, phrases
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation, strip_numeric,\
                    strip_non_alphanum, strip_multiple_whitespaces, strip_short
from textblob import TextBlob, Word
import collections
from numpy import dot
from numpy.linalg import norm
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_columns', None)

In [28]:
# Load the raw recipes CSV; the path is relative to the directory the notebook runs from.
recipe_df = pd.read_csv(
    '../../data/test_data/kaggle_recipes/r_food_recipes.csv'
)

In [29]:
#----check for null values
# info() lists the non-null count per column; any count below the total row count means missing values.
recipe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   RecipeId                    522517 non-null  int64  
 1   Name                        522517 non-null  object 
 2   AuthorId                    522517 non-null  int64  
 3   AuthorName                  522517 non-null  object 
 4   CookTime                    439972 non-null  object 
 5   PrepTime                    522517 non-null  object 
 6   TotalTime                   522517 non-null  object 
 7   DatePublished               522517 non-null  object 
 8   Description                 522512 non-null  object 
 9   Images                      522516 non-null  object 
 10  RecipeCategory              521766 non-null  object 
 11  Keywords                    505280 non-null  object 
 12  RecipeIngredientQuantities  522514 non-null  object 
 13  RecipeIngredie

## Cleaning DF

In [30]:
#----keep wanted columns
# Only the columns listed here are carried into the cleaning steps below.
recipe_df = recipe_df[
    [
        "Name",
        "CookTime",
        "PrepTime",
        "TotalTime",
        "Images",
        "RecipeCategory",
        "Keywords",
        "RecipeIngredientQuantities",
        "RecipeIngredientParts",
        "Calories",
        "RecipeServings",
        "RecipeInstructions",
    ]
]
                       

In [31]:
#---work on an independent (deep) copy so the column-reduced frame stays available for reference
new_recipe_df = recipe_df.copy(deep=True)

In [32]:
# Preview a single row to see the raw format of each retained column.
new_recipe_df.head(1)

Unnamed: 0,Name,CookTime,PrepTime,TotalTime,Images,RecipeCategory,Keywords,RecipeIngredientQuantities,RecipeIngredientParts,Calories,RecipeServings,RecipeInstructions
0,Low-Fat Berry Blue Frozen Dessert,PT24H,PT45M,PT24H45M,"c(""https://img.sndimg.com/food/image/upload/w_...",Frozen Desserts,"c(""Dessert"", ""Low Protein"", ""Low Cholesterol"",...","c(""4"", ""1/4"", ""1"", ""1"")","c(""blueberries"", ""granulated sugar"", ""vanilla ...",170.9,4.0,"c(""Toss 2 cups berries with sugar."", ""Let stan..."


In [33]:
# Re-check non-null counts on the reduced frame before filling missing values.
new_recipe_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 522517 entries, 0 to 522516
Data columns (total 12 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Name                        522517 non-null  object 
 1   CookTime                    439972 non-null  object 
 2   PrepTime                    522517 non-null  object 
 3   TotalTime                   522517 non-null  object 
 4   Images                      522516 non-null  object 
 5   RecipeCategory              521766 non-null  object 
 6   Keywords                    505280 non-null  object 
 7   RecipeIngredientQuantities  522514 non-null  object 
 8   RecipeIngredientParts       522517 non-null  object 
 9   Calories                    522517 non-null  float64
 10  RecipeServings              339606 non-null  float64
 11  RecipeInstructions          522517 non-null  object 
dtypes: float64(2), object(10)
memory usage: 47.8+ MB


In [34]:
#----Fill Null values with placeholders ('Not Provided' for text columns, 0 for servings)

new_recipe_df["CookTime"] = new_recipe_df['CookTime'].fillna("Not Provided")
new_recipe_df['RecipeCategory'] = new_recipe_df['RecipeCategory'].fillna("Not Provided")
# 0 is a temporary placeholder so the column can be cast to int64; servings_clean later maps it to "Not Provided".
new_recipe_df['RecipeServings'] = new_recipe_df['RecipeServings'].fillna(0).astype("int64")
# Fill with the empty-vector literal "character(0)": once string_to_list strips the "c()" characters
# it becomes "haracter(0", which is exactly the sentinel further_clean_images replaces with "Not Provided".
# (The previous fill value "[haracter(0]" kept its brackets through the strip and never matched the sentinel.)
new_recipe_df["Images"] = new_recipe_df["Images"].fillna("character(0)")

In [35]:
#----Clean recipe servings column

def servings_clean(number):
    """Return "Not Provided" when ``number`` equals 0, otherwise return ``number`` unchanged.

    0 is the placeholder written into the servings column for missing values.
    """
    return "Not Provided" if number == 0 else number


In [36]:
#----function to clean cook time, prep time and total time column into readable format

def time_clean(string):
    """Convert a duration string such as ``"PT24H45M"`` into readable text.

    Parameters
    ----------
    string : str
        Duration text using ``H``/``M``/``S`` unit letters with a ``PT`` prefix,
        or a placeholder such as ``"Not Provided"``.

    Returns
    -------
    str
        The duration with unit letters expanded, e.g. ``"24 Hour(s) 45 Minute(s)"``,
        without leading or trailing whitespace. Text that contains none of the
        unit letters (e.g. ``"Not Provided"``) is returned unchanged.
    """
    # strip("PT") removes any run of 'P'/'T' characters from both ends, not just a literal "PT" prefix.
    string = string.strip("PT")
    string = string.replace("H", " Hour(s) ")
    string = string.replace("M", " Minute(s) ")
    string = string.replace("S", " Second(s)")

    # The hour/minute labels end with a space so consecutive units are separated;
    # trim the one left dangling after the final unit.
    return string.strip()


In [37]:
#----function to further clean instructions column: strip leading newlines/quotes, capitalize each step and end it with a single period

def further_clean_instructions(lst):
    """Tidy each instruction string in ``lst`` and return the results as a new list.

    For every element: leading newline and double-quote characters are removed,
    trailing periods are removed, the text is capitalized (first character upper
    case, the rest lower case) and exactly one period is appended.
    """
    return [
        step.lstrip('\n"').rstrip('.').capitalize() + "."
        for step in lst
    ]


In [38]:
#----function to further clean images column: strip leading newlines/quotes and replace the empty-image placeholder with 'Not Provided'

def further_clean_images(lst):
    """Tidy each image entry in ``lst`` and return the results as a new list.

    The entry ``"haracter(0"`` (what remains of ``character(0)`` after the
    ``c()`` characters are stripped) becomes ``"Not Provided"``. Every other
    entry has leading newline/double-quote characters and trailing periods removed.
    """
    return [
        "Not Provided" if entry == "haracter(0" else entry.lstrip('\n"').rstrip('.')
        for entry in lst
    ]

In [39]:
#----ingredients, and keywords and instructions are in string format, need to convert strings to list

def string_to_list(string, lowercase=True):
    """Split a ``c("a", "b", ...)``-style string into a list of items.

    Parameters
    ----------
    string : str
        Text such as ``'c("Dessert", "Low Protein")'``. Leading/trailing ``c``,
        ``(`` and ``)`` characters are stripped, the remainder is split on
        ``", "`` and surrounding double quotes are removed from each piece.
    lowercase : bool, default True
        Lower-case every item (the original behaviour). Pass ``False`` for
        case-sensitive values such as URLs.

    Returns
    -------
    list of str
        The parsed items, or ``["null"]`` when ``string`` is not a string
        (for example NaN from a missing value).

    Notes
    -----
    Splitting on ``", "`` also splits any single item that itself contains a
    comma followed by a space.
    """
    try:
        pieces = string.strip("c()").split(", ")
    except (AttributeError, TypeError):
        # Non-string input (e.g. float NaN, None) cannot be stripped/split; use a placeholder.
        return ["null"]

    items = [piece.strip('"') for piece in pieces]
    if lowercase:
        items = [item.lower() for item in items]
    return items

In [40]:
#----apply cleaning to all columns

# Image URLs are split inline WITHOUT lowercasing: URL paths are case-sensitive, and routing them
# through string_to_list (which lower-cases every item) would corrupt the links.
# Relies on the earlier fillna so every Images value is a string.
new_recipe_df["images"] = (
    new_recipe_df["Images"]
    .apply(lambda raw: [part.strip('"') for part in raw.strip("c()").split(", ")])
    .apply(further_clean_images)
)
new_recipe_df["cook_time"] = new_recipe_df["CookTime"].apply(time_clean)
new_recipe_df["prep_time"] = new_recipe_df["PrepTime"].apply(time_clean)
new_recipe_df["total_time"] = new_recipe_df["TotalTime"].apply(time_clean)
new_recipe_df['recipe_servings'] = new_recipe_df['RecipeServings'].apply(servings_clean)
new_recipe_df["tags"] = new_recipe_df["Keywords"].apply(string_to_list)
new_recipe_df["instructions"] = new_recipe_df["RecipeInstructions"].apply(string_to_list).apply(further_clean_instructions)
new_recipe_df["ingredient_quantities"] = new_recipe_df["RecipeIngredientQuantities"].apply(string_to_list)

new_recipe_df["ingredients"] = new_recipe_df["RecipeIngredientParts"].apply(string_to_list)


In [41]:
# Drop the raw source columns that were replaced by the cleaned, lower-case-named columns above.
recipe_df_2 = new_recipe_df.drop(
    columns=[
        "Images",
        "Keywords",
        "RecipeIngredientQuantities",
        "RecipeIngredientParts",
        "RecipeServings",
        "RecipeInstructions",
        "TotalTime",
        "CookTime",
        "PrepTime",
    ]
)


## Saving cleaned DF as pickle file for future use (not csv to retain list formats)

In [43]:
# uncomment to save cleaned df as a pickle file
# recipe_df_2.to_pickle("r_cleaned_recipes.pkl")

## EDA on tags, ingredients, categories

In [13]:
#----EDA on keyword/tags --> df of tags and counts in descending order

# Flatten the per-recipe tag lists into one long list of individual tags.
tag_list = [tag for recipe_tags in recipe_df_2["tags"] for tag in recipe_tags]

# One row per tag occurrence; counting the helper column per tag gives the frequency.
tag_df = pd.DataFrame(tag_list, columns=["tags"]).assign(n_tags=1)
tags_order = (
    tag_df.groupby("tags")
    .count()
    .sort_values("n_tags", ascending=False)
)

In [None]:
# Uncomment to save csv file 
# tags_order.to_csv("r_n_tags_list.csv")

In [62]:
#----EDA on categories --> df of categories and counts in descending order

cat_list = []  # not referenced by any cell visible in this notebook; kept in case later cells use it
# .assign builds a new frame instead of setting a column on a selection of recipe_df_2,
# which avoids pandas' SettingWithCopyWarning.
cat_df = recipe_df_2[["RecipeCategory"]].assign(n_cat=1)
# Summing the helper column per category yields the number of recipes in each category.
cat_order = cat_df.groupby("RecipeCategory").sum().sort_values("n_cat", ascending = False)


In [None]:
# Uncomment to save csv file 
# cat_order.to_csv("r_n_cat_list.csv")

In [63]:
#----EDA on ingredients --> df of ingredients and counts in descending order

# Flatten the per-recipe ingredient lists into one long list of individual ingredients.
ingred_list = [
    ingredient
    for recipe_ingredients in new_recipe_df["ingredients"]
    for ingredient in recipe_ingredients
]

# One row per ingredient occurrence; summing the helper column per ingredient gives the frequency.
ingred_df = pd.DataFrame(ingred_list, columns=["ingredients"]).assign(n_ingred=1)
ingred_order = (
    ingred_df.groupby("ingredients")
    .sum()
    .sort_values("n_ingred", ascending=False)
)
ingred_order


In [None]:
# Uncomment to save csv file 
# ingred_order.to_csv("r_n_ingred_list.csv")