# ✅Step 6: Analysis of Ingredients and Cuisines

## 🎯0. Import libraries and load data

In [93]:
import pandas as pd
import numpy as np
import sys
from plotnine import *
import altair as alt

# Import our own modules
sys.path.append("../scripts/")
import chadtools

In [94]:
# Read the merged recipe records that the rest of this notebook analyses.
df = pd.read_json(
    "../data/merged_data_for_analysis.json",
    orient="records",
)
# Preview the first three rows.
df.head(3)

Unnamed: 0,id,title,gpt_ingredients,cuisine,ingredient_comment,score,upvote_ratio,link_flair_text,author,created_utc,...,bbcgf_title,calories,salt,fat,sugars,saturates,carbs,protein,fibre,bbcgf_ratings
0,1ah8m5s,Thai Green Curry Chicken Satay,"[chicken satay, homemade green curry paste, ve...",thai,You could use any curry paste you like. Do you...,49,0.879883,Recipe,butchec,1706893611000,...,Thai green chicken curry,257,0.6,15.0,3.0,10.0,9.0,19.0,2.0,0.9
1,1afoma3,Sweet and Sour Tofu,"[tofu, ground black pepper, salt, cornstarch, ...",chinese,I love figuring out ways to make tofu deliciou...,59,0.890137,Recipe,parisrosaries,1706725321000,...,Sweet & sour tofu,530,1.2,17.0,18.0,2.0,75.0,15.0,8.0,0.82
2,1acagoz,Chocolate Fudgy Brownie,"[butter, dark chocolate, cocoa powder, white s...",american,Full Chocolate Fudgy Brownie recipe: https://w...,114,0.910156,Recipe,butchec,1706360858000,...,Fudgy brownies,1043,0.62,62.0,76.0,35.0,105.0,14.0,6.0,0.86


## 🎯1. Analysis of Ingredients

We first analyse the ingredients across all recipes. In particular, we identify the *top 10 most common ingredients* and how frequently each one is used.

In [95]:
# Flatten every recipe's ingredient list into one long list, so an ingredient
# is counted once for each time it appears in any list.
all_ingredient_list = [
    ingredient
    for recipe_ingredients in df["gpt_ingredients"].tolist()
    for ingredient in recipe_ingredients
]

# Tally the occurrences and keep the ten most frequent ingredients.
all_ingredient_series = pd.Series(all_ingredient_list)
top_10_ingredients = all_ingredient_series.value_counts().head(10)

In [96]:
# Turn the top-10 counts into a two-column table ("ingredient", "frequency"),
# sorted by frequency in descending order.
ingredient_frequency_df = (
    top_10_ingredients
    .rename_axis("ingredient")
    .reset_index(name="frequency")
    .sort_values("frequency", ascending=False)
)

In [97]:
# Bar chart of the ten most frequent ingredients, drawn with flipped axes.
# The discrete axis limits are the ingredient names in reverse table order.
ingredient_order = ingredient_frequency_df["ingredient"][::-1]

plot = (
    ggplot(ingredient_frequency_df, aes(x="ingredient", y="frequency"))
    + geom_bar(stat="identity", fill='#5c3da4')
    + coord_flip()
    + scale_x_discrete(limits=ingredient_order)
    + theme(
        plot_title=element_text(weight='bold', color="black", size=10, ha="right"),
        aspect_ratio=9/16,
    )
    + labs(
        title="Top 10 ingredients by frequency in both BBC GoodFood and Reddit recipes",
        x="Ingredient",
        y="Number of recipes",
    )
)

# Write the figure to disk as a JPEG at 600 dpi.
plot.save("../plots/top_10_ingredients_frequency.jpg", format="jpg", dpi=600)



Hence, the top 10 ingredients found in recipes appearing on both BBC GoodFood and Reddit are:
1. Salt
2. Pepper
3. Garlic
4. Butter
5. Sugar
6. Olive oil
7. Water
8. Black Pepper
9. Eggs
10. Yellow Onion

## 🎯2. Analysis of Cuisines

We are also interested in the breakdown of cuisines in the recipes.

In [98]:
# Count how many rows fall under each cuisine label, then keep the ten
# most common cuisines.
cuisine_counts = df["cuisine"].value_counts()
top_10_cuisines = cuisine_counts.head(10)

## 🎯3. Putting both together

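What does the nutritional profile of each cuisine look like? We combine the top 10 cuisines with the nutritional data: for each of these cuisines we compute the mean value of every nutrient, and then min-max normalise those means so that the cuisines can be compared on a common 0–1 scale.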

In [99]:
# Nutrient columns to summarise per cuisine.
nutrients = ["calories", "carbs", "fat", "protein", "saturates", "sugars", "fibre", "salt"]

# Mean of every nutrient for each cuisine label.
mean_nutrients_by_cuisine = df.groupby("cuisine")[nutrients].mean()

# Keep only the rows for the ten most common cuisines.
cuisines_nutrients_df = mean_nutrients_by_cuisine.filter(items=top_10_cuisines.index, axis=0)
cuisines_nutrients_df

Unnamed: 0_level_0,calories,carbs,fat,protein,saturates,sugars,fibre,salt
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
italian,549.126126,53.207207,24.567568,27.126126,10.195495,7.810811,5.415315,1.276396
american,405.042857,37.977143,21.842857,13.134286,10.795714,22.1,2.36,1.079143
chinese,358.428571,26.342857,16.6,24.971429,5.314286,6.894286,3.425714,1.626
thai,376.15625,22.63125,20.40625,25.46875,7.9375,8.5,3.325,1.630313
french,497.12,38.656,30.68,16.672,13.708,23.76,2.28,0.8
japanese,355.904762,30.77619,18.285714,15.571429,5.714286,10.961905,3.890476,1.490476
indian,380.333333,27.944444,19.666667,21.277778,9.111111,11.5,6.333333,0.978889
british,504.294118,51.264706,26.217647,17.005882,12.205882,21.941176,2.605882,1.007059
mexican,466.0,34.75,22.625,28.9375,7.0625,8.5625,6.75,1.38
greek,529.5,33.166667,27.666667,33.5,10.916667,8.666667,4.75,1.635


In [102]:
# Min-max scale every nutrient column: (value - column min) / (column max - column min).
# Each resulting column is named "<nutrient>_normalized".
nutrient_means = cuisines_nutrients_df[nutrients]
column_min = nutrient_means.min()
column_range = nutrient_means.max() - column_min

cuisines_nutrients_df_normalised = (
    (nutrient_means - column_min) / column_range
).add_suffix("_normalized")

In [103]:
# Display the min-max normalised nutrient table.
cuisines_nutrients_df_normalised

Unnamed: 0_level_0,calories_normalized,carbs_normalized,fat_normalized,protein_normalized,saturates_normalized,sugars_normalized,fibre_normalized,salt_normalized
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
italian,1.0,1.0,0.565878,0.687029,0.581532,0.054343,0.701413,0.570535
american,0.25431,0.501894,0.372362,0.0,0.65304,0.901575,0.017897,0.334303
chinese,0.013062,0.12139,0.0,0.581229,0.0,0.0,0.256312,0.989222
thai,0.10481,0.0,0.27033,0.605648,0.312521,0.095206,0.233781,0.994386
french,0.730847,0.524096,1.0,0.173709,1.0,1.0,0.0,0.0
japanese,0.0,0.266384,0.119724,0.119669,0.047655,0.241177,0.360286,0.826918
indian,0.126428,0.17377,0.217803,0.399863,0.452342,0.273081,0.906786,0.214238
british,0.767976,0.93647,0.683072,0.190104,0.821043,0.892159,0.072904,0.247975
mexican,0.569788,0.396349,0.427912,0.775972,0.208277,0.098912,1.0,0.694611
greek,0.898427,0.344565,0.785985,1.0,0.66745,0.105088,0.552573,1.0


In [90]:
# Quick check of the object type returned when a single nutrient column is selected.
type(cuisines_nutrients_df["calories"])

pandas.core.series.Series

In [104]:
# Save the normalised nutrient table as CSV; no `index` argument is passed,
# so pandas' default index handling applies.
cuisines_nutrients_df_normalised.to_csv("../data/top10cuisines_nutrients_normalised.csv")