# ✅Step 6: Analysis of Ingredients and Cuisines

## 🎯0. Import libraries and load data

In [61]:
import pandas as pd
import numpy as np
import sys
from plotnine import *
import altair as alt

# Import our own modules
sys.path.append("../scripts/")
import chadtools

In [62]:
# Load the merged recipe records; orient="records" means the file is a list of
# JSON objects, one per row.
df = pd.read_json(
    path_or_buf="../data/merged_data_for_analysis.json",
    orient="records",
)
df.head(3)  # preview the first three rows

Unnamed: 0,id,title,gpt_ingredients,cuisine,ingredient_comment,score,upvote_ratio,link_flair_text,author,created_utc,...,bbcgf_title,calories,salt,fat,sugars,saturates,carbs,protein,fibre,bbcgf_ratings
0,1ah8m5s,Thai Green Curry Chicken Satay,"[chicken satay, homemade green curry paste, ve...",thai,You could use any curry paste you like. Do you...,49,0.879883,Recipe,butchec,1706893611000,...,Thai green chicken curry,257,0.6,15.0,3.0,10.0,9.0,19.0,2.0,0.9
1,1afoma3,Sweet and Sour Tofu,"[tofu, ground black pepper, salt, cornstarch, ...",chinese,I love figuring out ways to make tofu deliciou...,59,0.890137,Recipe,parisrosaries,1706725321000,...,Sweet & sour tofu,530,1.2,17.0,18.0,2.0,75.0,15.0,8.0,0.82
2,1acagoz,Chocolate Fudgy Brownie,"[butter, dark chocolate, cocoa powder, white s...",american,Full Chocolate Fudgy Brownie recipe: https://w...,114,0.910156,Recipe,butchec,1706360858000,...,Fudgy brownies,1043,0.62,62.0,76.0,35.0,105.0,14.0,6.0,0.86


## 🎯1. Analysis of Ingredients

We first do some ingredient analysis for all the recipes. In particular, we are interested in finding how much of the *top 10 most common ingredients* are used in each cuisine.

In [63]:
# Flatten the per-recipe ingredient lists into one long list of ingredient names.
all_ingredient_list = [
    ingredient
    for recipe_ingredients in df["gpt_ingredients"].tolist()
    for ingredient in recipe_ingredients
]

# Count how often each ingredient name occurs and keep the ten most frequent.
all_ingredient_series = pd.Series(all_ingredient_list)
ingredient_counts = all_ingredient_series.value_counts()
top_10_ingredients = ingredient_counts.head(10)

In [64]:
# Turn the top-10 counts into a two-column table ("ingredient", "frequency"),
# ordered from most to least frequent.
ingredient_frequency_df = (
    top_10_ingredients
    .rename_axis("ingredient")        # index (ingredient names) -> "ingredient" column
    .reset_index(name="frequency")    # counts -> "frequency" column
    .sort_values("frequency", ascending=False)
)

In [65]:
# Axis order is the table order reversed — presumably so that, after coord_flip(),
# the most frequent ingredient is drawn at the top of the chart (TODO confirm).
ingredient_order = ingredient_frequency_df["ingredient"][::-1]

title_style = theme(plot_title=element_text(weight='bold', color="black", size=10, ha="right"))
widescreen = theme(aspect_ratio=9/16)
chart_labels = labs(
    title="Top 10 ingredients by frequency in both BBC GoodFood and Reddit recipes",
    x="Ingredient",
    y="Number of recipes",
)

# Horizontal bar chart: bar heights are taken directly from the "frequency" column.
plot = (
    ggplot(ingredient_frequency_df, aes(x="ingredient", y="frequency"))
    + geom_bar(stat="identity", fill='#5c3da4')
    + coord_flip()
    + scale_x_discrete(limits=ingredient_order)
    + title_style
    + widescreen
    + chart_labels
)

# Write the figure to disk as a 600-dpi JPEG.
plot.save("../plots/top_10_ingredients_frequency.jpg", format="jpg", dpi=600)



Hence, the top 10 ingredients found in recipes appearing on both BBC GoodFood and Reddit are:
1. Salt
2. Pepper
3. Garlic
4. Butter
5. Sugar
6. Olive oil
7. Water
8. Black Pepper
9. Eggs
10. Yellow Onion

## 🎯2. Analysis of Cuisines

We are also interested in the breakdown of cuisines in the recipes.

In [66]:
# Count recipes per cuisine (value_counts sorts most frequent first) and keep the top ten.
cuisine_counts = df["cuisine"].value_counts()
top_10_cuisines = cuisine_counts.head(10)

## 🎯3. Putting both together

What does the nutritional profile of each cuisine look like? We combine the nutrient data with the cuisine labels to find out: we group the recipes by cuisine and compute the mean quantity of each nutrient for the 10 most common cuisines.

In [67]:
# Nutrient columns in the data and a reference amount for each (both lists share one order).
nutrients = ["calories", "carbs", "fat", "protein", "saturates", "sugars", "fibre", "salt"]
healthy_nutrient_amounts = [750, 40, 20, 19, 10, 10, 10, 2]

# Reference row used in the comparison. "cuisine" carries the row label; the remaining
# keys are the nutrients that get compared. Protein (19) and fibre (10) are left out here.
healthy_nutrient_dict = dict(
    cuisine=" healthy threshold",
    calories=750,
    carbs=40,
    fat=20,
    saturates=10,
    sugars=10,
    salt=2,
)

# Every key except the label, in dict order.
compared_nutrients = [key for key in healthy_nutrient_dict if key != "cuisine"]

# Mean of each compared nutrient per cuisine, restricted to the ten most common cuisines.
cuisines_nutrients_df = (
    df[["cuisine", *compared_nutrients]]
    .groupby(["cuisine"])
    .mean()
    .filter(top_10_cuisines.index, axis=0)  # keep only rows whose cuisine is in the top 10
    .reset_index()
)
cuisines_nutrients_df

Unnamed: 0,cuisine,calories,carbs,fat,saturates,sugars,salt
0,italian,549.126126,53.207207,24.567568,10.195495,7.810811,1.276396
1,american,405.042857,37.977143,21.842857,10.795714,22.1,1.079143
2,chinese,358.428571,26.342857,16.6,5.314286,6.894286,1.626
3,thai,376.15625,22.63125,20.40625,7.9375,8.5,1.630313
4,french,497.12,38.656,30.68,13.708,23.76,0.8
5,japanese,355.904762,30.77619,18.285714,5.714286,10.961905,1.490476
6,indian,380.333333,27.944444,19.666667,9.111111,11.5,0.978889
7,british,504.294118,51.264706,26.217647,12.205882,21.941176,1.007059
8,mexican,466.0,34.75,22.625,7.0625,8.5625,1.38
9,greek,529.5,33.166667,27.666667,10.916667,8.666667,1.635


We also set a healthy threshold for each nutrient based on BBC GoodFood and NHS guidelines. This will motivate our health comparisons.

In [68]:
# One-row table holding the reference amounts (label in "cuisine", one column per nutrient).
new_row = pd.DataFrame(data=healthy_nutrient_dict, index=[0])
new_row

Unnamed: 0,cuisine,calories,carbs,fat,saturates,sugars,salt
0,healthy threshold,750,40,20,10,10,2


In [69]:
# Append the reference row to the per-cuisine means.
# This cell reassigns cuisines_nutrients_df to itself, so any threshold row left over
# from a previous run is dropped first; re-running the cell therefore cannot stack
# duplicate threshold rows into the table (and the CSV exported from it).
is_cuisine_row = ~cuisines_nutrients_df["cuisine"].isin(new_row["cuisine"])
cuisines_nutrients_df = pd.concat(
    [cuisines_nutrients_df[is_cuisine_row], new_row],
    ignore_index=True,  # renumber rows 0..n-1, as reset_index(drop=True) would
)
cuisines_nutrients_df

Unnamed: 0,cuisine,calories,carbs,fat,saturates,sugars,salt
0,italian,549.126126,53.207207,24.567568,10.195495,7.810811,1.276396
1,american,405.042857,37.977143,21.842857,10.795714,22.1,1.079143
2,chinese,358.428571,26.342857,16.6,5.314286,6.894286,1.626
3,thai,376.15625,22.63125,20.40625,7.9375,8.5,1.630313
4,french,497.12,38.656,30.68,13.708,23.76,0.8
5,japanese,355.904762,30.77619,18.285714,5.714286,10.961905,1.490476
6,indian,380.333333,27.944444,19.666667,9.111111,11.5,0.978889
7,british,504.294118,51.264706,26.217647,12.205882,21.941176,1.007059
8,mexican,466.0,34.75,22.625,7.0625,8.5625,1.38
9,greek,529.5,33.166667,27.666667,10.916667,8.666667,1.635


To plot a radar chart, we will use the `ggradar` library in R. To show the relative amount of each nutrient across the cuisines, we need to normalise the quantities with min-max scaling, $x' = (x - x_{\min}) / (x_{\max} - x_{\min})$, so that every nutrient lies between 0 and 1.

In [70]:
# Columns to rescale: every key of the threshold dict except the "cuisine" label.
nutrient_columns = [key for key in healthy_nutrient_dict if key != "cuisine"]
nutrient_values = cuisines_nutrients_df[nutrient_columns]

# Min-max scaling, column by column: the smallest value in a column maps to 0,
# the largest to 1.
column_min = nutrient_values.min()
column_span = nutrient_values.max() - column_min
scaled_nutrients = (nutrient_values - column_min) / column_span

# Reassemble the table with the label column first. str.capitalize() upper-cases the
# first character of each label and lower-cases the rest.
cuisines_nutrients_df_normalised = pd.concat(
    [cuisines_nutrients_df["cuisine"].str.capitalize(), scaled_nutrients],
    axis=1,
)

In [71]:
# Display the min-max-scaled table so it can be checked before it is exported.
cuisines_nutrients_df_normalised

Unnamed: 0,cuisine,calories,carbs,fat,saturates,sugars,salt
0,Italian,0.490291,1.0,0.565878,0.581532,0.054343,0.396997
1,American,0.124686,0.501894,0.372362,0.65304,0.901575,0.232619
2,Chinese,0.006404,0.12139,0.0,0.0,0.0,0.688333
3,Thai,0.051387,0.0,0.27033,0.312521,0.095206,0.691927
4,French,0.358328,0.524096,1.0,1.0,1.0,0.0
5,Japanese,0.0,0.266384,0.119724,0.047655,0.241177,0.575397
6,Indian,0.061986,0.17377,0.217803,0.452342,0.273081,0.149074
7,British,0.376532,0.93647,0.683072,0.821043,0.892159,0.172549
8,Mexican,0.279362,0.396349,0.427912,0.208277,0.098912,0.483333
9,Greek,0.440491,0.344565,0.785985,0.66745,0.105088,0.695833


### Save to CSV to use in R

In [72]:
# Write the scaled table to CSV without the row index.
cuisines_nutrients_df_normalised.to_csv(
    path_or_buf="../data/top10cuisines_nutrients_normalised.csv",
    index=False,
)

The code for visualisation can be found in [radar_plots.r](../scripts/radar_plots.r)