## 0. 🎯Import libraries

In [10]:
import pandas as pd
from plotnine import ggplot, aes, geom_histogram, geom_bar, facet_wrap, theme_minimal, ggtitle, themes, element_text, geom_bar, coord_flip, scale_x_discrete, labs
from plotnine.data import mtcars
from pprint import pprint

## 1. 🎯Load DataFrame

In [11]:
# Read the cleaned posts JSON into a DataFrame, then preview the first rows
posts_json_path = '../data/cleaned_posts_with_ingredient_list.json'
filtered_df_posts = pd.read_json(posts_json_path)
filtered_df_posts.head()

Unnamed: 0,id,title,gpt_ingredients,ingredient_comment_truncated,score,upvote_ratio,link_flair_text,author,created_utc,url,permalink
0,1ah8m5s,Thai Green Curry Chicken Satay,"[chicken satay, homemade green curry paste, ve...",You could use any curry paste you like. Do you...,41,0.870117,Recipe,butchec,1706893611000,https://i.redd.it/g5xtfz2ve7gc1.jpeg,https://reddit.com/r/recipes/comments/1ah8m5s/...
1,1afoma3,Sweet and Sour Tofu,"[tofu, ground black pepper, salt, cornstarch, ...",I love figuring out ways to make tofu deliciou...,54,0.890137,Recipe,parisrosaries,1706725321000,https://i.redd.it/9nhm0k2mitfc1.jpeg,https://reddit.com/r/recipes/comments/1afoma3/...
2,1aez4r2,Pork Tenderloin Sous Vide,"[tenderloin, salt, fresh ground pepper, smoked...",**Ingredients:**\n\n**Tenderloin**\n\n* 1 Pork...,28,0.689941,Recipe,hoosyourdaddyo,1706648218000,https://i.redd.it/tq0z0nkd5nfc1.jpeg,https://reddit.com/r/recipes/comments/1aez4r2/...
3,1aeyb09,Marble Cookies (Recipe),"[butter, brown sugar, white sugar, vanilla ext...",[Recipe Link](https://www.sarahfreia.com/blog/...,67,0.930176,Recipe,sarahfreia,1706646194000,https://i.redd.it/r0bq289bzmfc1.png,https://reddit.com/r/recipes/comments/1aeyb09/...
4,1acg1pq,SAVORY CREPE CAKE WITH MINCED MEAT AND BELL PE...,"[milk, eggs, water, salt, flour, oil, ground b...",See how I made them [here](https://youtu.be/W8...,9,0.919922,Recipe,Minastella,1706376629000,https://i.redd.it/hd43dtgrn0fc1.jpeg,https://reddit.com/r/recipes/comments/1acg1pq/...


In [12]:
# Flair labels that are analysed individually
flair_names = [
    'Recipe',
    'Dessert',
    'Pasta',
    'Poultry',
    'Vegetarian',
    'Drink',
    'Beef',
    'Pork',
    'Seafood',
]

## 2. 📊Plot data

We first plot a histogram of the upvote ratio of all the posts.

In [6]:
# Histogram of the upvote ratio over every post, written to ../plots as a JPEG.
plot = (
    ggplot(filtered_df_posts, aes(x='upvote_ratio'))
    + geom_histogram(binwidth=0.01, fill='#5c3da4')
    + themes.theme_matplotlib()
    + ggtitle("Upvote ratio for all posts")
    # Readable axis labels (instead of the raw column name and default "count"),
    # matching the labelled bar charts produced later in this notebook.
    + labs(x="Upvote ratio", y="Number of posts")
    + themes.theme(
        plot_title=element_text(weight='bold', color="black"),
        aspect_ratio=9/16,
    )
)
plot.save("../plots/plot_all_upvote_ratio.jpg", format="jpg", dpi=600)



We now separate the data by flair type to understand how well-received each type of recipe is.

In [7]:
# One upvote-ratio histogram per flair, each saved to its own JPEG in ../plots.
for flair in flair_names:
    flair_posts = filtered_df_posts[filtered_df_posts["link_flair_text"] == flair]
    if flair_posts.empty:
        # flair_names is maintained by hand, so a flair may have no matching posts;
        # skip it rather than trying to draw a histogram from an empty frame.
        print(f"No posts with flair {flair!r}; skipping plot.")
        continue

    plot = (
        ggplot(flair_posts, aes(x='upvote_ratio'))
        + geom_histogram(binwidth=0.01, fill='#5c3da4')
        + themes.theme_matplotlib()
        + ggtitle(f"Frequency of {flair} posts by upvote ratio")
        # Readable axis labels instead of the raw column name and default "count".
        + labs(x="Upvote ratio", y="Number of posts")
        + themes.theme(
            plot_title=element_text(weight='bold', color="black"),
            aspect_ratio=9/16,
        )
    )
    # Any backslashes are stripped from the output path before saving.
    plot.save(f"../plots/plot_{flair}_upvote_ratio.jpg".replace("\\", ""), format="jpg", dpi=600)



Another way to investigate the reception of different recipe types is to count the number of each type of recipe found in the most popular recipes. We will use the top 10% of posts by upvote ratio.

In [15]:
# Sort by upvote ratio in descending order.
# Many posts share the same upvote ratio, and the default sort is not stable, so
# which of the tied posts land inside the top-10% cut-off would depend on the sort
# implementation. A stable sort gives tied posts a well-defined relative order.
df_sorted_by_upvote_ratio = filtered_df_posts.sort_values(
    by='upvote_ratio', ascending=False, kind='mergesort'
)

# Calculate the number of rows for the top 10% (rounded down)
top_10_percent_by_upvote_ratio = int(0.1 * len(df_sorted_by_upvote_ratio))

# Take the top 10% of the DataFrame
top_10_df = df_sorted_by_upvote_ratio.head(top_10_percent_by_upvote_ratio)

# Flairs ordered from most to least frequent within the top 10%
ordered_flair_list = top_10_df['link_flair_text'].value_counts().index.tolist()

# Plot a bar graph showing the number of posts from different flairs
plot = (
    ggplot(top_10_df, aes(x="link_flair_text"))
    + geom_bar(fill='#5c3da4')
    + coord_flip()
    + ggtitle("Top 10% of posts by upvote ratio")
    # Limits are reversed so that, after coord_flip, the most frequent flair is on top.
    + scale_x_discrete(limits=ordered_flair_list[::-1])
    + themes.theme(
        plot_title=element_text(weight='bold', color="black"),
        aspect_ratio=9/16,
    )
    + labs(x="Flair", y="Number of posts")
)

plot.save("../plots/plot_top_10_percent_upvote_ratio.jpg", format="jpg", dpi=600)



What about absolute popularity? We repeat the same analysis, but rank posts by their score (number of upvotes) instead of their upvote ratio.

In [16]:
# Sort by score in descending order.
# Posts can share the same score, and the default sort is not stable, so which of
# the tied posts land inside the top-10% cut-off would depend on the sort
# implementation. A stable sort gives tied posts a well-defined relative order.
df_sorted_by_score = filtered_df_posts.sort_values(
    by='score', ascending=False, kind='mergesort'
)

# Calculate the number of rows for the top 10% (rounded down)
top_10_percent_by_score = int(0.1 * len(df_sorted_by_score))

# Take the top 10% of the DataFrame
top_10_df = df_sorted_by_score.head(top_10_percent_by_score)

# Flairs ordered from most to least frequent within the top 10%
ordered_flair_list = top_10_df['link_flair_text'].value_counts().index.tolist()

# Plot a bar graph showing the number of posts from different flairs
plot = (
    ggplot(top_10_df, aes(x="link_flair_text"))
    + geom_bar(fill='#5c3da4')
    + coord_flip()
    + ggtitle("Top 10% of posts by absolute score")
    # Limits are reversed so that, after coord_flip, the most frequent flair is on top.
    + scale_x_discrete(limits=ordered_flair_list[::-1])
    + themes.theme(
        plot_title=element_text(weight='bold', color="black"),
        aspect_ratio=9/16,
    )
    + labs(x="Flair", y="Number of posts")
)

plot.save("../plots/plot_top_10_percent_score.jpg", format="jpg", dpi=600)



Further elaboration on the plots can be found on the website.