# ✅Step 4. Analysis of Reddit Data

## 0. 🎯Import libraries

In [1]:
import pandas as pd
from plotnine import ggplot, aes, geom_histogram, geom_bar, facet_wrap, theme_minimal, ggtitle, themes, element_text, geom_bar, coord_flip, scale_x_discrete, labs
from plotnine.data import mtcars
from pprint import pprint

## 1. 🎯Load DataFrame

In [3]:
# Read the cleaned posts JSON into a DataFrame, then preview its first five rows
filtered_df_posts = pd.read_json(
    '../data/cleaned_posts_with_cuisine_and_ingredient_list.json'
)
filtered_df_posts.head(5)

Unnamed: 0,id,title,gpt_ingredients,cuisine,ingredient_comment,score,upvote_ratio,link_flair_text,author,created_utc,url,permalink
0,1ai8ccv,Crispy Jumbo Potato Chips,"[fresh russets, canola oil, lye water (koon ch...",american,# Crispy Jumbo Potato Chips \n\n**Ingredients*...,139,0.910156,Recipe,TimSumrall,1707000025000,https://i.redd.it/8gyi3cc47ggc1.jpeg,https://reddit.com/r/recipes/comments/1ai8ccv/...
1,1ah8m5s,Thai Green Curry Chicken Satay,"[chicken satay, homemade green curry paste, ve...",thai,You could use any curry paste you like. Do you...,49,0.879883,Recipe,butchec,1706893611000,https://i.redd.it/g5xtfz2ve7gc1.jpeg,https://reddit.com/r/recipes/comments/1ah8m5s/...
2,1ag9jx2,Potato Pavé,[flavor. \n\ningredients:\n- potatoes\n- garli...,french,A French side dish that goes well with anythin...,268,0.97998,Recipe,TimSumrall,1706789454000,https://i.redd.it/1fy6x4t3tyfc1.jpeg,https://reddit.com/r/recipes/comments/1ag9jx2/...
3,1afs8r5,Pretzels 🥨,"[warm water, warm milk, sugar, active dry yeas...",german,Yield: 8 medium pretzels\n\nSee how I made the...,120,0.939941,Recipe,shushyum,1706734068000,https://i.redd.it/v8e6vvxh8ufc1.jpeg,https://reddit.com/r/recipes/comments/1afs8r5/...
4,1afoma3,Sweet and Sour Tofu,"[tofu, ground black pepper, salt, cornstarch, ...",chinese,I love figuring out ways to make tofu deliciou...,59,0.890137,Recipe,parisrosaries,1706725321000,https://i.redd.it/9nhm0k2mitfc1.jpeg,https://reddit.com/r/recipes/comments/1afoma3/...


In [4]:
# Flair labels that are compared against the `link_flair_text` column
# when the posts are split up by flair.
flair_names = [
    'Recipe',
    'Dessert',
    'Pasta',
    'Poultry',
    'Fruit\\Vegetarian',  # escaped: the string holds a single backslash
    'Drink',
    'Beef',
    'Pork',
    'Seafood',
]

## 2. 📊Plot data

We first plot a histogram of the upvote ratio of all the posts.

In [5]:
# Histogram of the upvote ratio over all posts, using bins 0.01 wide.
plot = (
    ggplot(filtered_df_posts, aes(x='upvote_ratio'))
    + geom_histogram(binwidth=0.01, fill='#5c3da4')
    + themes.theme_matplotlib()
    + ggtitle("Upvote ratio for all posts")
    # Explicit axis labels so the figure stands on its own, in line with the
    # labelled bar charts later in this notebook.
    + labs(x="Upvote ratio", y="Number of posts")
    + themes.theme(plot_title=element_text(weight='bold', color="black"))
    + themes.theme(aspect_ratio=9/16)
)

# Save the figure alongside the other plots
plot.save("../plots/plot_all_upvote_ratio.jpg", format="jpg", dpi=600)

# Save another copy to the ../docs directory
plot.save("../docs/plot_all_upvote_ratio.jpg", format="jpg", dpi=600)



We now separate the data by flair type to understand how well-received each type of recipe is.

In [6]:
# Draw and save one upvote-ratio histogram per flair in `flair_names`.
for flair in flair_names:
    # Posts whose flair text is exactly this flair
    flair_posts = filtered_df_posts[filtered_df_posts["link_flair_text"] == flair]

    # A flair without any posts has nothing to plot; report it and move on
    # rather than handing an empty frame to the histogram.
    if flair_posts.empty:
        print(f"Skipping '{flair}': no posts with this flair")
        continue

    plot = (
        ggplot(flair_posts, aes(x='upvote_ratio'))
        + geom_histogram(binwidth=0.01, fill='#5c3da4')
        + themes.theme_matplotlib()
        + ggtitle(f"Frequency of {flair} posts by upvote ratio")
        # Explicit axis labels so each saved figure can be read on its own
        + labs(x="Upvote ratio", y="Number of posts")
        + themes.theme(plot_title=element_text(weight='bold', color="black"))
        + themes.theme(aspect_ratio=9/16)
    )

    # Strip backslashes from the flair itself (not from the whole path) so
    # that they never end up in the output filename.
    safe_flair = flair.replace("\\", "")
    plot.save(f"../plots/plot_{safe_flair}_upvote_ratio.jpg", format="jpg", dpi=600)



Another way to investigate the reception of different recipe types is to count how many posts of each type appear among the most popular posts. We will use the top 10% of posts by upvote ratio.

In [7]:
# Rank all posts from the highest upvote ratio down to the lowest
df_sorted_by_upvote_ratio = filtered_df_posts.sort_values('upvote_ratio', ascending=False)

# Number of rows that make up the top 10% (truncated to a whole number)
top_10_percent_by_upvote_ratio = int(len(df_sorted_by_upvote_ratio) * 0.1)

# Keep only that many rows from the top of the ranking; posts tied at the
# cutoff are kept or dropped according to the order the sort left them in
top_10_df = df_sorted_by_upvote_ratio.iloc[:top_10_percent_by_upvote_ratio]

# Flairs ordered from most to least frequent within the top 10%
ordered_flair_list = list(top_10_df['link_flair_text'].value_counts().index)

# Horizontal bar chart of the number of top-10% posts per flair.
# The axis limits use the reversed frequency order -- presumably so that the
# most common flair is drawn at the top once the coordinates are flipped.
plot = (
    ggplot(top_10_df, aes(x="link_flair_text"))
    + geom_bar(fill='#5c3da4')
    + coord_flip()
    + ggtitle("Top 10% of posts by upvote ratio")
    + scale_x_discrete(limits=list(reversed(ordered_flair_list)))
    + themes.theme(
        plot_title=element_text(weight='bold', color="black"),
        aspect_ratio=9/16,
    )
    + labs(x="Flair", y="Number of posts")
)

# Write the figure to ../plots, plus an identical copy to ../docs
plot.save("../plots/plot_top_10_percent_upvote_ratio.jpg", format="jpg", dpi=600)
plot.save("../docs/plot_top_10_percent_upvote_ratio.jpg", format="jpg", dpi=600)



What about absolute popularity? We repeat the same analysis but use the number of upvotes (the post's score) instead of the upvote ratio.

In [8]:
# Rank all posts from the highest score down to the lowest
df_sorted_by_score = filtered_df_posts.sort_values('score', ascending=False)

# Number of rows that make up the top 10% (truncated to a whole number)
top_10_percent_by_score = int(len(df_sorted_by_score) * 0.1)

# Keep only that many rows from the top of the ranking
top_10_df = df_sorted_by_score.iloc[:top_10_percent_by_score]

# Flairs ordered from most to least frequent within the top 10%
ordered_flair_list = list(top_10_df['link_flair_text'].value_counts().index)

# Horizontal bar chart of the number of top-10% posts per flair.
# The axis limits use the reversed frequency order -- presumably so that the
# most common flair is drawn at the top once the coordinates are flipped.
plot = (
    ggplot(top_10_df, aes(x="link_flair_text"))
    + geom_bar(fill='#5c3da4')
    + coord_flip()
    + ggtitle("Top 10% of posts by absolute score")
    + scale_x_discrete(limits=list(reversed(ordered_flair_list)))
    + themes.theme(
        plot_title=element_text(weight='bold', color="black"),
        aspect_ratio=9/16,
    )
    + labs(x="Flair", y="Number of posts")
)

# Write the figure to ../plots
plot.save("../plots/plot_top_10_percent_score.jpg", format="jpg", dpi=600)



Further elaboration on the plots can be found on the website.