## 0. 🎯 Import libraries

In [2]:
import pandas as pd
from plotnine import ggplot, aes, geom_histogram, geom_bar, facet_wrap, theme_minimal, ggtitle, themes, element_text, geom_bar, coord_flip, scale_x_discrete, labs
from plotnine.data import mtcars
from pprint import pprint


## 1. 🎯 Load posts and plot upvote-ratio distributions

In [3]:
# Read the posts CSV (path is relative to the notebook's working directory)
# into a DataFrame, then preview its first rows as the cell output.
posts_csv_path = '../data/posts.csv'
filtered_df_posts = pd.read_csv(posts_csv_path)

filtered_df_posts.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,num_crossposts,media,is_video,is_gallery,media_metadata,gallery_data,poll_data,author_cakeday,crosspost_parent_list,crosspost_parent
0,,recipes,,t2_i9192ot8,False,,0,False,Classic Tiramisu Recipe (original Italian pizz...,"[{'e': 'text', 't': 'Recipe'}]",...,0,,False,,,,,,,
1,,recipes,,t2_g90hdupc,False,,0,False,Orange Cookies 🍊🧡,"[{'e': 'text', 't': 'Recipe'}]",...,1,,False,,,,,,,
2,,recipes,,t2_mudt5t8m,False,,0,False,"Stir Fry Supreme – Chives, cashews and Shrimp","[{'e': 'text', 't': 'Recipe'}]",...,0,,False,,,,,,,
3,,recipes,,t2_7xjeg,False,,0,False,Sous Vide Chicken and Potatoes,"[{'e': 'text', 't': 'Recipe'}]",...,0,,False,,,,,,,
4,,recipes,,t2_dl64q0hy,False,,0,False,Chicken Riggies,"[{'e': 'text', 't': 'Recipe'}]",...,0,,False,,,,,,,


In [4]:
# Columns kept for the rest of the notebook; every other column is dropped.
selected_columns = [
    'id',
    'title',
    'score',
    'num_comments',
    'created_utc',
    'upvote_ratio',
    'link_flair_text',
    'author',
    'url',
]

# Flairs that each get their own histogram in the per-flair loop; the name is
# interpolated into both the plot title and the output file name there.
flair_names = [
    'Recipe',
    'Dessert',
    'Pasta',
    'Poultry',
    'Vegetarian',
    'Drink',
    'Beef',
    'Pork',
    'Seafood',
]

# Narrow the frame to the selected columns (raises KeyError if one is missing).
filtered_df_posts = filtered_df_posts.loc[:, selected_columns]

In [5]:
# Show the posts whose flair is the literal text Fruit\Vegetarian
# (the backslash is escaped in the Python string).
is_fruit_vegetarian = filtered_df_posts['link_flair_text'] == "Fruit\\Vegetarian"
filtered_df_posts[is_fruit_vegetarian]

Unnamed: 0,id,title,score,num_comments,created_utc,upvote_ratio,link_flair_text,author,url
1829,181or4u,Clear-the-fridge Biscuits,37,2,1.700701e+09,0.87,Fruit\Vegetarian,TzuChiCultureMission,https://i.redd.it/5qnpls1xvz1c1.png
1830,17qoiom,Gazpacho Moreliano Mexican Tropical Fruit Salad,122,7,1.699458e+09,0.96,Fruit\Vegetarian,dobbernationloves,https://i.redd.it/2toj6d6na5zb1.jpg
1831,17lh5ss,Kashke Bademjan Persian Eggplant Dip,114,9,1.698859e+09,0.95,Fruit\Vegetarian,dobbernationloves,https://i.redd.it/7es8gk1asrxb1.jpg
1832,17d4ec4,Stuffed tomatoes and peppers. A traditional Gr...,30,8,1.697900e+09,0.94,Fruit\Vegetarian,CookedWithLove,https://i.redd.it/8sb9a53vkkvb1.jpg
1833,175unb3,Coconut Date Squares,50,9,1.697074e+09,0.92,Fruit\Vegetarian,TzuChiCultureMission,https://i.redd.it/asv00ffnbotb1.png
...,...,...,...,...,...,...,...,...,...
2062,cwwkcq,Bhindi,16,3,1.567057e+09,0.74,Fruit\Vegetarian,mark30322,https://i.redd.it/y6698lnkqbj31.jpg
2063,cwwfal,Restaurant Style Phool Gobhi Masala Recipe,21,1,1.567056e+09,0.88,Fruit\Vegetarian,mark30322,https://i.redd.it/ycwjgo0pnbj31.jpg
2064,csv234,Celery and Soy Stuffed Butternut Squash,7,1,1.566290e+09,0.71,Fruit\Vegetarian,mortoray,https://imgur.com/OyakVfz
2065,cs2z3v,Grilled Nectarine Caprese Salad,1725,22,1.566144e+09,0.97,Fruit\Vegetarian,codelycat,https://i.redd.it/tzjwjnulc8h31.jpg


In [6]:
# Histogram of upvote_ratio across every post (bin width 0.01), rendered with
# the matplotlib theme, a bold black title and a 9:16 aspect ratio.
plot = (
    ggplot(filtered_df_posts, aes(x='upvote_ratio'))
    + geom_histogram(binwidth=0.01, fill='#5c3da4')
    + themes.theme_matplotlib()
    + ggtitle("Upvote ratio for all posts")
    + themes.theme(plot_title=element_text(weight='bold', color="black"))
    + themes.theme(aspect_ratio=9/16)
)

# Write the figure to disk as a 600-dpi JPEG.
plot.save("../plots/plot_all.jpg", format="jpg", dpi=600)



In [7]:
# One upvote_ratio histogram per flair in flair_names, each saved to its own
# JPEG named after the flair.
for flair in flair_names:
    # Rows whose flair matches the current name exactly.
    flair_posts = filtered_df_posts[filtered_df_posts["link_flair_text"] == flair]

    plot = (
        ggplot(flair_posts, aes(x='upvote_ratio'))
        + geom_histogram(binwidth=0.01, fill='#5c3da4')
        + themes.theme_matplotlib()
        + ggtitle(f"Frequency of {flair} posts by upvote ratio")
        + themes.theme(plot_title=element_text(weight='bold', color="black"))
        + themes.theme(aspect_ratio=9/16)
    )

    plot.save(f"../plots/plot_{flair}.jpg", format="jpg", dpi=600)



In [8]:
# Histogram of upvote_ratio for posts flaired Fruit\Vegetarian.
# Presumably kept out of the per-flair loop because the flair contains a
# backslash, which would otherwise end up in the output file name.
# NOTE(review): the title and file name say "FruitVegetable" while the flair
# text is Fruit\Vegetarian -- confirm which label is intended.

# The backslash must be escaped: "\V" is not a valid escape sequence and
# triggers a SyntaxWarning on recent Python versions.
fruit_vegetarian_posts = filtered_df_posts[
    filtered_df_posts["link_flair_text"] == "Fruit\\Vegetarian"
]

plot = (
    ggplot(fruit_vegetarian_posts, aes(x='upvote_ratio'))
    + geom_histogram(binwidth=0.01, fill='#5c3da4')
    + themes.theme_matplotlib()
    + ggtitle("Frequency of FruitVegetable posts by upvote ratio")
    + themes.theme(plot_title=element_text(weight='bold', color="black"))
    + themes.theme(aspect_ratio=9/16)
)

# Write the figure to disk as a 600-dpi JPEG.
plot.save("../plots/plot_FruitVegetable.jpg", format="jpg", dpi=600)



In [9]:
# Bar chart of how many posts each flair contributes to the top 10% of posts
# ranked by upvote ratio.

# Sort by 'upvote_ratio' in descending order (best first).
df_sorted = filtered_df_posts.sort_values(by='upvote_ratio', ascending=False)

# Number of rows that make up the top 10% (fraction truncated by int()).
top_10_percent = int(0.1 * len(df_sorted))

# Take the first top_10_percent rows of the sorted frame; rows tied at the
# cutoff are kept or dropped according to their position after sorting.
top_10_df = df_sorted.head(top_10_percent)

# Flairs ordered from most to fewest posts within the top 10%.
ordered_flair_list = top_10_df['link_flair_text'].value_counts().index.tolist()

# Plot a bar graph showing the number of posts from different flairs.
plot = (
    ggplot(top_10_df, aes(x="link_flair_text"))
    + geom_bar(fill='#5c3da4')
    + coord_flip()
    + ggtitle("Top 10% of posts by upvote ratio")
    # Reuse the precomputed ordering, reversed so the most frequent flair
    # ends up at the top of the flipped axis.
    + scale_x_discrete(limits=ordered_flair_list[::-1])
    + themes.theme(plot_title=element_text(weight='bold', color="black"))
    + themes.theme(aspect_ratio=9/16)
    + labs(x="Flair", y="Number of posts")
)

# Write the figure to disk as a 600-dpi JPEG.
plot.save("../plots/plot_top_10_percent.jpg", format="jpg", dpi=600)

