# What is the Relationship Between Cooking Complexity and Average Rating?

**Name(s)**: Aman Kar, Daniel Mathew

**Website Link**: https://akar247.github.io/RecipesDurationAnalysis/

## Code

In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
# Route DataFrame/Series `.plot()` calls through Plotly.
pd.options.plotting.backend = 'plotly'

# Seed NumPy's global RNG so every `np.random.permutation` call in this
# notebook yields the same shuffles (and p-values) on Restart & Run All.
np.random.seed(42)

### Cleaning and EDA

Cleaning
1. Merged two datasets on the recipes
2. Made series of average rating for each recipe and merged it with current dataframe
3. Changed tags, nutrition, steps, and ingredients column datatypes to lists of correct value types
4. Changed submitted and date column datatypes to datetime
5. Replaced ratings of 0 with NaN values 

In [2]:
# Build both CSV paths relative to the `food_data` folder.
interactions_fp, recipes_fp = (
    os.path.join('food_data', csv_name)
    for csv_name in ('RAW_interactions.csv', 'RAW_recipes.csv')
)

# Read the two raw tables, then render them one after the other.
raw_interactions = pd.read_csv(interactions_fp)
raw_recipes = pd.read_csv(recipes_fp)
display(raw_interactions, raw_recipes)

FileNotFoundError: [Errno 2] No such file or directory: 'food_data\\RAW_interactions.csv'

In [None]:
# Left join: every recipe row is kept, even when no interaction matches it.
reviews = pd.merge(
    raw_recipes,
    raw_interactions,
    how='left',
    left_on='id',
    right_on='recipe_id',
)
reviews

In [None]:
# Ratings of 0 are stored as missing values. `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
reviews['rating'] = reviews['rating'].replace(0, np.nan)

# Mean rating per recipe name (the mean skips the NaN ratings set above).
recipe_ratings = reviews.groupby('name')['rating'].mean().to_frame()

# Attach the per-recipe mean to every review row. Both sides carry a `rating`
# column, so the suffixes produce `rating_individual` and `rating_average`.
final_reviews = reviews.merge(
    recipe_ratings,
    left_on='name',
    right_index=True,
    suffixes=('_individual', '_average'),
)

In [None]:
# convert columns:
#    tags, nutrition, steps, ingredients of strings to lists (DONE)
#    user_id, recipe_id to int (NO since changing type doesn't work on NA and no need for our purposes to change)
#    rating_individual to int (not necessary)
#    submitted, date to datetime
def convert_column(ser):
    """Turn a Series of stringified lists into a Series of lists of strings.

    Each string cell such as "['a', 'b']" or "[1.0, 2.5]" is parsed as a
    Python literal, so elements that themselves contain ", " or an apostrophe
    stay intact and "[]" becomes an empty list. Every element is returned as
    a string (e.g. '1.0'), so callers can still cast them afterwards.

    Parameters
    ----------
    ser : pandas.Series
        Column whose string cells hold the text form of a list.

    Returns
    -------
    pandas.Series
        Same index as `ser`; string cells become lists of strings, any
        non-string cell (e.g. NaN) is passed through unchanged.
    """
    import ast

    def _parse(cell):
        if not isinstance(cell, str):
            return cell
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(item) for item in parsed]
        # Not a valid list literal: fall back to stripping the outer
        # brackets, removing single quotes and splitting on ", ".
        return cell[1:-1].replace("'", '').split(', ')

    return ser.map(_parse)

In [None]:
# Parse the four stringified-list columns into Python lists.
list_columns = ['tags', 'nutrition', 'steps', 'ingredients']
final_reviews[list_columns] = final_reviews[list_columns].apply(convert_column)

# Cast every element of each nutrition list to float.
final_reviews['nutrition'] = final_reviews['nutrition'].transform(
    lambda values: [float(value) for value in values]
)

In [None]:
# Convert both date-like columns to datetime values.
for date_column in ('submitted', 'date'):
    final_reviews[date_column] = pd.to_datetime(final_reviews[date_column])

# Keep only rows whose recipe takes at most 1440 minutes (one day).
within_one_day = final_reviews['minutes'] <= 1440
final_reviews = final_reviews[within_one_day]

In [None]:
# Recipe-level summary: select the analysis columns, keep recipes of at most
# 1440 minutes, then average every column within each recipe name.
relevant_columns = final_reviews.loc[
    :, ['name', 'minutes', 'n_steps', 'n_ingredients', 'rating_average']
]
less_than_day = relevant_columns.loc[relevant_columns['minutes'].le(1440)]
grouped_data = less_than_day.groupby('name').mean()
grouped_data

In [None]:
# Histogram of the recipe-level cooking times.
fig = px.histogram(
    data_frame=grouped_data,
    x='minutes',
    nbins=400,
    title='Distribution of Cooking Times',
)
fig

In [None]:
# Histogram of the recipe-level step counts.
fig = px.histogram(
    data_frame=grouped_data,
    x='n_steps',
    nbins=20,
    title='Distribution of Number of Steps in Recipes',
)
fig

In [None]:
# Histogram of the recipe-level ingredient counts.
fig = px.histogram(
    data_frame=grouped_data,
    x='n_ingredients',
    nbins=10,
    title='Distribution of Number of Ingredients in Recipes',
)
fig

In [None]:
# Histogram of the recipe-level average ratings.
fig = px.histogram(
    data_frame=grouped_data,
    x='rating_average',
    nbins=5,
    title='Distribution of Average Ratings for Recipes',
)
fig

In [None]:
# minutes with steps: average the recipe-level columns per distinct cooking
# time, then plot the mean of `minutes` within each `n_steps` bin.
minutes_and_steps = grouped_data.groupby('minutes').mean().reset_index()
fig = px.histogram(
    data_frame=minutes_and_steps,
    x='n_steps',
    y='minutes',
    nbins=25,
    histfunc='avg',
)
fig

In [None]:
# ingredients with minutes: same per-cooking-time averages as for steps, but
# the mean of `minutes` is plotted within each `n_ingredients` bin.
minutes_and_steps = grouped_data.groupby('minutes').mean().reset_index()
fig = px.histogram(
    data_frame=minutes_and_steps,
    x='n_ingredients',
    y='minutes',
    nbins=10,
    histfunc='avg',
)
fig

In [None]:
# Bucket recipes into 10 quantile bins of cooking time and of step count.
grouped_data['minute_intervals'] = pd.qcut(x=grouped_data['minutes'], q=10)
grouped_data['step_intervals'] = pd.qcut(x=grouped_data['n_steps'], q=10)

# Number of recipes falling in each (minutes bin, steps bin) pair.
minute_step_count_pt = pd.pivot_table(
    grouped_data.reset_index(),
    index='minute_intervals',
    columns='step_intervals',
    values='name',
    aggfunc='count',
)

In [None]:
# Recompute the 10 quantile bins of cooking time and of step count.
grouped_data['minute_intervals'] = pd.qcut(x=grouped_data['minutes'], q=10)
grouped_data['step_intervals'] = pd.qcut(x=grouped_data['n_steps'], q=10)

# Mean of the recipes' average ratings in each (minutes bin, steps bin) pair.
minute_step_rating_pt = pd.pivot_table(
    grouped_data.reset_index(),
    index='minute_intervals',
    columns='step_intervals',
    values='rating_average',
    aggfunc='mean',
)

### Assessment of Missingness

The NMAR assessment is written up on the website only; there is no accompanying code for it in this notebook.
Most likely, the "description" column is NMAR. While it is possible that descriptions may be missing because the title is self-explanatory, there are descriptions that have nothing to do with the title or the food item itself. This implies that descriptions aren't missing dependent on other columns. They could be missing dependent on themselves because the owner of the recipe felt that they couldn't think of a description that warranted writing it in the first place. The owner felt that there was nothing important to say about the recipe, so there was no need to write a description. The missingness of description is dependent on the descriptions themselves.

In [None]:
# Column whose missingness is studied: rating_individual.
# Hypothesis: its missingness depends on minutes,
# and does not depend on name.
final_reviews.head(n=5)

In [None]:
# Split the (minutes, rating) rows by whether the individual rating is missing.
missingness1 = final_reviews[['minutes', 'rating_individual']]
rating_is_missing = missingness1['rating_individual'].isna()
rating_missing_1 = missingness1[rating_is_missing]
rating_not_missing_1 = missingness1[~rating_is_missing]

# One histogram of `minutes` per group (built here but not rendered).
fig = px.histogram(rating_missing_1, 'minutes')
fig2 = px.histogram(rating_not_missing_1, 'minutes')
# fig.show(), fig2.show()

# Observed statistic: absolute difference between the two groups' mean minutes.
ratings_missing_mean = rating_missing_1['minutes'].mean()
ratings_not_missing_mean = rating_not_missing_1['minutes'].mean()
observed = abs(ratings_missing_mean - ratings_not_missing_mean)

In [None]:
reps = 100
def run_perm(df, N):
    """Simulate the absolute difference in mean `minutes` under shuffled missingness.

    The missing/not-missing status of `rating_individual` is shuffled across
    rows N times. Only that status matters to the statistic, so the boolean
    mask is computed once and permuted, instead of deep-copying the whole
    frame and permuting the rating column on every iteration.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'minutes' and 'rating_individual'. It is
        not modified.
    N : int
        Number of permutations to run.

    Returns
    -------
    list
        N values of |mean minutes (rating missing) - mean minutes (rating present)|.
    """
    minutes = df['minutes']
    is_missing = df['rating_individual'].isna().to_numpy()
    diffs = []
    for _ in range(N):
        shuffled_missing = np.random.permutation(is_missing)
        # Boolean arrays index the Series by position.
        missing_mean = minutes[shuffled_missing].mean()
        present_mean = minutes[~shuffled_missing].mean()
        diffs.append(abs(missing_mean - present_mean))
    return diffs

In [None]:
# Use the `reps` constant set next to `run_perm` instead of repeating 100.
arr = run_perm(final_reviews, reps)

In [None]:
# p-value: share of permuted differences at least as large as the observed one.
np.mean(np.asarray(arr) >= observed)

In [None]:
def season(date):
    """Return the calendar-quarter label for a date-like value.

    Parameters
    ----------
    date : object with a `.month` attribute (e.g. pandas.Timestamp), or a
        missing value such as pandas.NaT.

    Returns
    -------
    str or float
        'Q1' for months 1-3, 'Q2' for 4-6, 'Q3' for 7-9, 'Q4' for 10-12;
        NaN when `date` is missing.
    """
    # NaT.month is NaN and every `<` comparison against NaN is False, so
    # without this guard a missing date would be labelled 'Q4'.
    if pd.isna(date):
        return np.nan
    return f'Q{(date.month - 1) // 3 + 1}'
# Take an explicit copy so that adding `quarter` does not write into a slice
# of `final_reviews` (which raises pandas' SettingWithCopyWarning).
missingness2 = final_reviews[['date', 'rating_individual']].copy()
missingness2['quarter'] = missingness2['date'].transform(season)
missingness2

In [None]:
fig = px.histogram(missingness2[['rating_individual', 'quarter']], 'quarter', 'rating_individual', nbins=10, histfunc='count')
# Only a cell's last expression is rendered automatically, so a bare `fig`
# in the middle of the cell shows nothing; display the figure explicitly.
fig.show()

# Count of rows per (quarter, rating value), then normalise each rating
# column so it sums to 1 across quarters.
pt = missingness2.pivot_table(index='quarter', columns='rating_individual', values='date', aggfunc='count')
pt / pt.sum()

In [None]:
# Distribution of `quarter` among rows with / without an individual rating.
rating_missing_2 = missingness2[missingness2['rating_individual'].isna()]['quarter'].value_counts(normalize=True)
rating_not_missing_2 = missingness2[~missingness2['rating_individual'].isna()]['quarter'].value_counts(normalize=True)

# Subtract with fill_value=0 so a quarter present in only one group counts
# with its full proportion instead of becoming NaN and being skipped by sum().
quarter_diff = rating_missing_2.sub(rating_not_missing_2, fill_value=0)

# Observed total variation distance between the two distributions.
obs = quarter_diff.abs().sum() / 2

quarter_diff

In [None]:
# Take an explicit copy so that adding `num_tags` does not write into a slice
# of `final_reviews` (which raises pandas' SettingWithCopyWarning).
tag = final_reviews[['tags', 'rating_individual']].copy()
tag['num_tags'] = tag['tags'].apply(len)

# Observed statistic: absolute difference in mean tag count between rows
# with a missing rating and rows with a rating.
rm_2 = tag[tag['rating_individual'].isna()]['num_tags'].mean()
rnm_2 = tag[~tag['rating_individual'].isna()]['num_tags'].mean()
obs = abs(rm_2 - rnm_2)

# One trial shuffle of the ratings to eyeball a single simulated statistic.
tag_copy = tag.copy()
tag_copy['rating_individual'] = np.random.permutation(tag_copy['rating_individual'])
rm_2 = tag_copy[tag_copy['rating_individual'].isna()]['num_tags'].mean()
rnm_2 = tag_copy[~tag_copy['rating_individual'].isna()]['num_tags'].mean()
abs(rm_2 - rnm_2)

In [None]:
# Compare the mean user_id of rows with a missing rating against rows with one.
crazy = final_reviews[['user_id', 'rating_individual']]
no_rating = crazy['rating_individual'].isna()

rm3 = crazy.loc[no_rating, 'user_id'].mean()
rnm3 = crazy.loc[~no_rating, 'user_id'].mean()
cra_obs = abs(rm3 - rnm3)

rm3, rnm3, cra_obs

In [None]:
def run_perm_2(df, col, N):
    """Simulate the total variation distance (TVD) of `col` under shuffled missingness.

    The missing/not-missing status of `rating_individual` is shuffled across
    rows N times; each time the distribution of `col` in the two groups is
    compared with the TVD.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `col` and 'rating_individual'. It is not modified.
    col : str
        Name of the categorical column whose distributions are compared.
    N : int
        Number of permutations to run.

    Returns
    -------
    numpy.ndarray
        N simulated TVD values.
    """
    values = df[col]
    is_missing = df['rating_individual'].isna().to_numpy()
    tvds = []
    for _ in range(N):
        shuffled_missing = np.random.permutation(is_missing)
        dist_missing = values[shuffled_missing].value_counts(normalize=True)
        dist_present = values[~shuffled_missing].value_counts(normalize=True)
        # fill_value=0: a category seen in only one group must contribute its
        # full proportion; a plain `-` would yield NaN, which sum() skips.
        tvd = dist_missing.sub(dist_present, fill_value=0).abs().sum() / 2
        tvds.append(tvd)
    return np.array(tvds)

In [None]:
def run_means_perm(df, col, N):
    """Simulate the absolute difference in mean `col` under shuffled missingness.

    The missing/not-missing status of `rating_individual` is shuffled across
    rows N times. The mask is computed once and permuted, which avoids
    deep-copying the whole frame and recomputing `isna()` on every iteration.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `col` and 'rating_individual'. It is not modified.
    col : str
        Name of the numeric column whose group means are compared.
    N : int
        Number of permutations to run.

    Returns
    -------
    numpy.ndarray
        N values of |mean of col (rating missing) - mean of col (rating present)|.
    """
    values = df[col]
    is_missing = df['rating_individual'].isna().to_numpy()
    difs = []
    for _ in range(N):
        shuffled_missing = np.random.permutation(is_missing)
        # Boolean arrays index the Series by position.
        rm2 = values[shuffled_missing].mean()
        rnm2 = values[~shuffled_missing].mean()
        difs.append(abs(rm2 - rnm2))
    return np.array(difs)

In [None]:
# 100 shuffles; each entry is a simulated TVD of the `quarter` distributions.
arr = run_perm_2(df=missingness2, col='quarter', N=100)
arr[:10]

In [None]:
# 10 shuffles; each entry is a simulated difference in mean `num_tags`.
arr = run_means_perm(df=tag, col='num_tags', N=10)
arr[:10]

In [None]:
# 100 shuffles; each entry is a simulated difference in mean `user_id`.
arr = run_means_perm(df=crazy, col='user_id', N=100)
arr[:10]

In [None]:
# `arr` holds the user_id permutation statistics at this point, so compare it
# with the user_id observed statistic `cra_obs`. The previous `obs` was last
# assigned for num_tags and belongs to a different test.
(arr >= cra_obs).mean()

### Hypothesis Testing

Null: The average rating of a recipe is not related to the cooking duration of that recipe.

Alternate: The average rating of a recipe decreases the longer the cooking duration of that recipe is.

Test Statistic: The absolute difference in average cooking duration between low-rated recipes (average rating in [1, 3.5)) and high-rated recipes (average rating in [3.5, 5])

In [None]:
# Per-recipe mean cooking time and mean rating.
grouped = final_reviews.groupby('name')[['minutes', 'rating_average']].mean()

# Recipes with no rating have a NaN average. `NaN >= 3.5` is False, so without
# this filter every unrated recipe would be counted as 'Low'.
grouped = grouped.dropna(subset=['rating_average'])
grouped['High or Low'] = np.where(grouped['rating_average'] >= 3.5, 'High', 'Low')

# groupby sorts the labels (High, Low), so the last row of diff() is
# Low - High; its absolute `minutes` value is the observed statistic.
obs = grouped.groupby('High or Low').mean().diff().abs().iloc[-1]['minutes']

In [None]:
# Permutation test: shuffle the High/Low labels 1000 times and recompute the
# absolute difference in mean minutes between the two groups each time.
diffs = []
for _ in range(1000):
    shuffled_labels = np.random.permutation(grouped['High or Low'])
    grouped_copy = grouped.assign(**{'High or Low': shuffled_labels})
    group_means = grouped_copy.groupby('High or Low').mean()
    diff = group_means.diff().abs().iloc[-1]['minutes']
    diffs.append(diff)
diffs = np.array(diffs)

In [None]:
# p-value: share of simulated differences at least as large as the observed one.
np.mean(diffs >= obs)

Since we got a p-value of 0.0, we reject the null hypothesis; there is evidence that the average rating of a recipe decreases as the cooking duration of that recipe increases.