# DSC 80 Final Project

**Name(s)**: Borui Lin, Junshu Xin

**Website Link**: https://github.com/boruilin/Number-of-Step-and-Ratings-in-Recipe.git

In [26]:
import pandas as pd
import numpy as np
from itertools import combinations
from pathlib import Path
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, Binarizer
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import plot_tree
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
pd.options.plotting.backend = 'plotly'
%matplotlib inline
from dsc80_utils import *


## Step 1: Introduction

In [27]:
# TODO

## Step 2: Data Cleaning and Exploratory Data Analysis

In [28]:
# Importing the data csv
recipe = '/Users/albertlin/Desktop/dsc80-2024-fa/projects/project04/food_data/RAW_recipes.csv'
inter = '/Users/albertlin/Desktop/dsc80-2024-fa/projects/project04/food_data/RAW_interactions.csv'
# Load the CSV file into a DataFrame
recipes_data=pd.read_csv(recipe)
inter_data=pd.read_csv(inter)
# merging the data and replacing the 0s in ratings with nan
rated_recipes = recipes_data.merge(inter_data, left_on = 'id', right_on = 'recipe_id', how = 'left').drop(columns=['id'])
rated_recipes['rating'] = rated_recipes['rating'].replace(0, np.nan)
#Adding a column for average rating
average_rating = rated_recipes.groupby('recipe_id')['rating'].mean().reset_index()
average_rating.rename(columns={'rating': 'average_rating'}, inplace=True)
rated_recipes = rated_recipes.merge(average_rating, on = 'recipe_id', how = 'left')
#Adding a boolean column indicating whether n_steps is larger than 9
rated_recipes['high_step'] = rated_recipes['n_steps']>9


#Histogram of Distribution of number of steps in recipes
# plt.figure(figsize=(10, 6))
# plt.hist(recipes_data['n_steps'], bins=30, edgecolor='black', alpha=0.7)
# plt.title('Histogram of Number of Steps in Recipes', fontsize=16)
# plt.xlabel('Number of Steps', fontsize=14)
# plt.ylabel('Frequency', fontsize=14)
# plt.grid(axis='y', alpha=0.75)
# plt.show()



In [29]:
x=rated_recipes.columns[rated_recipes.isna().any()].tolist()
y=rated_recipes['review'].isna().sum()
rated_recipes['n_ingredients'].value_counts()

n_ingredients
8     25553
9     24716
7     24406
      ...  
32        4
33        1
37        1
Name: count, Length: 34, dtype: int64

In [41]:
obs2, obs1

(np.float64(0.04826454186632742), np.float64(0.06701035472139882))

## Step 3: Assessment of Missingness

In [40]:
# Find out the columns that contains missing values
na_counts = rated_recipes.isna().sum()
na_counts_in_cols_with_na = na_counts[na_counts > 0]

# Filter for columns with at least one missing value
na_counts_in_cols_with_na = na_counts[na_counts > 0]
# The description column contains some missing values, the missing mechanism is mostly like NMAR
rated_recipes['no description']=rated_recipes['description'].isna()
rated_recipes['no rating']=rated_recipes['rating'].isna()
rated_recipes['submitted'] = pd.to_datetime(rated_recipes['submitted'], errors='coerce')
current_year = datetime.now().year
rated_recipes['years_since_submission'] = current_year - rated_recipes['submitted'].dt.year

# permutation test for determinig whether rating is MAR on n_steps
stats1, obs1=permutation_test(rated_recipes, "n_steps",'no rating', ks)
pval_1=np.mean(stats1>=obs1) 
# permutation test for determinig whether rating is MAR on years_since_submission
stats2, obs2=permutation_test(rated_recipes, "years_since_submission",'no rating', tvd)
pval_2=np.mean(stats2>=obs2) 




# rating is MAR depended on n_steps, n_ingredients (diff_in_means), and years_since_submission (ks) 


np.float64(0.04826454186632742)

In [36]:
#Graph for Distribution of Number of Steps by Missingness of Rating
fig = create_kde_plotly(
    rated_recipes,
    group_col='no rating',
    group1=True,
    group2=False,
    vals_col='n_steps',
    title="Distribution of Number of Steps by Missingness of Rating",
    x_label="Number of Steps",
    y_label="Density",
    legend_title="Missing Rating",
    colors=['red', 'blue']
)
plotly.offline.plot(fig, filename='n_step_dist_plot.html', auto_open=True) 

'n_step_dist_plot.html'

In [37]:
#Graph for Distribution of # Years Since Submission by Missingness of Rating
fig = create_kde_plotly(
    rated_recipes,
    group_col='no rating',
    group1=True,
    group2=False,
    vals_col='years_since_submission',
    title="Distribution of # Years Since Submission by Missingness of Rating",
    x_label="Number of Years Since Submission",
    y_label="Density",
    legend_title="Missing Rating",
    colors=['red', 'blue']
)
plotly.offline.plot(fig, filename='#_year_dist_plot.html', auto_open=True) 


'#_year_dist_plot.html'

In [43]:
# KS Stats in Number of Step plot
# Generate the histogram for the permutation test statistics
fig = px.histogram(
    pd.DataFrame(stats1, columns=["Permutation Statistic"]),  # Convert stats to DataFrame
    x="Permutation Statistic", 
    nbins=50, 
    histnorm="probability", 
    title="Custom Title for KS Statistic Distribution"  # Set custom title here
)

# Add a vertical line for the observed statistic
fig.add_vline(
    x=obs1, 
    line_color="red", 
    line_width=2, 
    opacity=1,
    annotation_text=f"Observed = {round(obs, 4)}",
    annotation_position="top right"
)

# Adjust layout and axis ranges
fig.update_layout(
    xaxis_range=[0, 0.08],  # Adjusted range to match your desired shape
    yaxis_range=[0, 0.12],  # Adjusted to maintain proportionality
    xaxis_title="KS Statistic",  # Set custom x-axis title
    yaxis_title="Probability",   # Set custom y-axis title
    title=dict(
        text="KS Distribution in Number of Steps",  # Custom title
        x=0.5,  # Center-align the title
        xanchor="center"
    )
)

# Show the plot
plotly.offline.plot(fig, filename='n_step_ks_plot.html', auto_open=True) 

'n_step_ks_plot.html'

In [42]:
# TVD in Number of Step plot
# Generate the histogram for the permutation test statistics
fig = px.histogram(
    pd.DataFrame(stats2, columns=["Permutation Statistic"]),  # Convert stats to DataFrame
    x="Permutation Statistic", 
    nbins=50, 
    histnorm="probability", 
    title="Custom Title for KS Statistic Distribution"  # Set custom title here
)

# Add a vertical line for the observed statistic
fig.add_vline(
    x=obs2, 
    line_color="red", 
    line_width=2, 
    opacity=1,
    annotation_text=f"Observed = {round(obs2, 4)}",
    annotation_position="top right"
)

# Adjust layout and axis ranges
fig.update_layout(
    xaxis_range=[0, 0.08],  # Adjusted range to match your desired shape
    yaxis_range=[0, 0.12],  # Adjusted to maintain proportionality
    xaxis_title="TVD",  # Set custom x-axis title
    yaxis_title="Probability",   # Set custom y-axis title
    title=dict(
        text="TVD Distribution in Years_Since_Submission",  # Custom title
        x=0.5,  # Center-align the title
        xanchor="center"
    )
)

# Show the plot
plotly.offline.plot(fig, filename='years_since_submission_tvd_plot.html', auto_open=True) 

'years_since_submission_tvd_plot.html'

## Step 4: Hypothesis Testing

'n_step_ks_plot.html'

In [11]:
def permutation_test_diff_means(group1, group2, n_permutations=1000): 
    # Calculate the observed difference in means
    observed_diff = np.mean(group1) - np.mean(group2)
    
    # Combine the data from both groups
    combined = np.concatenate([group1, group2])
    
    # Initialize an array to store permuted differences
    permuted_diffs = []
    
    # Perform permutations
    for _ in range(n_permutations):
        # Shuffle the combined data
        np.random.shuffle(combined)
        
        # Split the shuffled data into two groups of the same sizes as the original groups
        permuted_group1 = combined[:len(group1)]
        permuted_group2 = combined[len(group1):]
        
        # Calculate the difference in means for the permuted data
        permuted_diff = np.mean(permuted_group1) - np.mean(permuted_group2)
        permuted_diffs.append(permuted_diff)
    
    # Calculate the p-value as the proportion of permuted differences
    # that are as extreme as or more extreme than the observed difference
    permuted_diffs = np.array(permuted_diffs)
    p_value = np.mean(np.abs(permuted_diffs) >= np.abs(observed_diff))
    
    return observed_diff, p_value
#  merge the recipe and the 

rated_recipes
# Split into groups based on median
median_n_steps = rated_recipes['n_steps'].median()
rated_recipes['steps_group'] = np.where(rated_recipes['n_steps'] <= median_n_steps, 'low_steps', 'high_steps')
low_steps=rated_recipes[rated_recipes['steps_group'] == 'low_steps']
high_steps=rated_recipes[rated_recipes['steps_group'] == 'high_steps']
# Extract ratings for the two groups
low_steps_ratings =low_steps['average_rating']
high_steps_ratings = high_steps['average_rating']
observed_diff, p_value = permutation_test_diff_means(high_steps_ratings, low_steps_ratings)

observed_diff, p_value


(np.float64(-0.005233659109012301), np.float64(0.0))

## Step 5: Framing a Prediction Problem

In [12]:
# TODO
# We are attempting to predict the average rating using different columns (existing ones and the ones we encoded)
rated_recipes['for_large_group'] = rated_recipes['tags'].str.contains('for-large-groups')
rated_recipes['has_fish'] = rated_recipes['ingredients'].str.contains('fish')
rated_recipes['submitted'] = pd.to_datetime(rated_recipes['submitted'], errors='coerce')
current_year = datetime.now().year
rated_recipes['years_since_submission'] = current_year - rated_recipes['submitted'].dt.year
rated_recipes

Unnamed: 0,name,minutes,contributor_id,submitted,...,years_since_submission,steps_group,for_large_group,has_fish
0,1 brownies in the world best ever,40,985201,2008-10-27,...,16,high_steps,True,False
1,1 in canada chocolate chip cookies,45,1848091,2011-04-11,...,13,high_steps,True,False
2,412 broccoli casserole,40,50969,2008-05-30,...,16,low_steps,False,False
...,...,...,...,...,...,...,...,...,...
234426,cookies by design sugar shortbread cookies,20,506822,2008-04-15,...,16,low_steps,True,False
234427,cookies by design sugar shortbread cookies,20,506822,2008-04-15,...,16,low_steps,True,False
234428,cookies by design sugar shortbread cookies,20,506822,2008-04-15,...,16,low_steps,True,False


## Step 6: Baseline Model

In [13]:

rated_recipes_filtered=rated_recipes.dropna(subset=['rating'])
# Define features and target

features = rated_recipes_filtered[['n_steps','minutes', 'years_since_submission']]  # Example numeric features
target =rated_recipes_filtered['average_rating'].astype(int)  # Ratings as target classes (1-5)

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=50, max_depth=7, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the test set
predictions = rf_classifier.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, predictions, average='weighted')  # Weighted for class imbalance
accuracy = accuracy_score(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

print(f"F1 Score (Weighted): {f1}")
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)


F1 Score (Weighted): 0.5436764965306647
Accuracy: 0.5715490325668315
Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       154
           2       0.00      0.00      0.00       239
           3       0.00      0.00      0.00      1751
           4       0.58      0.77      0.66     22759
           5       0.56      0.40      0.47     18976

    accuracy                           0.57     43879
   macro avg       0.23      0.23      0.23     43879
weighted avg       0.54      0.57      0.54     43879




Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [46]:
# best_f1 = 0
# best_features = None
# feature_columns = ['n_steps', 'minutes', 'n_ingredients', 'for_large_group',]
# # Iterate over all combinations of two features
# for feature_pair in combinations(feature_columns, 2):
#     # Select the feature pair
#     features = rated_recipes_filtered[list(feature_pair)]
#     target = rated_recipes_filtered['average_rating'].astype(int)  # Use ratings as the target variable

#     # Split the data
#     X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

#     # Train the model
#     rf_classifier = RandomForestClassifier(n_estimators=20, max_depth=5, random_state=42)
#     rf_classifier.fit(X_train, y_train)

#     # Predict on the test set
#     predictions = rf_classifier.predict(X_test)

#     # Calculate the F1 score
#     f1 = f1_score(y_test, predictions, average='weighted')

#     # Update the best F1 score and features if current F1 is better
#     if f1 > best_f1:
#         best_f1 = f1
#         best_features = feature_pair

# print(f"Best F1 Score: {best_f1}")
# print(f"Best Feature Pair: {best_features}")



## Step 7: Final Model

In [None]:
# TODO

## Step 8: Fairness Analysis

In [None]:
# TODO