In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'

In [91]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, CategoricalNB
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Data cleaning

In [4]:
# load the data (from project 3)
fp_recipe = os.path.join('data', 'RAW_recipes.csv')
fp_rating = os.path.join('data', 'RAW_interactions.csv')
recipe = pd.read_csv(fp_recipe)
rating = pd.read_csv(fp_rating)
print(rating.shape)
print(recipe.shape)

(731927, 5)
(83782, 12)


In [5]:
data = recipe.merge(rating, left_on='id', right_on='recipe_id', how='left')
data['rating'] = data['rating'].replace(0, np.NaN)
data['avg_rating'] = data.groupby('recipe_id')['rating'].transform(np.mean)
data.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,user_id,recipe_id,date,rating,review,avg_rating
0,1 brownies in the world best ever,333281,40,985201,2008-10-27,"['60-minutes-or-less', 'time-to-make', 'course...","[138.4, 10.0, 50.0, 3.0, 3.0, 19.0, 6.0]",10,['heat the oven to 350f and arrange the rack i...,"these are the most; chocolatey, moist, rich, d...","['bittersweet chocolate', 'unsalted butter', '...",9,386585.0,333281.0,2008-11-19,4.0,"These were pretty good, but took forever to ba...",4.0
1,1 in canada chocolate chip cookies,453467,45,1848091,2011-04-11,"['60-minutes-or-less', 'time-to-make', 'cuisin...","[595.1, 46.0, 211.0, 22.0, 13.0, 51.0, 26.0]",12,"['pre-heat oven the 350 degrees f', 'in a mixi...",this is the recipe that we use at my school ca...,"['white sugar', 'brown sugar', 'salt', 'margar...",11,424680.0,453467.0,2012-01-26,5.0,Originally I was gonna cut the recipe in half ...,5.0
2,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,"['frozen broccoli cuts', 'cream of chicken sou...",9,29782.0,306168.0,2008-12-31,5.0,This was one of the best broccoli casseroles t...,5.0


In [6]:
data_cleaned = data[data['minutes'] <= 1000].copy()
nutrition_lst = [i.strip('][').split(', ') for i in data_cleaned['nutrition'].to_list()]
data_cleaned[['calories (#)', 'total fat (PDV)', 'sugar (PDV)', 'sodium (PDV)', \
      'protein (PDV)', 'saturated fat (PDV)', 'carbohydrates (PDV)']] \
    = pd.DataFrame(nutrition_lst, index = data_cleaned.index)
data_cleaned

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,...,rating,review,avg_rating,calories (#),total fat (PDV),sugar (PDV),sodium (PDV),protein (PDV),saturated fat (PDV),carbohydrates (PDV)
0,1 brownies in the world best ever,333281,40,985201,2008-10-27,"['60-minutes-or-less', 'time-to-make', 'course...","[138.4, 10.0, 50.0, 3.0, 3.0, 19.0, 6.0]",10,['heat the oven to 350f and arrange the rack i...,"these are the most; chocolatey, moist, rich, d...",...,4.0,"These were pretty good, but took forever to ba...",4.0,138.4,10.0,50.0,3.0,3.0,19.0,6.0
1,1 in canada chocolate chip cookies,453467,45,1848091,2011-04-11,"['60-minutes-or-less', 'time-to-make', 'cuisin...","[595.1, 46.0, 211.0, 22.0, 13.0, 51.0, 26.0]",12,"['pre-heat oven the 350 degrees f', 'in a mixi...",this is the recipe that we use at my school ca...,...,5.0,Originally I was gonna cut the recipe in half ...,5.0,595.1,46.0,211.0,22.0,13.0,51.0,26.0
2,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,...,5.0,This was one of the best broccoli casseroles t...,5.0,194.8,20.0,6.0,32.0,22.0,36.0,3.0
3,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,...,5.0,I made this for my son's first birthday party ...,5.0,194.8,20.0,6.0,32.0,22.0,36.0,3.0
4,412 broccoli casserole,306168,40,50969,2008-05-30,"['60-minutes-or-less', 'time-to-make', 'course...","[194.8, 20.0, 6.0, 32.0, 22.0, 36.0, 3.0]",6,"['preheat oven to 350 degrees', 'spray a 2 qua...",since there are already 411 recipes for brocco...,...,5.0,Loved this. Be sure to completely thaw the br...,5.0,194.8,20.0,6.0,32.0,22.0,36.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234424,zydeco ya ya deviled eggs,308080,40,37779,2008-06-07,"['60-minutes-or-less', 'time-to-make', 'course...","[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]",7,"['in a bowl , combine the mashed yolks and may...","deviled eggs, cajun-style",...,5.0,These were very good. I meant to add some jala...,5.0,59.2,6.0,2.0,3.0,6.0,5.0,0.0
234425,cookies by design cookies on a stick,298512,29,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]",9,['place melted butter in a large mixing bowl a...,"i've heard of the 'cookies by design' company,...",...,1.0,I would rate this a zero if I could. I followe...,1.0,188.0,11.0,57.0,11.0,7.0,21.0,9.0
234426,cookies by design sugar shortbread cookies,298509,20,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]",5,"['whip sugar and shortening in a large bowl , ...","i've heard of the 'cookies by design' company,...",...,1.0,This recipe tastes nothing like the Cookies by...,3.0,174.9,14.0,33.0,4.0,4.0,11.0,6.0
234427,cookies by design sugar shortbread cookies,298509,20,506822,2008-04-15,"['30-minutes-or-less', 'time-to-make', 'course...","[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]",5,"['whip sugar and shortening in a large bowl , ...","i've heard of the 'cookies by design' company,...",...,5.0,"yummy cookies, i love this recipe me and my sm...",3.0,174.9,14.0,33.0,4.0,4.0,11.0,6.0


# Imbalance Issue

### The original dataset shows the imbalance issue. For the rating column, there are around 77% are rating 5. 

In [7]:
data_cleaned['rating'].value_counts(normalize=True)

5.0    0.773098
4.0    0.170366
3.0    0.032727
1.0    0.013054
2.0    0.010755
Name: rating, dtype: float64

In [8]:
# train val split
data=data_cleaned.drop(columns=['id', 'contributor_id', 'submitted', 'tags', 'nutrition', 'steps', \
                                         'description', 'user_id', 'recipe_id', 'date', 'ingredients', 'avg_rating'])
data=data.dropna()
X, y = data.drop(columns=['name', 'rating']), data['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# using the unbalanced data to train the model
X_train_unbalanced = X_train[['n_steps', 'review']].copy()
y_train_unbalanced = y_train.copy()

In [10]:
count_vect1 = CountVectorizer()
classifier = MultinomialNB(alpha=1)
preproc=ColumnTransformer(transformers=[
        ('vectorizer', count_vect1, 'review')],
        remainder='passthrough'
        )
pl = Pipeline([
        ('preprocessor', preproc),
        ('classifier', classifier)
    ])

In [11]:
pl.fit(X_train_unbalanced, y_train_unbalanced)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('vectorizer',
                                                  CountVectorizer(),
                                                  'review')])),
                ('classifier', MultinomialNB(alpha=1))])

### The model trained by unbalancded data shows an accuracy of 77%, which is the proportion of rating 5 in the rating. Although the accuracy seems high, the low F1-score shows the poor performance of the model.

In [12]:
pl.score(X_test, y_test)

0.779328582972159

In [13]:
f1_score(pl.predict(X_test), y_test, average='macro')

0.3377565854282478

### So we decided to balance the training set

In [14]:
# balance training set (by combining upsampling and downsampling)
balanced_train=X_train
balanced_train['rating']=y_train
balanced_train=balanced_train.groupby('rating').sample(20000, replace=True)
X_train_balanced, y_train_balanced=balanced_train.drop(columns=['rating']).iloc[:, :4], balanced_train['rating']
X_train_balanced.head()

Unnamed: 0,minutes,n_steps,n_ingredients,review
116465,60,5,14,"I&#039;m sorry, but this tastes absolutely not..."
96308,40,8,12,"I was excited about this recipe , but didn't e..."
85061,20,11,8,The only reason this recipe gets ANY stars is ...
21591,45,5,7,"I followed the recipe exactly, but this turned..."
130648,45,9,6,This is really nasty stuff-and I can&#039;t th...


In [15]:
balanced_train['rating'].value_counts(normalize=True)

1.0    0.2
2.0    0.2
3.0    0.2
4.0    0.2
5.0    0.2
Name: rating, dtype: float64

# Baseline model

In [16]:
# we choose 'n_steps' and 'reviews' as the features, using Naive Bayes Multinomial Classifier
X_train_baseline = X_train_balanced[['n_steps', 'review']]
y_train_baseline = y_train_balanced.copy()

In [17]:
X_train_baseline.head()

Unnamed: 0,n_steps,review
116465,5,"I&#039;m sorry, but this tastes absolutely not..."
96308,8,"I was excited about this recipe , but didn't e..."
85061,11,The only reason this recipe gets ANY stars is ...
21591,5,"I followed the recipe exactly, but this turned..."
130648,9,This is really nasty stuff-and I can&#039;t th...


In [18]:
# We use count vectorizer to perform bag of words on the review
count_vect1 = CountVectorizer()
classifier = MultinomialNB(alpha=1)
preproc=ColumnTransformer(transformers=[
        ('vectorizer', count_vect1, 'review')],
        remainder='passthrough'
        )
pl = Pipeline([
        ('preprocessor', preproc),
        ('classifier', classifier)
    ])

In [19]:
pl.fit(X_train_baseline, y_train_baseline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('vectorizer',
                                                  CountVectorizer(),
                                                  'review')])),
                ('classifier', MultinomialNB(alpha=1))])

### This baseline model achieves the test accuracy of 66%. Compared to the model using unbalanced training data, the F1-Score for this baseline model has an improvement. 

In [20]:
pl.score(X_train_baseline, y_train_baseline), pl.score(X_test, y_test)

(0.73581, 0.6636670829354294)

In [21]:
f1_score(pl.predict(X_test), y_test, average='macro')

0.3893117356823762

# Final model

In [27]:
# we add 'minutes' and 'ingredients' as two new features to build the final model
X_train_balanced

Unnamed: 0,minutes,n_steps,n_ingredients,review
116465,60,5,14,"I&#039;m sorry, but this tastes absolutely not..."
96308,40,8,12,"I was excited about this recipe , but didn't e..."
85061,20,11,8,The only reason this recipe gets ANY stars is ...
21591,45,5,7,"I followed the recipe exactly, but this turned..."
130648,45,9,6,This is really nasty stuff-and I can&#039;t th...
...,...,...,...,...
3244,2,4,6,Nice flavor - a group of my favorite things al...
192838,50,13,12,I had to make this without the chilis in adobo...
117579,45,15,8,"Very simple, basic bread, great for soup. I c..."
149416,120,41,19,"Every Halloween through Christmas, my mother s..."


In [28]:
#We add three hyperparameters in count vectorizer and manually search for the values.
# We also encoding 'minutes', 'n_steps', and 'n_ingredients' by KBinsDiscretizer
clf = MultinomialNB(alpha=1)
count_vect1 = CountVectorizer(min_df=2, max_df=10000, ngram_range=(1, 3))

preproc=ColumnTransformer(transformers=[
        ('vectorizer', count_vect1, 'review'),
        ('std', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile'), ['minutes', 'n_steps', 'n_ingredients'])],
        remainder='passthrough'
        )
pl = Pipeline([
        ('preprocessor', preproc),
        ('classifier', clf)
    ])

In [88]:
pl.fit(X_train_balanced, y_train_balanced)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('vectorizer',
                                                  CountVectorizer(max_df=10000,
                                                                  min_df=2,
                                                                  ngram_range=(1,
                                                                               3)),
                                                  'review'),
                                                 ('std',
                                                  KBinsDiscretizer(encode='ordinal',
                                                                   n_bins=10),
                                                  ['minutes', 'n_steps',
                                                   'n_ingredients'])])),
                ('classifier', MultinomialNB(alpha=1))])

### The final model achieves the test accuracy of 78%, compared to 66% accuracy from baseline model. The accuracy also exceeds the 77%(can be easily achieved if we make all the predictions rating 5). The F1_score also has an improvement to 0.5.

In [25]:
pl.score(X_train_balanced, y_train_balanced), pl.score(X_test, y_test)

(0.9638, 0.7878498494086535)

In [26]:
f1_score(pl.predict(X_test), y_test, average='macro')

0.5040893816864507

# Fairness Analysis

- Null Hypothesis: Our model is fair. Its precision for low calories' food and high calories' food are roughly the same, and any difference are due by chance. 
- Alternative Hypothesis: Out model is unfair. Its precision for high calories' food is different from low calories' food.

In [81]:
# we decide to use 400 calories as the threshold, due a dish has calories higher than 400 can be high calories' food
fairness_df=data.copy()
fairness_df['calories (#)'] = fairness_df['calories (#)'].astype(float)
low_cal=fairness_df.loc[fairness_df['calories (#)']<400, ['minutes', 'n_steps', 'n_ingredients', 'review', 'rating']]
high_cal=fairness_df.loc[fairness_df['calories (#)']>=400, ['minutes', 'n_steps', 'n_ingredients', 'review', 'rating']]

In [82]:
low_cal.head()

Unnamed: 0,minutes,n_steps,n_ingredients,review,rating
0,40,10,9,"These were pretty good, but took forever to ba...",4.0
2,40,6,9,This was one of the best broccoli casseroles t...,5.0
3,40,6,9,I made this for my son's first birthday party ...,5.0
4,40,6,9,Loved this. Be sure to completely thaw the br...,5.0
5,40,6,9,"5 stars from my husband and son, my toughest c...",5.0


In [83]:
high_cal.head()

Unnamed: 0,minutes,n_steps,n_ingredients,review,rating
1,45,12,11,Originally I was gonna cut the recipe in half ...,5.0
6,120,7,7,don't let the calories and fat grams scare you...,5.0
14,50,10,8,This was wonderful momaphet. We enjoyed this ...,5.0
15,40,14,10,This was great. A nice change from plain old c...,5.0
30,68,11,12,Made for dinner yesterday and yummmo! I did c...,5.0


In [101]:
low_cal_predict=pl.predict(low_cal.drop(columns=['rating']))
low_cal_predict

array([4., 5., 5., ..., 1., 1., 5.])

In [102]:
low_cal_precision=precision_score(low_cal['rating'], low_cal_predict, average='macro')

In [103]:
high_cal_predict=pl.predict(high_cal.drop(columns=['rating']))
high_cal_predict

array([5., 5., 5., ..., 5., 5., 4.])

In [104]:
high_cal_precision=precision_score(high_cal['rating'], high_cal_predict, average='macro')

In [109]:
test_stat=np.abs(low_cal_precision-high_cal_precision)
test_stat

0.0031981001744351145

In [113]:
df = data.copy()
df['calories (#)']=df['calories (#)'].astype(float)
diff=[]
for _ in range(100):
    df['calories #']=np.random.permutation(df['calories (#)'])
    lc_df=df.loc[df['calories (#)']<400, ['minutes', 'n_steps', 'n_ingredients', 'review', 'rating']]
    hc_df=df.loc[df['calories (#)']>=400, ['minutes', 'n_steps', 'n_ingredients', 'review', 'rating']]
    lc_pred=pl.predict(lc_df.drop(columns=['rating']))
    hc_pred=pl.predict(hc_df.drop(columns=['rating']))
    stat=precision_score(lc_df['rating'], lc_pred, average='macro')-precision_score(hc_df['rating'], hc_pred, average='macro')
    diff.append(np.abs(stat))
np.mean(diff>test_stat)

KeyboardInterrupt: 