In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
raw_data = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv')

In [3]:
raw_data.columns

Index(['title', 'rating', 'calories', 'protein', 'fat', 'sodium', '#cakeweek',
       '#wasteless', '22-minute meals', '3-ingredient recipes',
       ...
       'yellow squash', 'yogurt', 'yonkers', 'yuca', 'zucchini', 'cookbooks',
       'leftovers', 'snack', 'snack week', 'turkey'],
      dtype='object', length=680)

In [4]:
raw_data.rating.unique()

array([2.5  , 4.375, 3.75 , 5.   , 3.125, 1.875, 0.   , 1.25 ])

In [5]:
raw_data.isnull().sum().sort_values(ascending=False).head(10)

fat               4183
protein           4162
sodium            4119
calories          4117
turkey               0
fourth of july       0
fritter              0
frittata             0
friendsgiving        0
freezer food         0
dtype: int64

In [6]:
raw_data.shape

(20052, 680)

In [7]:
data = raw_data.drop(columns=['title', 'calories', 'protein', 'fat', 'sodium'])
data.isnull().sum().sort_values(ascending=False).head(10)

turkey            0
frangelico        0
fruit             0
frozen dessert    0
fritter           0
frittata          0
friendsgiving     0
freezer food      0
freeze/chill      0
frankenrecipe     0
dtype: int64

In [8]:
data.rating.unique()

array([2.5  , 4.375, 3.75 , 5.   , 3.125, 1.875, 0.   , 1.25 ])

In [9]:
data['rating_is_high'] = np.where(data.rating >=3.75, 1, 0)
data

Unnamed: 0,rating,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,30 days of groceries,advance prep required,alabama,alaska,alcoholic,...,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey,rating_is_high
0,2.500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,3.750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,5.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20047,3.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
20048,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
20049,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
20050,4.375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

X = data.drop(['rating','rating_is_high'], axis=1)
y = data.rating_is_high

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
rfc = RandomForestClassifier(min_samples_split=300, max_features=30, n_jobs=-1)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=30,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=300,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [12]:
rfc.score(X_train, y_train)

0.8126051991771086

In [13]:
rfc.score(X_test, y_test)

0.8135128396908502

In [14]:
best_features = pd.Series(rfc.feature_importances_, index=X.columns).sort_values(ascending=False).head(30).index

In [15]:
best_features

Index(['drink', 'alcoholic', 'bon appétit', 'house & garden', 'gin',
       'harpercollins', 'gourmet', 'cocktail', 'condiment', 'cocktail party',
       'vegan', 'roast', 'bake', 'sauce', 'rum', 'weelicious', 'sauté',
       'yogurt', 'spirit', 'non-alcoholic', 'lime juice', 'fall', 'bitters',
       'kid-friendly', 'condiment/spread', '3-ingredient recipes', 'winter',
       'chile pepper', 'low carb', 'tree nut'],
      dtype='object')

In [16]:
X_train_best = X_train[best_features]
X_test_best = X_test[best_features]

svc = SVC()
svc.fit(X_train_best, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [17]:
svc.score(X_train_best, y_train)

0.8149741287949629

In [18]:
svc.score(X_test_best, y_test)

0.8142607828471703

In [19]:
from sklearn.metrics import classification_report
y_test_pred = svc.predict(X_test_best)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.67      0.12      0.20       794
           1       0.82      0.99      0.89      3217

    accuracy                           0.81      4011
   macro avg       0.75      0.55      0.55      4011
weighted avg       0.79      0.81      0.76      4011



In [22]:
from sklearn.model_selection import cross_val_score

cross_val_score(svc, X_train_best, y_train, cv=5)

array([0.80585852, 0.80299252, 0.80766833, 0.80081047, 0.80829177])