# Data

In [None]:
# Load the recipe table.
raw_data = pd.read_csv('epi_r.csv')

# Encode each rating value as an integer code equal to its position in ascending order.
z = {
    level: code
    for code, level in enumerate([0.0, 1.25, 1.875, 2.5, 3.125, 3.75, 4.375, 5.0])
}
# Reverse lookup: integer code -> rating value.
z_inv = dict(zip(z.values(), z.keys()))
# Ratings that are not keys of `z` map to NaN.
raw_data["rating_cat"] = raw_data["rating"].map(z)

In [None]:
# First, let's count how many recipes there are at each rating value.
raw_data["rating"].value_counts()

In [None]:
# Same tally on the integer-coded ratings.
raw_data.rating_cat.value_counts()

In [None]:
# Gaps between consecutive rating levels.
np.diff([0.0, 1.25, 1.875, 2.5, 3.125, 3.75, 4.375, 5.0])

Since the ratings are actually discrete, a classifier makes more sense. Note that the rating levels are not uniformly distributed. 

First, let's add a binary feature to indicate if the recipe lists calories that are in the top 75% of all ratings, as well as group some highly correlated features.

# Split

In [None]:
# Drop incomplete rows once so that the features and the target are taken from
# exactly the same rows.
complete_rows = raw_data.dropna()
X = complete_rows.drop(['rating', 'title', 'rating_cat'], axis=1)

# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# Roughly 70% of the rows are used for training and 30% are held out.
# random_state pins the shuffle so that re-running the notebook reproduces the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, complete_rows.rating, test_size=0.30, random_state=42
)

In [None]:
# Let's see which pairs of features are extremely correlated.
cm = X_train.corr()
s = cm.unstack()
# Drop each column's correlation with itself by comparing the two index labels.
# Filtering on the value instead (`s != 1.0`) would also discard distinct columns
# that are perfectly correlated, which are exactly the pairs this cell looks for.
is_self_pair = s.index.get_level_values(0) == s.index.get_level_values(1)
so = s[~is_self_pair].sort_values(kind="quicksort", ascending=False)
# The matrix is symmetric, so every pair shows up twice: as (a, b) and as (b, a).
so[so >= 0.7]

In [None]:
# Column pairs to multiply into interaction features: the i-th entry of
# features1 is paired with the i-th entry of features2.
features1 = ['calories', 'calories', 'sodium', 'peanut free', 
             'pescatarian', 'drink', 'peanut free', 'portland', 
             'soy free', 'sodium', 'vegetarian', 'snack week', 
             'pescatarian', 'kosher', 'peanut free', 'peanut free', 
             'soy free', 'calories', 'brunch', 'kentucky', 'denver', 
             'louisiana', 'new orleans', 'lasagna']

features2 = ['sodium', 'fat', 'fat', 'soy free', 'kosher', 'alcoholic', 
             'tree nut free', 'oregon', 'tree nut free', 'protein', 
             'pescatarian', 'snack', 'soy free', 'vegetarian', 
             'pescatarian', 'kosher', 'kosher', 'protein', 'breakfast', 
             'louisville', 'omelet', 'kitchen olympics', 'louisiana', 'epi loves the microwave']

# zip() silently truncates to the shorter list, so make a mismatch loud.
assert len(features1) == len(features2), "features1 and features2 must pair up one-to-one"


def add_engineered_features(frame):
    """Return a copy of ``frame`` with the combined and interaction columns added.

    Columns are appended in this order:

    * ``source`` -- row-wise sum of ``fat``, ``calories`` and ``sodium``;
    * ``<a>_<b>`` -- product of columns ``a`` and ``b`` for every
      (features1, features2) pair;
    * ``protein_<b>`` -- product of ``protein`` with each of ``fat``,
      ``calories`` and ``sodium``.

    The input frame is not modified; working on a copy also avoids pandas'
    SettingWithCopyWarning when the input is a slice of another frame.
    """
    out = frame.copy()
    # combine calories, fat and sodium
    out["source"] = out[['fat', 'calories', 'sodium']].sum(axis=1)
    for a, b in zip(features1, features2):
        out[a + "_" + b] = out[a] * out[b]
    # NOTE(review): protein_calories and protein_sodium hold the same values as
    # the calories_protein and sodium_protein columns created by the loop above;
    # they are kept so the resulting column set is unchanged.
    for b in ['fat', 'calories', 'sodium']:
        out['protein_' + b] = out['protein'] * out[b]
    return out


# Apply the identical transformation to both halves of the split.
X_train = add_engineered_features(X_train)
X_test = add_engineered_features(X_test)

# Feature Selection  

We will first predict how likely the recipe is to be rated high (> 2.5) or low (<= 2.5), and given this predicted probability we can feed the data to a further model (a 2-layer approach).

**High or Low rating?** (low: rating <= 2.5)

In [None]:
# One-hot encode any non-numeric columns, then cast every column to int.
# Note that astype(int) truncates the fractional part of float columns.
X_train_dummy = pd.get_dummies(X_train).astype(int)

# Target: True where the rating is at most 2.5 ("low"), False otherwise.
y_low = y_train.le(2.5)

# Pipeline: keep the 30 columns with the best ANOVA F-score, then fit a linear SVC.
anova_low = SelectKBest(score_func=f_classif, k=30)
svc_low = SVC(kernel='linear')
anova_svc = make_pipeline(anova_low, svc_low)

In [None]:
# Fit the selector + linear SVC pipeline on the whole training frame.
anova_svc.fit(X_train_dummy, y_low)

# Translate the fitted selector's support indices back into column names.
features_low = anova_low.get_support(indices=True)
selected_features_low = X_train_dummy.columns[features_low].tolist()

# 5-fold cross-validation of the full pipeline.
scores_low = cross_val_score(anova_svc, X_train_dummy, y_low, cv=5)

# Report the outcomes.
print('Cross-Validation Scores: {}'.format(scores_low))
print('Cross-Validation Score Averaged Across Folds: {:.2%}.\n'.format(scores_low.mean()))
print('Selected Features: {}\n'.format(selected_features_low))

In [None]:
# Fit a default SVC using only the columns chosen by the ANOVA selector.
# The variable name is spelled with the digit "1" (svc_1ow); it is kept as-is
# because the evaluation cell further down refers to it by that name.
# NOTE(review): the selector was fitted on the int-cast X_train_dummy, whereas
# this model is fitted on the un-cast X_train columns.
svc_1ow = SVC()
X_low = X_train[selected_features_low]
svc_1ow.fit(X_low, y_low)

# 5-fold cross-validation on the same column subset.
scores_low = cross_val_score(svc_1ow, X_low, y_low, cv=5)
print('Cross-Validation Scores: {}'.format(scores_low))
print('Cross-Validation Score Averaged Across Folds: {:.2%}.\n'.format(scores_low.mean()))

# In-sample predictions, stored as 0/1 integers.
y_pred_train = svc_1ow.predict(X_low).astype(int)

This model simply predicts if the given recipe will be rated high or low. Let's first check how it fits the training set, and then see how it performs on the test set.  

In [None]:
# Classification report on the training set (1 = rating <= 2.5, 0 = otherwise).
y_true_train = (y_train <= 2.5).astype(int)
results_train = metrics.classification_report(y_true=y_true_train, y_pred=y_pred_train)
print(results_train)
# TODO: add an AUC. metrics.auc(x, y) integrates an arbitrary curve given its points;
# for a ROC AUC, metrics.roc_auc_score needs continuous scores (for example the
# classifier's decision_function output) rather than hard 0/1 predictions.
metrics.accuracy_score(y_true=y_true_train, y_pred=y_pred_train)

### Evaluation

In [None]:
# Predict the held-out rows with the SVC fitted on the selected columns.
# Unlike y_pred_train, these predictions are not cast with astype(int); they keep
# the dtype of the labels the classifier was fitted on.
y_pred_test = svc_1ow.predict(X_test[selected_features_low])

# Classification report on the test set (1 = rating <= 2.5, 0 = otherwise).
y_true_test = (y_test <= 2.5).astype(int)
results_test = metrics.classification_report(y_true=y_true_test, y_pred=y_pred_test)
print(results_test)
metrics.accuracy_score(y_true=y_true_test, y_pred=y_pred_test)


This classification task seems quite limited (not practical), so let's open it up to predict each rating level.

Let's use LASSO (L1-penalized) logistic regression to identify features.  

## LASSO Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised multinomial logistic regression on the integer rating codes.
# Candidate solvers: 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'; 'saga' is used here.
# 'saga' is a stochastic solver, so random_state is fixed to make the fitted
# coefficients reproducible across runs.
# NOTE(review): the features are not scaled and max_iter is left at its default,
# so the solver may stop before converging -- consider standardising the inputs.
# NOTE(review): the multi_class argument is deprecated in recent scikit-learn
# releases; it is kept for compatibility with the version this notebook targets.
lassoregr = LogisticRegression(
    penalty='l1', solver='saga', multi_class='multinomial', random_state=42
)
lassoregr.fit(X_train, y_train.map(z))

# One row per feature, one column per fitted class. Labelling the columns with
# classes_ makes explicit which rating code each column belongs to, without
# assuming that column position equals rating code.
coeffs = pd.DataFrame(
    lassoregr.coef_.transpose(),
    index=X_train.columns,
    columns=lassoregr.classes_,
)
coeffs.head()

It would be easier to visualize if we first converted the above matrix into rankings (along each column). Say we wanted to optimize the model to perform well on recipes with a 3.125 rating; then we would take the top k features from the 5th column (index 4).

In [None]:
# Rank the features by their coefficient in the column at position 4 (rating
# code 4, i.e. 3.125, when every rating code is present in training) and keep 29.
# NOTE(review): sort_values() sorts ascending, so this keeps the 29 most
# negative coefficients rather than the largest ones -- confirm that this is
# the intended ranking.
new_features = coeffs.iloc[:,4].sort_values().index[:29]
print(new_features)

# Integer-coded training target, computed once and reused below.
y_train_cat = y_train.map(z)

# probability=True adds an internally cross-validated probability calibration,
# which is randomised; random_state makes predict_proba reproducible.
svc_new = SVC(probability=True, random_state=42)
svc_new.fit(X_train[new_features], y_train_cat)

scores_new = cross_val_score(svc_new, X_train[new_features], y_train_cat, cv=5)
print('Cross-Validation Scores: {}'.format(scores_new))
print('Cross-Validation Score Averaged Across Folds: {:.2%}.\n'.format(scores_new.mean()))

p_pred_new = svc_new.predict_proba(X_test[new_features])

# The columns of predict_proba follow svc_new.classes_, so argmax yields a
# column position, not a label. Map it back through classes_ so the prediction
# stays correct even if some rating code is missing from the training target.
y_pred_new = svc_new.classes_[p_pred_new.argmax(axis=1)]
# Classification report
results_new = metrics.classification_report(y_true=y_test.map(z), y_pred=y_pred_new)
print(results_new)

Whoa, this set of features appears to improve on all classes! Let's see if we can get better model performance by using a random forest to select the same number of features.

## Random Forest  