In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import accuracy_score,mean_squared_error,confusion_matrix,f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import (RandomForestClassifier,RandomForestRegressor, ExtraTreesClassifier,
                              BaggingClassifier, AdaBoostRegressor)
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier


import warnings

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix,accuracy_score, f1_score, classification_report


In [2]:
# Read the prepared USDA + Instacart table from disk. The column pandas labels
# 'Unnamed: 0' (a header-less column in the CSV) is used as the row index.
final_csv_path = './prepared_data/USDA_Insta_Final_Data.csv'
usda_instacart_final_df = pd.read_csv(final_csv_path, index_col='Unnamed: 0')

FileNotFoundError: [Errno 2] File b'./USDA_Insta_Final_Data.csv' does not exist: b'./USDA_Insta_Final_Data.csv'

In [None]:
# (n_rows, n_columns) of the loaded table
usda_instacart_final_df.shape

In [None]:
# column labels available in the loaded table
usda_instacart_final_df.columns

In [None]:
# Columns removed before modelling. Each label appears exactly once (the
# original list repeated 'product_id' and 'product_name'), and only `columns=`
# is passed because `axis=1` is redundant alongside it.
# NOTE: this cell rebinds `usda_instacart_final_df`, so running it a second
# time without reloading the CSV raises KeyError (the columns are already gone).
columns_to_drop = [
    # identifier and descriptive columns
    'order_id', 'user_id', 'product_id', 'department_id', 'product_name',
    'aisle_id', 'aisle', 'department', 'eval_set', 'Long_Desc',
    # nutrient columns left out of the feature set
    'Calcium, Ca (mg)',
    'Energy (kcal)',
    'Fatty acids, total monounsaturated (g)',
    'Fatty acids, total polyunsaturated (g)',
    'Fatty acids, total saturated (g)',
    'Fatty acids, total trans (g)',
    'Fiber, total dietary (g)',
    'Iron, Fe (mg)',
    'Sodium, Na (mg)',
    'Water (g)',
]
usda_instacart_final_df = usda_instacart_final_df.drop(columns=columns_to_drop)
usda_instacart_final_df.head()

In [None]:
# Separate the target from the predictors: `reordered` is the label and
# every remaining column is used as a feature.
y = usda_instacart_final_df['reordered']
X = usda_instacart_final_df.drop(columns='reordered')

In [None]:
# base line: share of each `reordered` class; the largest share is the
# accuracy a constant majority-class guess would reach
y.value_counts(normalize = True)

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Logistic regression with scikit-learn's default settings.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# .score() on a classifier reports mean accuracy.
lr_train_acc = lr.score(X_train, y_train)
lr_test_acc = lr.score(X_test, y_test)
print("train score:", lr_train_acc)
print("test score:", lr_test_acc)

In [None]:
# Test-set evaluation of the logistic regression.
y_pred = lr.predict(X_test)

# Build the confusion matrix once and reuse it for the printout and for the
# unpacked cell counts.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)
print("accuracy:", accuracy_score(y_test, y_pred))

# Unpacking into four names relies on a 2x2 (binary) matrix, flattened row by row.
tn, fp, fn, tp = conf_mat.ravel()
print("tn, fp, fn, tp:", tn, fp, fn, tp)
print("f1 score:",f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Bagging ensemble with scikit-learn's default base estimator.
# Bagging draws bootstrap samples at random, so the seed is fixed (same value
# as the other models in this notebook) to make the scores reproducible.
bag = BaggingClassifier(random_state=42)  # instantiate the model
bag.fit(X_train, y_train)                 # fit the model
print("train score:", bag.score(X_train, y_train))
print("test score:", bag.score(X_test, y_test))

In [None]:
# Test-set evaluation of the bagging classifier.
y_pred_bag = bag.predict(X_test)

# Build the confusion matrix once; print it, then unpack its four cells
# (this relies on a 2x2 / binary matrix).
conf_mat_bag = confusion_matrix(y_test, y_pred_bag)
print(conf_mat_bag)
tn, fp, fn, tp = conf_mat_bag.ravel()
print("tn, fp, fn, tp:", tn, fp, fn, tp)

print("accuracy:", accuracy_score(y_test, y_pred_bag))
print("f1 score:",f1_score(y_test, y_pred_bag))
print(classification_report(y_test, y_pred_bag))

In [None]:
# Decision Tree Classifier, capped at depth 5 to limit overfitting.
# The seed is fixed so tie-breaking between equally good splits is repeatable.
tree = DecisionTreeClassifier(max_depth=5, random_state=42)  # instantiate the model
tree.fit(X_train, y_train)                                   # fit the model
print("train score:", tree.score(X_train, y_train))
# This line scores the held-out set, so it is labelled "test score"
# (it was previously printed under the label "train score").
print("test score:", tree.score(X_test, y_test))

In [None]:
# Test-set evaluation of the decision tree.
y_pred_tree = tree.predict(X_test)

# Build the confusion matrix once; print it, then unpack its four cells
# (this relies on a 2x2 / binary matrix).
conf_mat_tree = confusion_matrix(y_test, y_pred_tree)
print(conf_mat_tree)
tn, fp, fn, tp = conf_mat_tree.ravel()
print("tn, fp, fn, tp:", tn, fp, fn, tp)

print("accuracy:", accuracy_score(y_test, y_pred_tree))
print("f1 score:",f1_score(y_test, y_pred_tree))
print(classification_report(y_test, y_pred_tree))

In [None]:
# Random forest with default hyper-parameters and a fixed seed.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# .score() on a classifier reports mean accuracy.
rfc_train_acc = rfc.score(X_train, y_train)
rfc_test_acc = rfc.score(X_test, y_test)
print("train score:", rfc_train_acc)
print("test score:", rfc_test_acc)

In [None]:
# Tune the random forest over ensemble size and tree depth:
# 4 n_estimators values x 6 max_depth values = 24 candidates, each scored with 2-fold CV.
params = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': [None, 2, 3, 4, 10, 11],
}
grid = GridSearchCV(rfc, param_grid=params, cv=2, verbose=1)
grid.fit(X_train, y_train)

print("grid.best_score_:", grid.best_score_)
print("grid.best_params_:", grid.best_params_)

In [None]:
# Test-set evaluation of the random forest held in `rfc`.
# NOTE(review): these predictions come from `rfc` itself, not from
# grid.best_estimator_ — confirm the un-tuned model is the one meant to be reported.
y_pred_rfc = rfc.predict(X_test)

# Build the confusion matrix once; print it, then unpack its four cells
# (this relies on a 2x2 / binary matrix).
conf_mat_rfc = confusion_matrix(y_test, y_pred_rfc)
print(conf_mat_rfc)
tn, fp, fn, tp = conf_mat_rfc.ravel()
print("tn, fp, fn, tp:", tn, fp, fn, tp)

print("accuracy:", accuracy_score(y_test, y_pred_rfc))
print("f1 score:",f1_score(y_test, y_pred_rfc))
print(classification_report(y_test, y_pred_rfc))

In [None]:
# ROC points (fpr, tpr, thresholds) for the random forest in `rfc`.
# roc_curve sweeps decision thresholds over a continuous score, so it is given
# the predicted probability of the second class (column 1 of predict_proba).
# Passing hard 0/1 predictions would collapse the curve to a single operating point.
roc_curve(y_test, rfc.predict_proba(X_test)[:, 1])

In [None]:
# ROC curve for the best estimator found by the grid search.
# The curve is computed from the predicted probability of the second class
# (column 1 of predict_proba), evaluated at every threshold roc_curve derives.
probs = grid.best_estimator_.predict_proba(X_test)
preds = probs[:, 1]
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)  # area under the ROC curve

# Draw the curve with matplotlib (`plt` comes from the notebook's import cell,
# so it is not re-imported here).
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')  # diagonal = performance of random guessing
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()


In [None]:
# Accuracy of AdaBoost as the ensemble grows from 1 to 30 weak learners.
#
# Boosting adds learners one after another, so with a fixed random_state a
# single 30-round fit contains every smaller ensemble as a prefix.
# staged_score() replays those prefixes and yields the accuracy after each
# round, which avoids refitting 30 separate models (1 + 2 + ... + 30 learners).
# AdaBoostClassifier is already imported in the notebook's import cell.
ada = AdaBoostClassifier(n_estimators=30, random_state=42)  # instantiate the model
ada.fit(X_train, y_train)                                    # fit the model

scores_test = list(ada.staged_score(X_test, y_test))
scores_train = list(ada.staged_score(X_train, y_train))

# One x-value per boosting round actually performed (boosting can stop early,
# in which case fewer than 30 stages are produced).
n_estimators = list(range(1, len(scores_test) + 1))

In [None]:
# Accuracy of the 30-estimator AdaBoost model left in `ada` by the previous cell.
ada_test_acc = ada.score(X_test, y_test)
ada_train_acc = ada.score(X_train, y_train)
print("test score:", ada_test_acc)
print("train score:", ada_train_acc)

In [None]:
# NOTE(review): disabled experiment, kept for reference only. As written it
# fits on the full X, y (train and test rows together), so any score taken on
# X_test afterwards would be optimistic. Delete it, or refit on X_train / y_train
# before re-enabling.
#
# Create adaboost-decision tree classifier
# abc = AdaBoostClassifier(n_estimators=30,
#                          learning_rate=1,
#                          random_state=42)
# # Train model
# model = abc.fit(X, y)

In [None]:
# Accuracy versus number of AdaBoost estimators, for the test and train sets
# (this is a learning-curve style plot, not a ROC curve).
for curve, curve_label in ((scores_test, 'Test scores'),
                           (scores_train, 'Train scores')):
    plt.plot(n_estimators, curve, label=curve_label)

plt.xlabel('# Estimators')
plt.ylabel('Accuracy')
plt.legend()

In [None]:
# hard class predictions of the AdaBoost model on the held-out rows
y_pred_ada = ada.predict(X_test)

In [None]:
# Model accuracy: the fraction of test rows whose predicted label matches the true label.
print("Accuracy:", accuracy_score(y_test, y_pred_ada))

In [None]:
# Remaining test-set metrics for the AdaBoost predictions.
# Build the confusion matrix once; print it, then unpack its four cells
# (this relies on a 2x2 / binary matrix).
conf_mat_ada = confusion_matrix(y_test, y_pred_ada)
print(conf_mat_ada)
tn, fp, fn, tp = conf_mat_ada.ravel()
print("tn, fp, fn, tp:", tn, fp, fn, tp)

print("f1 score:",f1_score(y_test, y_pred_ada))
print(classification_report(y_test, y_pred_ada))