In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import joblib
from tqdm.notebook import tqdm
import itertools

from scipy.cluster.hierarchy import linkage, dendrogram

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import silhouette_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.metrics import f1_score

from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import OneHotEncoder

from sklearn.multiclass import OneVsOneClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import VotingRegressor 
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import StackingRegressor

from sklearn.ensemble import VotingClassifier 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
# Load the raw table; the cells below use its food-tag columns and its `rating` column.
df = pd.read_csv('../datasets/epi_r.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Keep only rows without missing values.
# NOTE(review): this drops a row if *any* column is missing, including columns that are
# not used as features below — confirm that is intended.
df = df.dropna()

In [None]:
# Names of the food-tag columns of `df` that are used as model features.
list_of_food = [
    'almond', 'amaretto', 'anchovy', 'anise', 'apple', 'apricot', 'artichoke', 'arugula', 'asparagus', 'avocado',
    'bacon', 'banana', 'barley', 'basil', 'beef', 'beet', 'bell pepper', 'berry', 'blackberry', 'blue cheese',
    'blueberry', 'bok choy', 'bran', 'bread', 'brie', 'broccoli', 'bulgur', 'burrito', 'butter', 'buttermilk',
    'butternut squash', 'cabbage', 'candy', 'cantaloupe', 'capers', 'carrot', 'cashew', 'cauliflower', 'caviar',
    'celery', 'cheddar', 'cheese', 'cherry', 'chestnut', 'chicken', 'chickpea', 'chile pepper', 'chili', 'chive',
    'chocolate', 'coconut', 'cod', 'coriander', 'corn', 'crab', 'cranberry', 'cream cheese', 'cucumber', 'curry',
    'custard', 'dairy', 'date', 'duck', 'egg', 'eggplant', 'endive', 'fennel', 'feta', 'fig', 'fish', 'garlic',
    'goat cheese', 'gouda', 'grape', 'grapefruit', 'green bean', 'green onion/scallion', 'ham', 'hamburger',
    'hazelnut', 'honey', 'hummus', 'ice cream', 'jalapeño', 'kale', 'kiwi', 'lamb', 'lemon', 'lentil', 'lettuce',
    'lima bean', 'lime', 'lobster', 'macaroni and cheese', 'mango', 'maple syrup', 'mayonnaise', 'meatball',
    'melon', 'mint', 'mushroom', 'mussel', 'mustard', 'nutmeg', 'oatmeal', 'olive', 'omelet', 'onion', 'orange',
    'oregano', 'oyster', 'pancake', 'papaya', 'paprika', 'parmesan', 'parsley', 'parsnip', 'pasta', 'peanut',
    'pear', 'pecan', 'pepper', 'persimmon', 'pineapple', 'pistachio', 'pizza', 'plum', 'pomegranate', 'pork',
    'potato', 'poultry', 'prosciutto', 'prune', 'pumpkin', 'quail', 'quinoa', 'radish', 'raisin', 'raspberry',
    'rice', 'ricotta', 'rosemary', 'salmon', 'salsa', 'sausage', 'scallop', 'seafood', 'sesame', 'shallot',
    'shrimp', 'spinach', 'squash', 'steak', 'strawberry', 'sugar snap pea', 'sweet potato/yam', 'swiss cheese',
    'tangerine', 'tapioca', 'tarragon', 'tea', 'thyme', 'tilapia', 'tofu', 'tomato', 'trout', 'tuna', 'turnip',
    'vanilla', 'veal', 'vegetable', 'walnut', 'wasabi', 'watermelon', 'wild rice', 'yellow squash', 'yogurt',
    'zucchini'
]

In [None]:
# Features: the food-tag columns named in `list_of_food`.
X = df.loc[:, list_of_food]
# Target: kept as a one-column DataFrame because later cells add a 'predict' column to copies of it.
y = df.loc[:, ['rating']]

In [None]:
# Display the feature matrix as the cell output.
X

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 21,
)

<h1> Regression </h1>

In [None]:
class tqdmGridSearchCV(GridSearchCV):
    """GridSearchCV that shows a tqdm progress bar, advancing once per parameter candidate."""

    def _run_search(self, evaluate_candidates):
        """Submit the candidates of ``self.param_grid`` one at a time, wrapped in a tqdm bar."""
        candidates = ParameterGrid(self.param_grid)
        for candidate in tqdm(candidates):
            # A single-element list per call is what lets the bar tick per candidate.
            evaluate_candidates([candidate])

In [None]:
def choose(grids, grid_dict, X_train, X_test, y_train, y_test):
    """Fit every grid search and report the regressor with the lowest hold-out RMSE.

    Parameters
    ----------
    grids : iterable
        Grid-search objects; each is fitted here and must then expose
        ``best_estimator_``, ``best_params_`` and ``best_score_``.
        The cross-validation score is printed as an RMSE, which assumes the
        searches use ``scoring = 'neg_root_mean_squared_error'``.
    grid_dict : dict
        Maps each grid-search object to the display name used in the report.
    X_train, y_train
        Data the grid searches are fitted on.
    X_test, y_test
        Hold-out data used to compare the tuned models.

    Returns
    -------
    The display name (value of ``grid_dict``) of the model with the lowest hold-out RMSE.
    """
    scores = {}
    for grid in grids:
        grid.fit(X_train, y_train)
        y_pred = grid.best_estimator_.predict(X_test)
        name = grid_dict[grid]

        # Report the hold-out error on the same scale (RMSE) as the cross-validation
        # score; previously the MSE was printed next to an RMSE and labelled "accuracy".
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        print(f'Estimator: {name}')
        print(f'Best params: {grid.best_params_}')
        # best_score_ is a negated error, hence the absolute value.
        print(f'Best cross-validation RMSE: {np.abs(grid.best_score_)}')
        print(f'Test set RMSE for best params: {rmse}')
        print()

        scores[name] = rmse

    # Lower RMSE is better, so rank ascending.
    ranked = sorted(scores.items(), key = lambda item: item[1])
    print(f'Regressor with the best RMSE: {ranked[0][0]}')
    return ranked[0][0]

In [None]:
# Hyperparameter grids for the three regressors compared below.
lin_reg_param = {
    'fit_intercept': [True, False],
    'positive': [True, False],
}
tree_reg_param = {
    'max_depth': np.arange(1, 10, 1),
    'min_samples_split': np.arange(2, 5, 1),
    'min_samples_leaf': np.arange(1, 5, 1),
}
randf_reg_param = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': np.arange(1, 10, 1),
    'min_samples_split': np.arange(2, 5, 1),
    'min_samples_leaf': np.arange(1, 5, 1),
}

# One progress-bar grid search per model, all scored by (negated) RMSE.
lr = tqdmGridSearchCV(
    estimator = LinearRegression(),
    param_grid = lin_reg_param,
    scoring = 'neg_root_mean_squared_error',
    n_jobs = -1,
)
tr = tqdmGridSearchCV(
    estimator = DecisionTreeRegressor(random_state = 21),
    param_grid = tree_reg_param,
    scoring = 'neg_root_mean_squared_error',
    n_jobs = -1,
)
rfr = tqdmGridSearchCV(
    estimator = RandomForestRegressor(random_state = 21),
    param_grid = randf_reg_param,
    scoring = 'neg_root_mean_squared_error',
    n_jobs = -1,
)

# Display names for the report; the dict's insertion order is also the fitting order.
grid_dict = {lr: 'LinearRegression', tr: 'DecisionTreeRegressor', rfr: 'RandomForestRegressor'}
grids = list(grid_dict)

choose(grids, grid_dict, X_train, X_test, y_train, y_test)

<h1> Ensembles </h1>

In [None]:
def print_metrics(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print MSE, RMSE and R2 measured on the test data.

    ``model`` is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f'MSE is {mse:.5f}')
    print(f'RMSE is {np.sqrt(mse):.5f}')
    print(f'R2 is {r2:.5f}')

In [None]:
# Reuse the tuned regressors found by the grid searches above as voting members.
lr_b = lr.best_estimator_
tr_b = tr.best_estimator_
rfr_b = rfr.best_estimator_

voting_estimators = [('LinearRegression', lr_b), ('DecisionTreeRegressor', tr_b), ('RandomForestRegressor', rfr_b)]

# itertools.combinations only yields strictly increasing triples ((1, 2, 3), (1, 2, 4), ...),
# so the first estimator could never outweigh the last one and equal weights were never tried.
# product() enumerates every assignment; triples with a common factor (e.g. (2, 2, 2)) are
# skipped because they describe the same weighting as their reduced form.
# This grid is larger than before; narrow the value list if the search takes too long.
voting_params = {
    'weights': [
        w for w in itertools.product([1, 2, 3, 4, 5], repeat = 3)
        if math.gcd(math.gcd(w[0], w[1]), w[2]) == 1
    ]
}

model_voting = tqdmGridSearchCV(
    estimator = VotingRegressor(estimators = voting_estimators),
    param_grid = voting_params,
    cv = 5,
    n_jobs = -1,
    scoring = 'neg_root_mean_squared_error',
)
print_metrics(model_voting, X_train, y_train, X_test, y_test)
print(f'{model_voting.best_params_}')

In [None]:
# Bagging over a single decision tree; only the number of bagged trees is searched.
# NOTE(review): the tree hyperparameters are hard-coded — presumably copied from an earlier
# grid-search run; confirm they still match `tr.best_params_`.
p = {'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 2}
tr_b = DecisionTreeRegressor(random_state = 21, **p)

bagging_params = {'n_estimators': [5, 10, 50, 100]}

model_bagging = tqdmGridSearchCV(
    estimator = BaggingRegressor(estimator = tr_b, n_jobs = -1, random_state = 21),
    param_grid = bagging_params,
    cv = 5,
    n_jobs = -1,
    scoring = 'neg_root_mean_squared_error',
)
print_metrics(model_bagging, X_train, y_train, X_test, y_test)
print(f'{model_bagging.best_params_}')

In [None]:
# Base learners for stacking.
# NOTE(review): the tree / forest hyperparameters are hard-coded — presumably taken from an
# earlier grid-search run; confirm.
lr_b = LinearRegression(n_jobs = -1)

p = {'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 2}
tr_b = DecisionTreeRegressor(random_state = 21, **p)

p = {'max_depth': 9, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
rfr_b = RandomForestRegressor(random_state = 21, **p)

stacking_estimators = [
    ('LinearRegression', lr_b),
    ('DecisionTreeRegressor', tr_b),
    ('RandomForestRegressor', rfr_b),
]

# A ridge regression (regularisation strength chosen by its own 5-fold CV) combines the base learners.
model_stacking = StackingRegressor(
    estimators = stacking_estimators,
    final_estimator = RidgeCV(cv = 5, scoring = 'neg_root_mean_squared_error'),
    cv = 5,
    n_jobs = -1,
)
print_metrics(model_stacking, X_train, y_train, X_test, y_test)

<h1> Naive regression </h1>

In [None]:
# Naive baseline: always predict the mean rating.
# The constant comes from the *training* targets so that no information from the test set
# leaks into the baseline it is evaluated on.
y_naive = y_test.copy()
y_naive['predict'] = y_train['rating'].mean()
# RMSE of the constant prediction on the test set (shown as the cell output).
np.sqrt(((y_naive['rating'] - y_naive['predict']) ** 2).mean())

<h1> Classifier </h1>

Binarize the target column by rounding the ratings to the closest integer. These will be your classes.
Try different algorithms and their hyperparameters for class prediction. Choose the best on cross-validation and find the score (accuracy) on the test subsample.
Compare the metrics using accuracy. Calculate the accuracy of a naive classifier that predicts the most common class.
Binarize the target column again by converting the integers to classes ‘bad’ (0, 1), ‘so-so’ (2, 3), ‘great’ (4, 5).
Try different algorithms and their hyperparameters for class prediction. Choose the best on cross-validation and find the score on the test subsample.
Compare the metrics using accuracy. Calculate the accuracy of a naive classifier that predicts the most common class.
What is worse: to predict a bad rating for a recipe that is good in real life, or to predict a good rating for a recipe that is bad in real life? Replace accuracy with the appropriate metric.
Try different algorithms and their hyperparameters for class prediction with the new metric. Choose the best and find the score on the test subsample.
Try different ensembles and their hyperparameters. Choose the best and find the score on the test subsample.

In [None]:
# Turn the continuous rating into integer classes.
# NOTE(review): pandas/NumPy rounding sends exact halves to the nearest even integer
# (2.5 -> 2, 3.5 -> 4), which differs from "round half up" — confirm this is the intended
# class assignment.
y = y[['rating']].round(0)
y['rating'] = y['rating'].astype(int)
# Class distribution (shown as the cell output).
y['rating'].value_counts()

In [None]:
# Re-split for the integer-class target; `stratify` keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 21,
    stratify = y,
)

In [None]:
def choose(grids, grid_dict, X_train, X_test, y_train, y_test):
    """Fit every grid search and report the classifier with the highest hold-out accuracy.

    Parameters
    ----------
    grids : iterable
        Grid-search objects; each is fitted here and must then expose
        ``best_estimator_``, ``best_params_`` and ``best_score_``.
    grid_dict : dict
        Maps each grid-search object to the display name used in the report.
    X_train, y_train
        Data the grid searches are fitted on.
    X_test, y_test
        Hold-out data used to compare the tuned models.

    Returns
    -------
    The display name (value of ``grid_dict``) of the model with the highest hold-out accuracy.
    """
    scores = {}
    for grid in grids:
        grid.fit(X_train, y_train)
        y_pred = grid.best_estimator_.predict(X_test)
        name = grid_dict[grid]

        # Computed once and reused for both the report and the ranking.
        test_accuracy = accuracy_score(y_test, y_pred)

        print(f'Estimator: {name}')
        print(f'Best params: {grid.best_params_}')
        print(f'Best training accuracy: {np.abs(grid.best_score_)}')
        print(f'Validation set accuracy score for best params: {test_accuracy}')
        print()

        scores[name] = test_accuracy

    # Higher accuracy is better, so rank descending. The previous ascending sort
    # reported the *worst* model as the best one.
    ranked = sorted(scores.items(), key = lambda item: item[1], reverse = True)
    print(f'Classifier with the best accuracy: {ranked[0][0]}')
    return ranked[0][0]

In [None]:
# Hyperparameter grids for the three classifiers compared below.
# `gamma` is ignored by the linear kernel, so it is varied only for rbf / sigmoid; crossing it
# with 'linear' fitted every linear candidate twice for no benefit.
svm_params = [
    {
        'kernel': ['linear'],
        'C': [0.01, 0.1, 1, 1.5, 5, 10],
        'class_weight': ['balanced', None],
    },
    {
        'kernel': ['rbf', 'sigmoid'],
        'C': [0.01, 0.1, 1, 1.5, 5, 10],
        'gamma': ['scale', 'auto'],
        'class_weight': ['balanced', None],
    },
]
tree_params = {
    'max_depth': list(range(1, 50)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
}
rf_params = {
    'max_depth': list(range(1, 50)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'n_estimators': [5, 10, 50, 100],
}

# `probability = True` is intentionally not set on the SVC: it runs an extra internal
# cross-validation on every fit, and the accuracy scorer only needs `predict`.
gs_svm = tqdmGridSearchCV(
    estimator = SVC(random_state = 21),
    param_grid = svm_params,
    scoring = 'accuracy',
    n_jobs = -1,
)
gs_tree = tqdmGridSearchCV(
    estimator = DecisionTreeClassifier(random_state = 21),
    param_grid = tree_params,
    scoring = 'accuracy',
    n_jobs = -1,
)
gs_rf = tqdmGridSearchCV(
    estimator = RandomForestClassifier(random_state = 21),
    param_grid = rf_params,
    scoring = 'accuracy',
    n_jobs = -1,
)

grids = [gs_svm, gs_tree, gs_rf]

# Display names used in the report printed by `choose`.
grid_dict = {gs_svm: 'SVM', gs_tree: 'DecisionTreeClassifier', gs_rf: 'RandomForestClassifier'}

choose(grids, grid_dict, X_train, X_test, y_train, y_test)

Estimator: SVM
Best params: {'C': 1, 'kernel': 'rbf'}
Best training accuracy: 0.6757545567321572
Validation set accuracy score for best params: 0.6750709108099591

Estimator: DecisionTreeClassifier
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 1}
Best training accuracy: 0.6737057924742806
Validation set accuracy score for best params: 0.6738102741884652

Estimator: RandomForestClassifier
Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 38, 'n_estimators': 100}
Best training accuracy: 0.6802458790232498
Validation set accuracy score for best params: 0.6760163882760795

<h3> naive </h3>

In [None]:
# Naive baseline: always predict the most common class.
# The class is picked from the *training* labels so the baseline does not look at the test labels.
y_naive = y_test.copy()
y_naive['predict'] = y_train['rating'].mode()[0]
# Accuracy of the constant prediction on the test set (shown as the cell output).
accuracy_score(y_naive['rating'], y_naive['predict'])

<h3> binarize from int to classes </h3>

In [None]:
# Map the rounded integer ratings onto the three classes 'bad' (<= 1), 'so-so' (2-3), 'great' (4-5).
# The integers are rebuilt from the raw `df` ratings instead of being read back from `y`, so
# re-running this cell does not try to compare already-converted string labels with numbers.
rating_int = df['rating'].round(0).astype(int)
y = rating_int.case_when([
    (rating_int <= 1, 'bad'),
    (rating_int.between(2, 3), 'so-so'),
    (rating_int.between(4, 5), 'great'),
]).to_frame(name = 'rating')

In [None]:
# Display the relabelled target as the cell output.
y

In [None]:
# Re-split for the three-class target; `stratify` keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 21,
    stratify = y,
)

In [None]:
# Naive baseline for the three-class target: always predict the most common class.
# The class is picked from the *training* labels so the baseline does not look at the test labels.
y_naive = y_test.copy()
y_naive['predict'] = y_train['rating'].mode()[0]
# Accuracy of the constant prediction on the test set (shown as the cell output).
accuracy_score(y_naive['rating'], y_naive['predict'])

In [None]:
# Same naive baseline, scored with the weighted F1 used for model selection below.
# The class is picked from the *training* labels so the baseline does not look at the test labels.
y_naive = y_test.copy()
y_naive['predict'] = y_train['rating'].mode()[0]
# Classes that are never predicted have undefined precision; `zero_division = 0` scores them
# as 0 (the value the default already falls back to) without emitting a warning.
f1_score(y_naive['rating'], y_naive['predict'], average = 'weighted', zero_division = 0)

In [None]:
def choose(grids, grid_dict, X_train, X_test, y_train, y_test):
    """Fit every grid search and report the classifier with the highest hold-out weighted F1.

    Parameters
    ----------
    grids : iterable
        Grid-search objects; each is fitted here and must then expose
        ``best_estimator_``, ``best_params_`` and ``best_score_``.
    grid_dict : dict
        Maps each grid-search object to the display name used in the report.
    X_train, y_train
        Data the grid searches are fitted on.
    X_test, y_test
        Hold-out data used to compare the tuned models.

    Returns
    -------
    The display name (value of ``grid_dict``) of the model with the highest hold-out weighted F1.
    """
    scores = {}
    for grid in grids:
        grid.fit(X_train, y_train)
        y_pred = grid.best_estimator_.predict(X_test)
        name = grid_dict[grid]

        # Computed once and reused for both the report and the ranking.
        test_f1 = f1_score(y_test, y_pred, average = "weighted")

        print(f'Estimator: {name}')
        print(f'Best params: {grid.best_params_}')
        print(f'Best training f1_score: {np.abs(grid.best_score_)}')
        print(f'Validation set f1_score for best params: {test_f1}')
        print()

        scores[name] = test_f1

    # Higher F1 is better, so rank descending.
    ranked = sorted(scores.items(), key = lambda item: item[1], reverse = True)
    print(f'Classifier with the best f1_score: {ranked[0][0]}')
    return ranked[0][0]

In [None]:
# Grid searches for the three-class target, scored by weighted F1.
# NOTE(review): the SVM search is disabled (presumably because of its runtime — confirm);
# its two lines are kept commented out for reference.
#svm_params = {'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 1, 5, 10]}
#gs_svm = tqdmGridSearchCV(estimator = SVC(random_state = 21, probability = True), param_grid = svm_params, scoring = 'f1_weighted', n_jobs = -1)
tree_params = {
    'max_depth': list(range(1, 50)),
    'class_weight': ('balanced', None),
    'criterion': ('entropy', 'gini'),
}
rf_params = {
    'max_depth': list(range(1, 50)),
    'class_weight': ('balanced', None),
    'criterion': ('entropy', 'gini'),
    'n_estimators': [5, 10, 50, 100],
}

gs_tree = tqdmGridSearchCV(
    estimator = DecisionTreeClassifier(random_state = 21),
    param_grid = tree_params,
    scoring = 'f1_weighted',
    n_jobs = -1,
)
gs_rf = tqdmGridSearchCV(
    estimator = RandomForestClassifier(random_state = 21),
    param_grid = rf_params,
    scoring = 'f1_weighted',
    n_jobs = -1,
)

# Display names for the report; the dict's insertion order is also the fitting order.
grid_dict = {gs_tree: 'DecisionTreeClassifier', gs_rf: 'RandomForestClassifier'}
grids = list(grid_dict)

choose(grids, grid_dict, X_train, X_test, y_train, y_test)

best with f1

Estimator: SVM
Best params: {'C': 5, 'kernel': 'rbf'}
Best training f1_score: 0.7337721319736552

Estimator: DecisionTreeClassifier
Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 20}
Best training f1_score: 0.726571335535054

Estimator: RandomForestClassifier
Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 45, 'n_estimators': 100}
Best training f1_score: 0.7340618439935772

<h3> ensembles </h3>

In [None]:
def print_metrics(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and print its weighted F1 on the test data.

    ``model`` is fitted in place; nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    f1 = f1_score(y_test, predictions, average = 'weighted')
    print(f'f1_score is {f1:.5f}')

In [None]:
# Ensemble members, configured with the best parameters recorded from the F1 grid searches above.
svm_b = SVC(C = 5, kernel = 'rbf', random_state = 21)
tree_b = DecisionTreeClassifier(class_weight = None, criterion = 'gini', max_depth = 20, random_state = 21)
rfc_b = RandomForestClassifier(class_weight = None, criterion = 'gini', max_depth = 45, n_estimators = 100, random_state = 21)

voting_estimators = [('SVC', svm_b), ('DecisionTreeClassifier', tree_b), ('RandomForestClassifier', rfc_b)]

# itertools.combinations only yields strictly increasing triples ((1, 2, 3), (1, 2, 4), ...),
# so the last estimator always carried at least as much weight as the other two together and
# equal weights were never tried.
# product() enumerates every assignment; triples with a common factor (e.g. (2, 2, 2)) are
# skipped because they describe the same weighting as their reduced form.
# This grid is larger than before; narrow the value list if the search takes too long.
voting_params = {
    'weights': [
        w for w in itertools.product([1, 2, 3, 4, 5], repeat = 3)
        if math.gcd(math.gcd(w[0], w[1]), w[2]) == 1
    ]
}

In [None]:
# Search the voting weights by 5-fold CV on weighted F1, then score the refit model on the test set.
model_voting = tqdmGridSearchCV(
    estimator = VotingClassifier(estimators = voting_estimators),
    param_grid = voting_params,
    cv = 5,
    n_jobs = -1,
    scoring = 'f1_weighted',
)
print_metrics(model_voting, X_train, y_train, X_test, y_test)
print(f'{model_voting.best_params_}')

voting
f1_score is 0.73574
{'weights': (1, 2, 3)}

bagging
f1_score is 0.73907
{'n_estimators': 100}

stacking
f1_score is 0.72771


In [None]:
# Bag the tuned SVC and search over the number of bagged copies.
bag_parametrs = {'n_estimators': [5, 10, 50, 100]}
model = BaggingClassifier(estimator = svm_b, random_state = 21)
model_bagging = tqdmGridSearchCV(
    estimator = model,
    param_grid = bag_parametrs,
    scoring = 'f1_weighted',
    n_jobs = -1,
)

print_metrics(model_bagging, X_train, y_train, X_test, y_test)
print(f'{model_bagging.best_params_}')

In [None]:
# Stack the three tuned classifiers; a logistic regression combines their outputs.
stacking_estimators = [
    ('SVC', svm_b),
    ('DecisionTreeClassifier', tree_b),
    ('RandomForestClassifier', rfc_b),
]

# NOTE(review): the target has three classes; recent scikit-learn releases reportedly deprecate
# the 'liblinear' solver for multiclass problems — confirm against the installed version.
model_stacking = StackingClassifier(
    estimators = stacking_estimators,
    final_estimator = LogisticRegression(solver = 'liblinear'),
    cv = 5,
    n_jobs = -1,
)
print_metrics(model_stacking, X_train, y_train, X_test, y_test)

In [None]:
# Persist the refit bagging model to the file 'best_recipes_model' in the working directory.
joblib.dump(model_bagging.best_estimator_, 'best_recipes_model')

In [None]:
# Reload the file written above to check that the saved model is usable.
# NOTE: joblib.load unpickles the file, so only load files you created or trust.
m = joblib.load('best_recipes_model')

In [None]:
# Predict the test-set classes with the reloaded model.
y1 = m.predict(X_test)

In [None]:
# Accuracy of the reloaded model on the test set.
# Arguments are given in the (y_true, y_pred) order used by every other metric call here.
accuracy_score(y_test, y1)

In [None]:
# Weighted F1 of the reloaded model on the test set.
f1_score(y_test, y1, average = 'weighted')