# Clasification Model
#### Predict the wine 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from xgboost import XGBClassifier, plot_importance
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, classification_report, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import pickle
import time

import warnings
warnings.filterwarnings('ignore')

In [2]:
%load_ext version_information
%matplotlib inline

In [3]:
%version_information pandas, sklearn, matplotlib, xgboost

Software,Version
Python,3.7.4 64bit [GCC 7.3.0]
IPython,7.8.0
OS,Linux 5.0.0 27 generic x86_64 with debian buster sid
pandas,0.25.1
sklearn,0.21.2
matplotlib,3.1.1
xgboost,0.90
Tue Sep 10 20:40:35 2019 CEST,Tue Sep 10 20:40:35 2019 CEST


## 1.Load and prepare de data 

In [4]:
train = pd.read_pickle('Data/wine_train.pkl')
test = pd.read_pickle('Data/wine_test.pkl')
X_train = train.drop(columns=['color', 'quality'])
y_train = train[['quality']]
X_test = test.drop(columns=['color', 'quality'])
y_test = test[['quality']]

## 2. Instance our models

In [7]:
dt = DecisionTreeClassifier()
lr = LogisticRegression()
kn = KNeighborsClassifier()
svc = SVC()
rf = RandomForestClassifier()
xgb = XGBClassifier()
bg = BaggingClassifier()

#our scaler too
scaler = StandardScaler()

## 3. Create pipelines

In [8]:
pipeline_dt = Pipeline(steps=[("scaler", scaler), ('dt', dt)])
pipeline_lr = Pipeline(steps=[('scaler', scaler), ('lr', lr)])
pipeline_kn = Pipeline(steps=[('scaler', scaler), ('kn', kn)])
pipeline_svc = Pipeline(steps=[('scaler', scaler), ('svc', svc)])
pipeline_rf = Pipeline(steps=[('scaler', scaler), ('rf', rf)])
pipeline_xgb = Pipeline(steps=[('scaler', scaler), ('xgb', xgb)])
pipeline_bg = Pipeline(steps=[('scaler', scaler), ('bg', bg)])

## 4. Grid parameters

In [9]:
grid_dt = {"scaler__with_mean": [True, False],
           "scaler__with_std": [True, False],
           "dt__max_depth": [2,3,4,5,6],
           "dt__min_samples_split": [3,4,5,10],
           "dt__min_samples_leaf": [3,4,5,10],
           "dt__class_weight": [None, "balanced"]}

grid_lr = {"scaler__with_mean": [True, False],
           "scaler__with_std": [True, False],
           "lr__C": [0.01, 0.1, 0.5, 1.0, 2.0, 5.0],
           "lr__fit_intercept": [True, False],
           "lr__multi_class": ['ovr', 'auto'],
           "lr__class_weight": [None, "balanced"]}

grid_kn = {"scaler__with_mean": [True, False],
            "scaler__with_std": [True, False],
            "kn__n_neighbors": [1,3,5,7,9,11],
            "kn__metric": ["minkowski", "euclidean"],
            "kn__p": [1,2,3,4]}

grid_svc = {"scaler__with_std": [True],
            "svc__C": [0.1, 0.5, 1.0, 2.0],
            "svc__kernel": ["linear", "rbf"],
            "svc__degree": [2,3,4],
            "svc__class_weight": [None, "balanced"],
            "svc__decision_function_shape": ['ovo', 'ovr']}

grid_rf = {"scaler__with_mean": [True, False],
           "scaler__with_std": [True, False],
           "rf__n_estimators": [10, 30, 50, 100, 150, 200],
           "rf__max_depth": [3,4,5,6],
           "rf__min_samples_split": [3,4,5,10],
           "rf__min_samples_leaf": [3,4,5,10],
           "rf__class_weight": [None, "balanced"]}

grid_xgb = {"scaler__with_mean": [True, False],
            "scaler__with_std": [True, False],
            "xgb__learning_rate": [0.1, 0.01],
            "xgb__gamma" : [0.3, 0.5, 1,],
            "xgb__max_depth": [4, 7],
            "xgb__n_estimators": [100, 250, 500, 1000],
            "xgb__objective": ["multi:softmax"],
            "xgb__num_class": [10]}

grid_bg = {"scaler__with_mean": [True, False],
           "scaler__with_std": [True, False],
           "bg__base_estimator":[lr],
           "bg__n_estimators": [30, 50, 70], 
           "bg__max_samples": [0.2, 0.4, 0.6], 
           "bg__max_features": [1,2,3,4]}

## 5. Grid Search instances

In [10]:
my_scorer = make_scorer(f1_score, greater_is_better=True, average='micro')

In [11]:
gs_dt = GridSearchCV(estimator=pipeline_dt,
                     param_grid=grid_dt,
                     scoring= my_scorer,
                     cv=10,
                     n_jobs=-1)

gs_lr = GridSearchCV(estimator=pipeline_lr,
                     param_grid=grid_lr,
                     scoring=my_scorer,
                     cv=10,
                     n_jobs=-1)

gs_kn = GridSearchCV(estimator=pipeline_kn,
                      param_grid=grid_kn,
                      scoring=my_scorer,
                      cv=10,
                      n_jobs=-1)

gs_svc = GridSearchCV(estimator=pipeline_svc,
                      param_grid=grid_svc,
                      scoring=my_scorer,
                      cv=10,
                      n_jobs=-1)

gs_rf = GridSearchCV(estimator=pipeline_rf,
                     param_grid=grid_rf,
                     scoring=my_scorer,
                     cv=10,
                     n_jobs=-1)

gs_xgb = GridSearchCV(estimator=pipeline_xgb,
                     param_grid=grid_xgb,
                     scoring=my_scorer,
                     cv=10,
                     n_jobs=-1)

gs_bg = GridSearchCV(estimator=pipeline_bg,
                     param_grid=grid_bg,
                     scoring=my_scorer,
                     cv=10,
                     n_jobs=-1)

## 6. Train and test

In [12]:
bag_of_models = {"Decision Tree": gs_dt,
                 "LR": gs_lr,
                 "KN": gs_kn,
                 "SVC": gs_svc,
                 "Random Forest": gs_rf,
                 "XGB": gs_xgb,
                 "GB": gs_bg}

In [13]:
start = time.time()
for name, gs in bag_of_models.items():
    print("Doing Grid Search of {}".format(name))
    gs.fit(X_train, y_train)
    model = gs.best_estimator_
    model.fit(X_train, y_train)
    train_predict_f1 = f1_score(model.predict(X_train), y_train, average='micro')
    print("Model F1 score in train {}".format(train_predict_f1))
    test_predict_f1 = f1_score(model.predict(X_test), y_test, average='micro')
    print('Model F1 score in test {}'.format(test_predict_f1))
    print("MODEL {} FINISHED".format(name))
end = time.time()
print("Time training model {} min".format(np.around((end - start)/60)))

Doing Grid Search of Decision Tree
Model F1 score in train 0.5808093045795978
Model F1 score in test 0.4824561403508772
MODEL Decision Tree FINISHED
Doing Grid Search of LR
Model F1 score in train 0.5568209353040949
Model F1 score in test 0.518796992481203
MODEL LR FINISHED
Doing Grid Search of KN
Model F1 score in train 0.6268475890477344
Model F1 score in test 0.5225563909774437
MODEL KN FINISHED
Doing Grid Search of SVC
Model F1 score in train 0.6275745093288103
Model F1 score in test 0.5375939849624061
MODEL SVC FINISHED
Doing Grid Search of Random Forest
Model F1 score in train 0.6210322267991277
Model F1 score in test 0.5213032581453634
MODEL Random Forest FINISHED
Doing Grid Search of XGB
Model F1 score in train 0.9294887327356434
Model F1 score in test 0.5300751879699248
MODEL XGB FINISHED
Doing Grid Search of GB
Model F1 score in train 0.5255633632178338
Model F1 score in test 0.4849624060150376
MODEL GB FINISHED
Time training model 38.0 min


## 7 Create groups: bad, regular, good
we can see that the multiclassification have a week F1 score, lets try to do bigger groups and test the F1 score again

In [14]:
def quality_group(quality):
    if quality == 3 or quality == 4:
        return 0 #bad wine
    elif quality == 5 or quality == 6 or quality == 7:
        return 1 #regular wine
    else:
        return 2 #good wine

In [15]:
train['new_quality'] = train.apply(lambda x: quality_group(x['quality']), axis=1)
test['new_quality'] = test.apply(lambda x: quality_group(x['quality']), axis=1)

In [16]:
X_train2 = train.drop(columns=['color', 'quality', 'new_quality'])
y_train2 = train[['new_quality']]
X_test2 = test.drop(columns=['color', 'quality', 'new_quality'])
y_test2 = test[['new_quality']]

In [17]:
start = time.time()
for name, gs in bag_of_models.items():
    print("Doing Grid Search of {}".format(name))
    gs.fit(X_train2, y_train2)
    model = gs.best_estimator_
    model.fit(X_train2, y_train2)
    train_predict_f1 = f1_score(model.predict(X_train2), y_train2, average='micro')
    print("Model accuracy in train {}".format(train_predict_f1))
    test_predict_f1 = f1_score(model.predict(X_test2), y_test2, average='micro')
    print('Model accuracy in test {}'.format(test_predict_f1))
    print("MODEL {} FINISHED".format(name))
end = time.time()
print("Time training model {} min".format(np.around((end - start)/60)))

Doing Grid Search of Decision Tree
Model accuracy in train 0.9372425490671191
Model accuracy in test 0.9085213032581454
MODEL Decision Tree FINISHED
Doing Grid Search of LR
Model accuracy in train 0.9372425490671191
Model accuracy in test 0.9085213032581454
MODEL LR FINISHED
Doing Grid Search of KN
Model accuracy in train 0.9386963896292707
Model accuracy in test 0.9060150375939849
MODEL KN FINISHED
Doing Grid Search of SVC
Model accuracy in train 0.9372425490671191
Model accuracy in test 0.9085213032581454
MODEL SVC FINISHED
Doing Grid Search of Random Forest
Model accuracy in train 0.9379694693481948
Model accuracy in test 0.9085213032581454
MODEL Random Forest FINISHED
Doing Grid Search of XGB
Model accuracy in train 0.9430579113157257
Model accuracy in test 0.9085213032581454
MODEL XGB FINISHED
Doing Grid Search of GB
Model accuracy in train 0.9372425490671191
Model accuracy in test 0.9085213032581454
MODEL GB FINISHED
Time training model 27.0 min


As in the last notebooks, XGB is the model with best score in train, but the same in test, so to change we are going to pick the SVM model.

## 8 Interprete the model
SVM is the only linear model which can classify data which is not linearly separable. The model multiply the feature by itself so many times as we fix in the "degree" hyperparameter, creating spaces with 2, 3, 4... dimensions in which the data is linearly separable. the problem is that this calculous is computationally very expensive, so we can't fix a degree 100, or do a grid search with a lot of differents degrees

In [18]:
bag_of_models['SVC'].best_estimator_

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovo', degree=2,
                     gamma='auto_deprecated', kernel='linear', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

In [19]:
svc_champion = SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovo', degree=2,
                     gamma='auto_deprecated', kernel='linear', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=2)

svc_champion.fit(X_train2, y_train2)

[LibSVM]

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=2, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=2)

In [20]:
print("F1 score en Train: ", f1_score(svc_champion.predict(X_train2), y_train2, average='micro'))
print("F1 score en Test: ", f1_score(svc_champion.predict(X_test2), y_test2, average='micro'))

F1 score en Train:  0.9372425490671191
F1 score en Test:  0.9085213032581454


In [21]:
confusion_matrix(svc_champion.predict(X_test2), y_test2)

array([[  0,   0,   0],
       [ 52, 725,  21],
       [  0,   0,   0]])

In [22]:
confusion_matrix(svc_champion.predict(X_train2), y_train2)

array([[   0,    0,    0],
       [ 137, 3868,  122],
       [   0,    0,    0]])

Here we can see a horrible model with a very good score. Our model is terrible, as we can see our dataset is very unbalanced and the model predicts all wines as "regular" wines, so it's not a predictive model.  
Let's try with a balanced hyperparamenter!

In [23]:
svc2 = SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
                     decision_function_shape='ovo', degree=2,
                     gamma='auto_deprecated', kernel='linear', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=2)
svc2.fit(X_train2, y_train2)

[LibSVM]

SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovo', degree=2, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=2)

In [24]:
confusion_matrix(svc2.predict(X_train2), y_train2)

array([[  98, 1058,    7],
       [  30, 1877,   23],
       [   9,  933,   92]])

The cofusion Matrix looks very bad, we're going to try to make other groups

In [25]:
def quality_group(quality):
    if quality == 3 or quality == 4 or quality == 5:
        return 0 #bad wine
    elif quality == 6:
        return 1 #regular wine
    else:
        return 2 #good wine

In [26]:
train['new_quality'] = train.apply(lambda x: quality_group(x['quality']), axis=1)
test['new_quality'] = test.apply(lambda x: quality_group(x['quality']), axis=1)

In [27]:
X_train2 = train.drop(columns=['color', 'quality', 'new_quality'])
y_train2 = train[['new_quality']]
X_test2 = test.drop(columns=['color', 'quality', 'new_quality'])
y_test2 = test[['new_quality']]

In [28]:
train.groupby(['new_quality'])['quality'].count()

new_quality
0    1491
1    1832
2     804
Name: quality, dtype: int64

Is not completly balanced but looks good

In [29]:
svc3 = SVC(C=0.1, cache_size=200, coef0=0.0,
                     decision_function_shape='ovo', degree=2,
                     gamma='auto_deprecated', kernel='linear', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=2)
svc3.fit(X_train2, y_train2)

[LibSVM]

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=2, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=2)

In [30]:
confusion_matrix(svc3.predict(X_train2), y_train2)

array([[ 988,  460,   48],
       [ 503, 1372,  756],
       [   0,    0,    0]])

Terrible results, too. So in this point we can't create a good multiclass model, we can try some things to improve our model.

1. Collect more data 
2. Make a better preprocessing 
3. Try other complex techniques to create synthetic data
4. If we only want to predict the class and don't care about the explanation of the model, we can make a Neural Network

We will not explain the output variables of this model because it will have no sense with the real world

PD: obviously is a very bad model so i will not save it.