In [18]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import joblib
import os
import copy
from scipy import stats
from itertools import *
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve,auc,roc_auc_score
import pickle
%matplotlib inline
sns.set_style('white')
plt.rcParams['figure.dpi'] = 300
plt.show()

In [19]:
cwd = os.getcwd()
cwd

'/home/jupyter-bhu22/Capstone Project'

In [20]:
red_df = pd.read_csv(cwd+'/winequality-red.csv',sep=';')
red_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [21]:
red_df['wine_label'] = [2 if x> 6 else 1 if ((x >4) and (x <7)) else 0 for x in  red_df['quality']]

In [22]:
X1 = red_df.drop(['quality','wine_label'], axis = 1)
y1 = red_df['wine_label']

In [23]:
def train_test(X,y,random):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state = random)
    X_train = StandardScaler().fit_transform(X_train)
    X_test  = StandardScaler().fit_transform(X_test)
    print('X_train:',X_train.shape, 'X_test:', X_test.shape)
    print('y_train:',y_train.shape, 'y_test:', y_test.shape)
    return X_train, X_test, y_train, y_test

In [24]:
def cv_hyperparameters(model,param_grid,X_train,y_train,cv):
    scoring = { 'F1 Score':'f1_weighted'}
    
    X_train = np.array(X_train, dtype=int)
    y_train = np.array(y_train, dtype=int)
    
    grid = GridSearchCV(model, param_grid, scoring=scoring, cv=cv, refit=False)
    grid.fit(X_train, y_train)
    
    result_df= pd.concat([pd.DataFrame(grid.cv_results_['params']),
              pd.DataFrame(grid.cv_results_['mean_test_F1 Score'],columns=['F1 Score'])],axis=1)
    return result_df

In [25]:
models = {'SVM': SVC(),
          'RF': RandomForestClassifier()}

#SVC
param_gridSVM = {'C':[0.001,0.01,0.1,1,10,100,1000],
             'gamma':[1,0.1,0.01,0.001,0.0001],
             'kernel':['linear','poly','sigmoid','rbf'],
             'degree':[1,2,3,4,5]}

# RandomForest
param_gridRF = {'n_estimators': [100,200,300,1000],
                'bootstrap': [True],
              'max_features': [2,3],
              'max_depth' : [80,90,100,110],
                'min_samples_leaf':[3,4,5],
                'min_samples_split':[8,10,12]}



In [26]:
X_train, X_test, y_train, y_test= train_test(X1,y1,20)

X_train: (1119, 11) X_test: (480, 11)
y_train: (1119,) y_test: (480,)


## SVM Hyperparameters Tuning

In [27]:
result_svm = cv_hyperparameters(models['SVM'],param_gridSVM,X_train,y_train,5)

In [28]:
result_svm.sort_values(by='F1 Score', ascending = False)

Unnamed: 0,C,degree,gamma,kernel,F1 Score
587,100.0,5,0.1,rbf,0.811863
547,100.0,3,0.1,rbf,0.811863
507,100.0,1,0.1,rbf,0.811863
567,100.0,4,0.1,rbf,0.811863
527,100.0,2,0.1,rbf,0.811863
...,...,...,...,...,...
642,1000.0,3,1.0,sigmoid,0.718658
682,1000.0,5,1.0,sigmoid,0.718658
622,1000.0,2,1.0,sigmoid,0.718658
662,1000.0,4,1.0,sigmoid,0.718658


## Random Forest Hyperparameters Tuning

In [30]:
result_rf = cv_hyperparameters(models['RF'],param_gridRF,X_train,y_train,5)

In [31]:
result_rf.sort_values(by='F1 Score', ascending = False)

Unnamed: 0,bootstrap,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,F1 Score
117,True,90,3,3,12,200,0.793958
56,True,80,3,4,12,100,0.792998
123,True,90,3,4,8,1000,0.791814
41,True,80,3,3,10,200,0.791158
191,True,100,3,3,12,1000,0.791010
...,...,...,...,...,...,...,...
168,True,100,2,5,8,100,0.767272
241,True,110,2,5,8,200,0.766969
106,True,90,2,5,12,300,0.766969
240,True,110,2,5,8,100,0.766058
