In [0]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
from pandas import read_csv
from pandas import datetime
from pandas import datetime
from pandas import DataFrame
from pandas import concat
from sklearn.metrics import mean_squared_error

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [0]:
from google.colab import drive

drive.mount('/content/drive', force_remount = True)
data = '/content/drive/My Drive/data_folder/winequality-white.csv'

Mounted at /content/drive


In [0]:
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

series = read_csv(data, header=1, sep=';', engine='python')
df = DataFrame(series.values, columns=["fixed acidity", "volatile acidity", 
                                       "citric acid", "residual sugar", 
                                       "chlorides", "free sulfur dioxide", 
                                       "total sulfur dioxide", "density", 
                                       "pH", "sulphates", "alcohol", "quality"])

y = df['quality']
lenc = preprocessing.LabelEncoder()
y = lenc.fit_transform(y)
X = df.loc[:, df.columns != 'quality']
X = StandardScaler(with_std=False).fit_transform(X)

print(pd.Series(y).value_counts())
print(np.unique(y))

3    2197
2    1457
4     880
5     175
1     163
0      20
6       5
dtype: int64
[0 1 2 3 4 5 6]


In [0]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.10, random_state=1) 

In [0]:
from sklearn.utils import resample

X = pd.DataFrame(X_train, columns=["fixed acidity", "volatile acidity", 
                                       "citric acid", "residual sugar", 
                                       "chlorides", "free sulfur dioxide", 
                                       "total sulfur dioxide", "density", 
                                       "pH", "sulphates", "alcohol"])

Y = pd.DataFrame(Y_train, columns=["quality"])
frame = concat([X, Y], axis=1)
six = frame[frame.quality==6]
zero = frame[frame.quality==0]
one = frame[frame.quality==1]
five = frame[frame.quality==5]
four = frame[frame.quality==4]

classes = [six, zero, one, five, four]

for aclass in classes:
  upsample = resample(aclass, replace=True,
                    n_samples=len(frame[frame.quality==4]), 
                    random_state=1)
  frame = pd.concat([frame, upsample])

frame.quality.value_counts()

3    1987
4    1592
2    1297
5     958
1     940
0     812
6     801
Name: quality, dtype: int64

In [0]:
X_train = frame.loc[:, frame.columns != 'quality']
Y_train = frame.iloc[:,-1]

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
 
parameters = {
    'bootstrap': [True, False],
    'max_features': [3, 5, 9, 11],
    'min_samples_leaf': [2, 3],
    'min_samples_split': [2, 5, 8],
    'n_estimators': [20, 150, 500, 1000, 1500]}

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = parameters, cv = 3, n_jobs = 1, verbose = 2, refit=False)
grid_search.fit(X_train, Y_train)
grid_search.best_params_

Fitting 3 folds for each of 240 candidates, totalling 720 fits
[CV] bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=20 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=20, total=   0.2s
[CV] bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=20 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV]  bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=20, total=   0.2s
[CV] bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=20 
[CV]  bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=20, total=   0.2s
[CV] bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=150 
[CV]  bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=150, total=   1.6s
[CV] bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=150 
[CV]  bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=150, total=   1.6s
[CV] bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=150 
[CV]  bootstrap=True, max_features=3, min_samples_leaf=2, min_samples_split=2, n_estimators=150, total=   1.6s
[CV] bootstrap=True, max_features=3, min_samples_leaf=2, min_sam

[Parallel(n_jobs=1)]: Done 720 out of 720 | elapsed: 181.0min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

{'bootstrap': False,
 'max_features': 3,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 1000}

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

rf = RandomForestClassifier(bootstrap=False, max_features=1, min_samples_leaf=2, min_samples_split=9, n_estimators=100)
rf.fit(X_train, Y_train)
print(rf.score(X_test, Y_test))
y_pred = rf.predict(X_test)

print(mean_squared_error(Y_test, y_pred))
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=1,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=9,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

0.710204081632653
0.38979591836734695
[[  0   0   2   2   0   0]
 [  0   4  10   5   0   0]
 [  0   1 113  45   1   0]
 [  0   0  29 169  12   0]
 [  0   0   0  29  55   0]
 [  0   0   0   3   3   7]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.80      0.21      0.33        19
           2       0.73      0.71      0.72       160
           3       0.67      0.80      0.73       210
           4       0.77      0.65      0.71        84
           5       1.00      0.54      0.70        13

    accuracy                           0.71       490
   macro avg       0.66      0.49      0.53       490
weighted avg       0.72      0.71      0.70       490



  _warn_prf(average, modifier, msg_start, len(result))


In [0]:
mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(100,50,25), activation = 'relu', max_iter=100, random_state=12, early_stopping = True) #alpha is the L2 penalty (regularization term) parameter
mlp.fit(X_train, Y_train)
print("Training set score: %f" % mlp.score(X_train, Y_train))
print("Test set score: %f" % mlp.score(X_test, Y_test))
y_pred = mlp.predict(X_test)
print(mean_squared_error(Y_test, y_pred))

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(100, 50, 25), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=100,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=12, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

Training set score: 0.646357
Test set score: 0.481633
0.926530612244898


In [0]:
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=1, tol=1e-5, max_iter=300)
clf.fit(X_train, Y_train)
print(clf.score(X_test, Y_test))
y_pred = clf.predict(X_test)
print(mean_squared_error(Y_test, y_pred))



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=300,
          multi_class='ovr', penalty='l2', random_state=1, tol=1e-05,
          verbose=0)

0.25510204081632654
3.016326530612245


In [0]:
from sklearn.svm import SVC
svm1 = svm.SVC(kernel='rbf', gamma='auto', decision_function_shape= 'ovo')
svm1.fit(X_train, Y_train)
print(svm1.score(X_test, Y_test))

y_pred = svm1.predict(X_test)
print(mean_squared_error(Y_test, y_pred))

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

0.5551020408163265
0.7979591836734694


In [0]:
from sklearn import linear_model
clf = linear_model.SGDClassifier(max_iter=3000, tol=1e-3)
clf.fit(X_train, Y_train)
print(clf.score(X_test, Y_test))
y_pred = clf.predict(X_test)
print(mean_squared_error(Y_test, y_pred))

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=3000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

0.23877551020408164
2.3877551020408165


In [0]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, Y_train)
print(clf.score(X_test, Y_test))
y_pred = clf.predict(X_test)
print(mean_squared_error(Y_test, y_pred))

0.3816326530612245
1.473469387755102


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [0]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, Y_train)
print(gnb.score(X_test, Y_test))
y_pred = gnb.predict(X_test)
print(mean_squared_error(Y_test, y_pred))

GaussianNB(priors=None, var_smoothing=1e-09)

0.3795918367346939
1.3020408163265307


In [0]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, Y_train)
print(clf.score(X_test, Y_test))
y_pred = clf.predict(X_test)
print(mean_squared_error(Y_test, y_pred))

0.6571428571428571
0.6530612244897959


In [0]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train, Y_train)
print(clf.score(X_test, Y_test))
y_pred = clf.predict(X_test)
print(mean_squared_error(Y_test, y_pred))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

0.6204081632653061
0.636734693877551
