In [690]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter
from sklearn.model_selection import GridSearchCV

In [667]:
data = pd.read_csv('winequality-red.csv')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# Preprocessing

Drop NaN from the dataset

In [668]:
data.dropna(inplace=True)

Transform the data

In [669]:
numeric_features = data.drop('quality', axis=1).columns

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

column_transformer = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
    ]
)

transformed_data = column_transformer.fit_transform(data)

Split the data into features and target variable

In [670]:
X = pd.DataFrame(transformed_data, columns=numeric_features)
y = data['quality']

Split the data into training and testing

In [671]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)
Counter(y_train)

Counter({5: 479, 6: 436, 7: 150, 4: 36, 8: 13, 3: 5})

# Training

In [672]:
def show_metrics(y_pred, X_test, y_test):
    print('Classification report:')
    report = classification_report(y_test, y_pred, zero_division=1)
    print(report)

    print('Confusion matrix:')
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

## Random Forest

In [673]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

Predict and calculate metrics

In [674]:
y_pred = rf.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       1.00      0.00      0.00        17
           5       0.78      0.85      0.81       202
           6       0.72      0.75      0.73       202
           7       0.55      0.53      0.54        49
           8       0.50      0.20      0.29         5

    accuracy                           0.73       480
   macro avg       0.59      0.39      0.40       480
weighted avg       0.73      0.73      0.71       480

Confusion matrix:
[[  0   0   4   1   0   0]
 [  1   0   9   7   0   0]
 [  0   0 171  30   1   0]
 [  0   0  33 151  17   1]
 [  0   0   3  20  26   0]
 [  0   0   0   1   3   1]]


Use Grid Search to perform hyperparameter tuning with cross-validation to find the best hyperparameters for the model

In [687]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(rf, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)

Best parameters:  {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Best score:  0.6773422485586164


Use the best parameters to fit the model

In [688]:
rf = RandomForestClassifier(
    max_depth=grid_search.best_params_['max_depth'], 
    min_samples_split=grid_search.best_params_['min_samples_split'], 
    n_estimators=grid_search.best_params_['n_estimators']
)
rf.fit(X_train, y_train)

Predict and calculate the metrics using the best parameters

In [689]:
y_pred = rf.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       1.00      0.06      0.11        17
           5       0.77      0.84      0.80       202
           6       0.71      0.75      0.73       202
           7       0.56      0.49      0.52        49
           8       1.00      0.20      0.33         5

    accuracy                           0.72       480
   macro avg       0.67      0.39      0.42       480
weighted avg       0.73      0.72      0.71       480

Confusion matrix:
[[  0   0   4   1   0   0]
 [  1   1  10   5   0   0]
 [  0   0 170  31   1   0]
 [  0   0  34 152  16   0]
 [  0   0   3  22  24   0]
 [  0   0   0   2   2   1]]


## Naive Bayes

In [675]:
nb = GaussianNB()
nb.fit(X_train, y_train)

Predict and calculate metrics

In [676]:
y_pred = nb.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

           3       0.50      0.20      0.29         5
           4       0.16      0.41      0.23        17
           5       0.65      0.60      0.62       202
           6       0.55      0.42      0.48       202
           7       0.31      0.55      0.39        49
           8       0.00      0.00      0.00         5

    accuracy                           0.50       480
   macro avg       0.36      0.36      0.34       480
weighted avg       0.54      0.50      0.51       480

Confusion matrix:
[[  1   2   1   1   0   0]
 [  1   7   6   3   0   0]
 [  0  24 122  50   6   0]
 [  0   8  57  85  51   1]
 [  0   2   3  15  27   2]
 [  0   0   0   1   4   0]]


Use Grid Search to perform hyperparameter tuning with cross-validation to find the best hyperparameters for the model

In [691]:
param_grid = {
    'var_smoothing': np.logspace(0,-9, num=100)
}
grid_search = GridSearchCV(nb, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)

Best parameters:  {'var_smoothing': 0.8111308307896871}
Best score:  0.5718930172966047


Use the best parameters to fit the model

In [692]:
nb = GaussianNB(
    var_smoothing=grid_search.best_params_['var_smoothing']
)
nb.fit(X_train, y_train)

Predict and calculate the metrics using the best parameters

In [693]:
y_pred = nb.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

           3       1.00      0.00      0.00         5
           4       0.20      0.06      0.09        17
           5       0.64      0.74      0.69       202
           6       0.57      0.58      0.58       202
           7       0.39      0.31      0.34        49
           8       1.00      0.00      0.00         5

    accuracy                           0.59       480
   macro avg       0.63      0.28      0.28       480
weighted avg       0.58      0.59      0.57       480

Confusion matrix:
[[  0   2   3   0   0   0]
 [  0   1   9   7   0   0]
 [  0   1 149  49   3   0]
 [  0   1  67 117  17   0]
 [  0   0   5  29  15   0]
 [  0   0   0   2   3   0]]


## Logistic Regression

In [706]:
lr = LogisticRegression(max_iter=10000)
lr.fit(X_train, y_train)

Predict and calculate metrics

In [701]:
y_pred = lr.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

           3       1.00      0.20      0.33         5
           4       0.33      0.06      0.10        17
           5       0.68      0.78      0.73       202
           6       0.61      0.61      0.61       202
           7       0.38      0.31      0.34        49
           8       0.00      0.00      0.00         5

    accuracy                           0.62       480
   macro avg       0.50      0.33      0.35       480
weighted avg       0.60      0.62      0.60       480

Confusion matrix:
[[  1   1   3   0   0   0]
 [  0   1   9   6   1   0]
 [  0   1 157  40   4   0]
 [  0   0  59 124  18   1]
 [  0   0   3  31  15   0]
 [  0   0   0   3   2   0]]


Use Grid Search to perform hyperparameter tuning with cross-validation to find the best hyperparameters for the model

In [708]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}
grid_search = GridSearchCV(lr, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)

Best parameters:  {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Best score:  0.5862027546444587


Use the best parameters to fit the model

In [709]:
lr = LogisticRegression(
    C=grid_search.best_params_['C'], 
    penalty=grid_search.best_params_['penalty'], 
    solver=grid_search.best_params_['solver']
)
lr.fit(X_train, y_train)

Predict and calculate the metrics using the best parameters

In [710]:
y_pred = lr.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

           3       1.00      0.00      0.00         5
           4       0.25      0.06      0.10        17
           5       0.68      0.81      0.74       202
           6       0.62      0.63      0.62       202
           7       0.42      0.27      0.33        49
           8       1.00      0.00      0.00         5

    accuracy                           0.63       480
   macro avg       0.66      0.29      0.30       480
weighted avg       0.62      0.63      0.61       480

Confusion matrix:
[[  0   2   3   0   0   0]
 [  0   1   9   6   1   0]
 [  0   1 163  35   3   0]
 [  0   0  62 127  13   0]
 [  0   0   3  33  13   0]
 [  0   0   0   4   1   0]]


## KNN

In [679]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

Predict and calculate metrics

In [680]:
y_pred = knn.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

           3       1.00      0.00      0.00         5
           4       1.00      0.12      0.21        17
           5       0.63      0.69      0.66       202
           6       0.58      0.60      0.59       202
           7       0.41      0.39      0.40        49
           8       1.00      0.00      0.00         5

    accuracy                           0.59       480
   macro avg       0.77      0.30      0.31       480
weighted avg       0.61      0.59      0.57       480

Confusion matrix:
[[  0   0   4   1   0   0]
 [  0   2   8   7   0   0]
 [  0   0 139  59   4   0]
 [  0   0  58 122  22   0]
 [  0   0  11  19  19   0]
 [  0   0   1   3   1   0]]


Use Grid Search to perform hyperparameter tuning with cross-validation to find the best hyperparameters for the model

In [711]:
param_grid = {
    'n_neighbors': list(range(1, 31)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print('Best parameters: ', grid_search.best_params_)
print('Best score: ', grid_search.best_score_)

Best parameters:  {'n_neighbors': 27, 'p': 2, 'weights': 'distance'}
Best score:  0.6541519859064702


Use the best parameters to fit the model

In [712]:
knn = KNeighborsClassifier(
    n_neighbors=grid_search.best_params_['n_neighbors'], 
    weights=grid_search.best_params_['weights'], 
    p=grid_search.best_params_['p']
)
knn.fit(X_train, y_train)

Predict and calculate the metrics using the best parameters

In [713]:
y_pred = knn.predict(X_test)
show_metrics(y_pred, X_test, y_test)

Classification report:
              precision    recall  f1-score   support

           3       1.00      0.00      0.00         5
           4       1.00      0.00      0.00        17
           5       0.75      0.78      0.77       202
           6       0.67      0.74      0.70       202
           7       0.55      0.53      0.54        49
           8       1.00      0.20      0.33         5

    accuracy                           0.70       480
   macro avg       0.83      0.38      0.39       480
weighted avg       0.71      0.70      0.68       480

Confusion matrix:
[[  0   0   4   1   0   0]
 [  0   0  11   6   0   0]
 [  0   0 158  43   1   0]
 [  0   0  35 149  18   0]
 [  0   0   2  21  26   0]
 [  0   0   0   2   2   1]]
