In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
data = pd.read_csv('winequality-red.csv')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# Preprocessing

Drop NaN from the dataset

In [4]:
data.dropna(inplace=True)

Transform the data

In [38]:
numeric_features = data.drop('quality', axis=1).columns

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

column_transformer = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numeric_features),
    ]
)

transformed_data = column_transformer.fit_transform(data)

Split the data into features and target variable

In [39]:
X = pd.DataFrame(transformed_data, columns=numeric_features)
y = data['quality']

Split the data into training and testing

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Training

## Random Forest

In [41]:
rf = RandomForestClassifier()

rf.fit(X_train, y_train)

Predict and calculate metrics

In [42]:
y_pred = rf.predict(X_test)

report = classification_report(y_test, y_pred, zero_division=1)
print(report)

              precision    recall  f1-score   support

           3       1.00      0.00      0.00         1
           4       1.00      0.00      0.00        17
           5       0.73      0.77      0.75       195
           6       0.64      0.72      0.68       200
           7       0.57      0.46      0.51        61
           8       0.50      0.17      0.25         6

    accuracy                           0.67       480
   macro avg       0.74      0.35      0.36       480
weighted avg       0.68      0.67      0.66       480



In [43]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[  0   0   1   0   0   0]
 [  0   0  11   6   0   0]
 [  0   0 150  43   2   0]
 [  0   0  42 144  14   0]
 [  0   0   1  31  28   1]
 [  0   0   0   0   5   1]]


## Naive Bayes

In [44]:
nb = GaussianNB()
nb.fit(X_train, y_train)

Predict and calculate metrics

In [45]:
y_pred = nb.predict(X_test)

report = classification_report(y_test, y_pred, zero_division=1)
print(report)

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.12      0.12      0.12        17
           5       0.68      0.61      0.64       195
           6       0.52      0.53      0.52       200
           7       0.40      0.52      0.45        61
           8       0.00      0.00      0.00         6

    accuracy                           0.54       480
   macro avg       0.28      0.30      0.29       480
weighted avg       0.55      0.54      0.54       480



In [46]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[  0   0   1   0   0   0]
 [  1   2   8   6   0   0]
 [  0   6 119  64   6   0]
 [  0   9  45 105  38   3]
 [  0   0   3  26  32   0]
 [  0   0   0   1   5   0]]


## Logistic Regression

In [47]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

Predict and calculate metrics

In [52]:
y_pred = lr.predict(X_test)

report = classification_report(y_test, y_pred, zero_division=1)
print(report)

              precision    recall  f1-score   support

           3       1.00      0.00      0.00         1
           4       1.00      0.00      0.00        17
           5       0.62      0.75      0.68       195
           6       0.53      0.55      0.54       200
           7       0.42      0.26      0.32        61
           8       1.00      0.00      0.00         6

    accuracy                           0.56       480
   macro avg       0.76      0.26      0.26       480
weighted avg       0.58      0.56      0.54       480



In [53]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[  0   0   1   0   0   0]
 [  0   0  11   6   0   0]
 [  0   0 146  49   0   0]
 [  0   0  73 109  18   0]
 [  0   0   4  41  16   0]
 [  0   0   0   2   4   0]]


## KNN

In [51]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

Predict and calculate metrics

In [54]:
y_pred = knn.predict(X_test)

report = classification_report(y_test, y_pred, zero_division=1)
print(report)

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.33      0.06      0.10        17
           5       0.61      0.70      0.66       195
           6       0.55      0.56      0.56       200
           7       0.48      0.39      0.43        61
           8       1.00      0.00      0.00         6

    accuracy                           0.57       480
   macro avg       0.50      0.29      0.29       480
weighted avg       0.56      0.57      0.56       480



In [55]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[  0   0   0   1   0   0]
 [  0   1   8   8   0   0]
 [  1   1 137  52   4   0]
 [  0   1  68 112  19   0]
 [  0   0   9  28  24   0]
 [  0   0   1   2   3   0]]
