In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# Wine Quality Dataset

In [2]:
df = pd.read_csv("28-winequality-red.csv")

In [3]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

- Tüm kolonlar sayısal türden. Bir encoding işlemine ihtiyacımız olmayacak. Missing value de yok yine hiçbir kolon için.

# Train Test Split

In [6]:
X = df.drop('quality', axis=1)
y = df['quality']

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)

In [9]:
X_train.shape

(1199, 11)

In [10]:
X_test.shape 

(400, 11)

# Decision Tree Classifier

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [12]:
param_grid = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'splitter' : ['best', 'random'],
    'max_depth' : [3,4,5,6,10,15,20,25,30,None],
    'max_features' : ['sqrt', 'log2', None]
}

cv = StratifiedKFold()

In [13]:
gridDecision = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='accuracy', refit=True)

In [14]:
import warnings
warnings.filterwarnings('ignore')

In [15]:
gridDecision.fit(X_train, y_train)

In [16]:
gridDecision.best_params_

{'criterion': 'gini',
 'max_depth': 20,
 'max_features': None,
 'splitter': 'random'}

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [18]:
tree = DecisionTreeClassifier(criterion='gini', max_depth=20, max_features='log2', splitter='best')

In [19]:
tree.fit(X_train, y_train)

In [20]:
y_pred = tree.predict(X_test)
print("Accuracy :", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix :\n", confusion_matrix(y_test, y_pred))

Accuracy : 0.585
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.08      0.07      0.07        14
           5       0.64      0.68      0.66       163
           6       0.63      0.56      0.59       171
           7       0.43      0.56      0.48        43
           8       0.50      0.40      0.44         5

    accuracy                           0.58       400
   macro avg       0.38      0.38      0.38       400
weighted avg       0.58      0.58      0.58       400

Confusion Matrix :
 [[  0   1   1   1   1   0]
 [  0   1   8   5   0   0]
 [  1   6 111  36   7   2]
 [  0   5  49  96  21   0]
 [  0   0   5  14  24   0]
 [  0   0   0   0   3   2]]


# K- Nearest Neighboors Classifier (KNN)

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [24]:
knn = KNeighborsClassifier(n_neighbors=5, algorithm='auto')
knn.fit(X_train_scaled, y_train)

In [25]:
y_pred = knn.predict(X_test)
print("Accuracy :", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix :\n", confusion_matrix(y_test, y_pred))

Accuracy : 0.4275
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00        14
           5       0.42      0.98      0.59       163
           6       0.60      0.07      0.13       171
           7       0.00      0.00      0.00        43
           8       0.00      0.00      0.00         5

    accuracy                           0.43       400
   macro avg       0.17      0.17      0.12       400
weighted avg       0.43      0.43      0.29       400

Confusion Matrix :
 [[  0   0   4   0   0   0]
 [  0   0  12   2   0   0]
 [  0   0 159   4   0   0]
 [  0   0 159  12   0   0]
 [  0   0  41   2   0   0]
 [  0   0   5   0   0   0]]


# SVM Classifier

In [26]:
from sklearn.svm import SVC

In [31]:
param_grid = {
    'C' : [0.1, 1, 10],
    'kernel' : ['linear', 'poly', 'rbf'],
    'gamma' : ['scale', 'auto']
}
cv = StratifiedKFold()

In [32]:
svcgrid = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring='accuracy', cv=cv, refit=True, n_jobs=-1, verbose=3)

In [33]:
svcgrid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [34]:
y_pred = svcgrid.predict(X_test)
print("Accuracy :", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix :\n", confusion_matrix(y_test, y_pred))

Accuracy : 0.4075
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.00      0.00      0.00        14
           5       0.41      1.00      0.58       163
           6       0.00      0.00      0.00       171
           7       0.00      0.00      0.00        43
           8       0.00      0.00      0.00         5

    accuracy                           0.41       400
   macro avg       0.07      0.17      0.10       400
weighted avg       0.17      0.41      0.24       400

Confusion Matrix :
 [[  0   0   4   0   0   0]
 [  0   0  14   0   0   0]
 [  0   0 163   0   0   0]
 [  0   0 171   0   0   0]
 [  0   0  43   0   0   0]
 [  0   0   5   0   0   0]]
