In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [38]:
df = pd.read_csv(r"WineQT.csv")

In [39]:
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,4


In [40]:
#For Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [41]:
#All Algorithms
from sklearn.linear_model import LinearRegression ,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC ,SVR
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import os
import random

In [42]:
# For Data Processing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 

In [43]:
df['quality'] = df['quality']-3
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,2,1
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,2,2
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,3,3
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,3,1592
1139,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5,3,1593
1140,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,2,1594
1141,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,3,1595


In [44]:
df.describe()[1:].T.style.background_gradient(cmap='Oranges')

Unnamed: 0,mean,std,min,25%,50%,75%,max
fixed acidity,8.311111,1.747595,4.6,7.1,7.9,9.1,15.9
volatile acidity,0.531339,0.179633,0.12,0.3925,0.52,0.64,1.58
citric acid,0.268364,0.196686,0.0,0.09,0.25,0.42,1.0
residual sugar,2.532152,1.355917,0.9,1.9,2.2,2.6,15.5
chlorides,0.086933,0.047267,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,15.615486,10.250486,1.0,7.0,13.0,21.0,68.0
total sulfur dioxide,45.914698,32.78213,6.0,21.0,37.0,61.0,289.0
density,0.99673,0.001925,0.99007,0.99557,0.99668,0.997845,1.00369
pH,3.311015,0.156664,2.74,3.205,3.31,3.4,4.01
sulphates,0.657708,0.170399,0.33,0.55,0.62,0.73,2.0


In [45]:
fig = go.Figure(data=[go.Pie(labels=df['quality'].value_counts().index, values=df['quality'].value_counts(), hole=.5)])
fig.update_layout(legend_title_text='Quality')
fig.show()

In [46]:
fig = px.imshow(df.corr(),color_continuous_scale="Oranges")
fig.update_layout(height=750)
fig.show()

In [47]:
df_corr_bar = abs(df.corr()['quality']).sort_values()[:-1]
fig = px.bar(df_corr_bar, orientation='h', color_discrete_sequence =['#4285f4']*len(df_corr_bar))
fig.update_layout()
fig.show()

In [48]:
fig = go.Figure()
for x in range(6):
    fig.add_trace(go.Box(
        x=df[df['quality']==x]['volatile acidity'],
        y=df[df['quality']==x]['quality'], name='Quality '+str(x)
    ))
fig.update_layout(
    yaxis_title='quality', xaxis_title='volatile acidity'
)
fig.update_traces(orientation='h')
fig.show()

In [49]:
fig = px.scatter(df, x="total sulfur dioxide", y="free sulfur dioxide", color=df['quality'],  color_continuous_scale='Reds')
fig.update_layout(legend_title_text='Quality')

In [50]:
fig = go.Figure()
for x in range(6):
    fig.add_trace(go.Box(
        x=df[df['quality']==x]['citric acid'],
        y=df[df['quality']==x]['quality'], name='Quality '+str(x)
    ))

fig.update_layout(
    yaxis_title='quality', xaxis_title='citric acid'
)
fig.update_traces(orientation='h')
fig.show()

In [51]:
fig = px.scatter(df, x="fixed acidity", y="density", color=df['quality'],  color_continuous_scale='Reds')
fig.update_layout(legend_title_text='Quality')


In [52]:
fig = go.Figure()
for x in range(6):
    fig.add_trace(go.Box(
        x=df[df['quality']==x]['sulphates'],
        y=df[df['quality']==x]['quality'], name='Quality '+str(x)
    ))

fig.update_layout(
    yaxis_title='quality', xaxis_title='sulphates'
)
fig.update_traces(orientation='h')
fig.show()

In [53]:
fig = px.scatter(df, x="citric acid", y="volatile acidity", color=df['quality'],  color_continuous_scale='Reds')
fig.update_layout(legend_title_text='Quality')

In [54]:
fig = px.scatter(df, x="citric acid", y="volatile acidity", color=df['quality'],  color_continuous_scale='Reds')
fig.update_layout(legend_title_text='Quality')

In [55]:
df.describe().T[['min', 'max']][:-1].style.background_gradient(cmap='Oranges')

Unnamed: 0,min,max
fixed acidity,4.6,15.9
volatile acidity,0.12,1.58
citric acid,0.0,1.0
residual sugar,0.9,15.5
chlorides,0.012,0.611
free sulfur dioxide,1.0,68.0
total sulfur dioxide,6.0,289.0
density,0.99007,1.00369
pH,2.74,4.01
sulphates,0.33,2.0


In [56]:
for col in ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']:
    df[col] = df[col]/df[col].max()

In [57]:
features = np.array(df[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']])
labels = np.array(df['quality'])

In [58]:
#Training and Testing the data
x_train, x_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=0)

In [59]:
model_comparison = {}

In [60]:
parameters = {'C': [6,8,10,12,14,16], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

svc_model = SVC()

clf = GridSearchCV(svc_model, parameters)
print("Searching for best hyperparameters ...")
clf.fit(x_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(x_val)
model_comparison['SVC'] = [accuracy_score(y_val,y_pred), f1_score(y_val,y_pred, average='weighted')]
print('\n')
print(classification_report(y_val,y_pred, zero_division=1))

Searching for best hyperparameters ...
Best Hyperparameters: {'C': 12, 'kernel': 'poly'}


              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       1.00      0.00      0.00         7
           2       0.72      0.83      0.77       100
           3       0.61      0.63      0.62        92
           4       0.47      0.33      0.39        27
           5       1.00      0.00      0.00         2

    accuracy                           0.66       229
   macro avg       0.80      0.30      0.30       229
weighted avg       0.66      0.66      0.63       229



In [61]:
parameters = {'max_depth': [5,10,15,20]}

Tree_model = DecisionTreeClassifier()

clf = GridSearchCV(Tree_model, parameters)
print("Searching for best hyperparameters ...")
clf.fit(x_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(x_val)
model_comparison['DecisionTreeClassifier'] = [accuracy_score(y_val,y_pred), f1_score(y_val,y_pred, average='weighted')]
print('\n')
print(classification_report(y_val,y_pred, zero_division=1))

Searching for best hyperparameters ...
Best Hyperparameters: {'max_depth': 5}


              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       0.00      0.00      0.00         7
           2       0.66      0.67      0.66       100
           3       0.55      0.58      0.56        92
           4       0.46      0.44      0.45        27
           5       1.00      0.00      0.00         2

    accuracy                           0.58       229
   macro avg       0.61      0.28      0.28       229
weighted avg       0.57      0.58      0.57       229



In [62]:
parameters = {'n_neighbors': [10,20,30,40,50]}

K_model = KNeighborsClassifier()

clf = GridSearchCV(K_model, parameters)
print("Searching for best hyperparameters ...")
clf.fit(x_train, y_train)
print(f'Best Hyperparameters: {clf.best_params_}')

y_pred = clf.predict(x_val)
model_comparison['KNeighborsClassifier'] = [accuracy_score(y_val,y_pred), f1_score(y_val,y_pred, average='weighted')]
print('\n')
print(classification_report(y_val,y_pred, zero_division=1))

Searching for best hyperparameters ...
Best Hyperparameters: {'n_neighbors': 20}


              precision    recall  f1-score   support

           0       1.00      0.00      0.00         1
           1       1.00      0.00      0.00         7
           2       0.68      0.73      0.71       100
           3       0.54      0.62      0.58        92
           4       0.53      0.33      0.41        27
           5       1.00      0.00      0.00         2

    accuracy                           0.61       229
   macro avg       0.79      0.28      0.28       229
weighted avg       0.62      0.61      0.59       229



In [63]:
model_comparison_df = pd.DataFrame.from_dict(model_comparison).T
model_comparison_df.columns = ['Accuracy', 'F1 Score']
model_comparison_df = model_comparison_df.sort_values('F1 Score')
model_comparison_df.style.background_gradient(cmap='Oranges')

Unnamed: 0,Accuracy,F1 Score
DecisionTreeClassifier,0.576419,0.568388
KNeighborsClassifier,0.606987,0.588714
SVC,0.655022,0.632507


In [64]:
fig = go.Figure(data=[
    go.Bar(name='F1 Score', y=model_comparison_df.index, x=model_comparison_df['F1 Score'], orientation='h'),
    go.Bar(name='Accuracy', y=model_comparison_df.index, x=model_comparison_df['Accuracy'], orientation='h')
])
fig.update_layout(barmode='group')
fig.show()