<a href="https://colab.research.google.com/github/basangoudapatil/Project-1-WineQuality-/blob/main/Project_1(WineQuality).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import the necessary modules
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('/content/winequalityN.csv')
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [None]:
# Perform basic data engineering on the dataset
df.isnull().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

In [None]:
# impute the null values with any central tendencies
for col in df.columns:
  if df[col].isnull().sum()>0:
    df[col] = df[col].fillna(df[col].median())

In [None]:
df.isnull().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [None]:
# check if all the data is in correct format
df.dtypes

type                     object
fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [None]:
# Types is my target variable, as i am trying to classify based on other featue variables, encode it to proper format
df['type'].value_counts()

white    4898
red      1599
Name: type, dtype: int64

In [None]:
# In the type column only two variables are present, so label encoding is appropraite
df['type'] = df['type'].map({'white': 0, 'red': 1})
df['type'].head()

0    0
1    0
2    0
3    0
4    0
Name: type, dtype: int64

In [None]:
df.head()
df.describe()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,0.246114,7.216246,0.33963,0.318718,5.443574,0.056039,30.525319,115.744574,0.994697,3.218384,0.531202,10.491801,5.818378
std,0.430779,1.295779,0.164557,0.145231,4.757585,0.035031,17.7494,56.521855,0.002999,0.160637,0.148769,1.192712,0.873255
min,0.0,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,0.0,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,0.0,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,0.0,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,1.0,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


In [None]:
df['quality'].value_counts()

6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64

In [None]:
#Encode the values using appropriate method
df['quality1'] = [0 if i<5 else 1 for i in df.quality]

In [None]:
df.columns

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality', 'quality1'],
      dtype='object')

**Prepare the data for Test and Train**

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop(['quality1','quality'], axis=1)
y = df['quality1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, y_train.shape

((4547, 12), (4547,))

In [None]:
#As this is a classification probelm, performing scaling is a good practise
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train

array([[-0.57235587, -0.86093724, -1.02571959, ..., -0.36686969,
        -1.07790647,  0.50098461],
       [-0.57235587, -0.17021265, -1.14598873, ..., -1.60916673,
        -0.94609422, -1.2562058 ],
       [-0.57235587, -0.55394853, -0.90545046, ...,  0.87542736,
         0.04249764,  0.24995741],
       ...,
       [ 1.74716475,  4.74160669, -0.4243739 , ..., -0.36686969,
         0.89927724, -0.67047566],
       [ 1.74716475,  1.28798372,  0.3573755 , ...,  0.62696795,
         2.41511809,  2.42552649],
       [-0.57235587,  0.6740063 , -0.4243739 , ...,  0.00581943,
         0.63565275, -0.08474553]])

**Modelling of the Task**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

clf1 = LogisticRegression()
clf2 = SVC(kernel='rbf')

In [None]:
# For logisticregression
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)

df1 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df1.head()

Unnamed: 0,Actual,Predicted
3103,1,1
1419,1,1
4761,1,1
4690,1,1
4032,1,1


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
print('The accuracy of the model: ', accuracy_score(y_test, y_pred))
print('The AUROC score of the model: ', roc_auc_score(y_test, y_pred))

The accuracy of the model:  0.9651282051282051
The AUROC score of the model:  0.5142269375678986


In [None]:
# For SVC
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)

df2 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df2.head()

Unnamed: 0,Actual,Predicted
3103,1,1
1419,1,1
4761,1,1
4690,1,1
4032,1,1


In [None]:
print('The accuracy of the model: ', accuracy_score(y_test, y_pred))
print('The AUROC score of the model: ', roc_auc_score(y_test, y_pred))

The accuracy of the model:  0.9641025641025641
The AUROC score of the model:  0.49973418394471025


In [None]:
# For KNN Classifier
clf3 = KNeighborsClassifier(3)
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
y_pred[0:5]

df3 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df3.head()

Unnamed: 0,Actual,Predicted
3103,1,1
1419,1,1
4761,1,1
4690,1,1
4032,1,1


In [None]:
# To find the best value of k neighbours of classification we use train score and cross-val score

from sklearn.model_selection import cross_val_score
for k in [1,2,3,4,5,6,7,8,9,10]:
  clf3 = KNeighborsClassifier(k)
  clf3.fit(X_train, y_train)
  print("K value  : " , k, " train score : ", clf3.score(X_train,y_train) , " score : ", np.mean(cross_val_score(clf3, X_train, y_train, cv=10))) #predicting using the model

K value  :  1  train score :  1.0  score :  0.9491997869971438
K value  :  2  train score :  0.9780074774576644  score :  0.9296286972938956
K value  :  3  train score :  0.9714097206949637  score :  0.9595342983008182
K value  :  4  train score :  0.9694303936661535  score :  0.9549150409062304
K value  :  5  train score :  0.964811963932263  score :  0.96063416759452
K value  :  6  train score :  0.9645920387068396  score :  0.9586532410320956
K value  :  7  train score :  0.9639322630305696  score :  0.9606346516919204
K value  :  8  train score :  0.9641521882559929  score :  0.9601946071549596
K value  :  9  train score :  0.9628326369034528  score :  0.9604143873747398
K value  :  10  train score :  0.9628326369034528  score :  0.959974342837779


In [None]:
# From the above results K=5 and K=7 are giving the highest cross_eval score 
# therefore the model has 2 values for classification
# Finding the scores of the model with k = 5

clf3 = KNeighborsClassifier(5)
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
y_pred[0:5]

print('The accuracy of the model: ', accuracy_score(y_test, y_pred))
print('The AUROC score of the model: ', roc_auc_score(y_test, y_pred))

The accuracy of the model:  0.9620512820512821
The AUROC score of the model:  0.5335737235050736


In [None]:
#K=7
clf3 = KNeighborsClassifier(7)
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)

print('The accuracy of the model: ', accuracy_score(y_test, y_pred))
print('The AUROC score of the model: ', roc_auc_score(y_test, y_pred))

The accuracy of the model:  0.9651282051282051
The AUROC score of the model:  0.5142269375678986


In [None]:
# Applying Decision Tree Model to the above problem
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()

dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
y_pred[0:5]

array([1, 1, 1, 1, 1])

In [None]:
# Checking the best value of max-depth to reduceoveritting and underfitting
for depth in [1,2,3,4,5,6,7,8,9,10]:
  dt = DecisionTreeClassifier(max_depth = depth)
  dt.fit(X_train, y_train)

  trainaccuracy = accuracy_score(y_train, dt.predict(X_train))

  dt = DecisionTreeClassifier(max_depth = depth)
  valAccuracy = cross_val_score(dt, X, y, cv =10)
  print('Depth: ', depth, 'Training Accuracy: ', trainaccuracy, 'Cross Val Score: ', np.mean(valAccuracy))

Depth:  1 Training Accuracy:  0.9610732351000659 Cross Val Score:  0.9621367784757616
Depth:  2 Training Accuracy:  0.9628326369034528 Cross Val Score:  0.9621370155268458
Depth:  3 Training Accuracy:  0.9630525621288761 Cross Val Score:  0.9599789024534786
Depth:  4 Training Accuracy:  0.9643721134814163 Cross Val Score:  0.9582858836079173
Depth:  5 Training Accuracy:  0.9652518143831097 Cross Val Score:  0.9542813796373117
Depth:  6 Training Accuracy:  0.9663514405102265 Cross Val Score:  0.9528967642526964
Depth:  7 Training Accuracy:  0.9698702441170002 Cross Val Score:  0.9493528505392913
Depth:  8 Training Accuracy:  0.9736089729491972 Cross Val Score:  0.946583382718976
Depth:  9 Training Accuracy:  0.9793270288102045 Cross Val Score:  0.9438098850302241
Depth:  10 Training Accuracy:  0.9817462062898614 Cross Val Score:  0.9418117814389001


In [None]:
# From the above results depth=1 and depth=2 are giving the highest cross_eval score 
# therefore the model has 2 values for classification
# Finding the scores of the model

dt = DecisionTreeClassifier(max_depth=1)

dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print('The accuracy of the model: ', accuracy_score(y_test, y_pred))
print('The AUROC score of the model: ', roc_auc_score(y_test, y_pred))

The accuracy of the model:  0.9646153846153847
The AUROC score of the model:  0.5


In [None]:
# max_depth = 2
dt = DecisionTreeClassifier(max_depth = 2)

dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print('The accuracy of the model: ', accuracy_score(y_test, y_pred))
print('The AUROC score of the model: ', roc_auc_score(y_test, y_pred))

The accuracy of the model:  0.9641025641025641
The AUROC score of the model:  0.5067147447010146


From the above results of AUROC score, it is evident that KNN is the optimum model for WINE Prediction.

**Feature Importances**

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from matplotlib import pyplot
dt = DecisionTreeClassifier(max_depth = 3)
# Fit dt to the training set
dt.fit(X_train, y_train)
importance = dt.feature_importances_
#pyplot.bar([x for x in range(len(importance))], importance)
list(zip(importance,X_test.columns)) # it calculates the feature importances based on IG

[(0.2592760229427976, 'type'),
 (0.0, 'fixed acidity'),
 (0.33912712256675825, 'volatile acidity'),
 (0.0, 'citric acid'),
 (0.15042501062145291, 'residual sugar'),
 (0.0, 'chlorides'),
 (0.198758885553971, 'free sulfur dioxide'),
 (0.0, 'total sulfur dioxide'),
 (0.05241295831502032, 'density'),
 (0.0, 'pH'),
 (0.0, 'sulphates'),
 (0.0, 'alcohol')]

The Important features are: type, volatile acidity, residual sugar, free sulphuric acid