### Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score,f1_score 

### Loading the data set

In [2]:
df=pd.read_csv("D:\\python_datascience\\machine_learning\\decision_tree\\Decision Trees \\Decision Trees\\winequality_red.csv")
df.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


The data set consists following Input variables :
1 - fixed acidity 
2 - volatile acidity  
3 - citric acid  
4 - residual sugar 
5 - chlorides  
6 - free sulfur dioxide
7 - total sulfur dioxide 
8 - density  
9 - pH   
10 - sulphates   
11 - alcohol

and the Output variable/target gives the quality of th wine based on the input variables: 

12 - quality (score between 0 and 10)

### checking shape of data 

In [3]:
df.shape

(1599, 12)

### Basic information of data 

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


#### Checking data types of columns 

In [5]:
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

all are numeric

#### Checking missing values 

In [6]:
df.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

the data has no missing values 

#### Seggregating features and target

In [7]:
x=df.drop(columns=["quality"])
y=df["quality"]

### do scaling of features 

In [8]:
from sklearn.preprocessing import StandardScaler
scaler= StandardScaler()
new_data=scaler.fit_transform(x)

In [9]:
new_data

array([[-0.52835961,  0.96187667, -1.39147228, ...,  1.28864292,
        -0.57920652, -0.96024611],
       [-0.29854743,  1.96744245, -1.39147228, ..., -0.7199333 ,
         0.1289504 , -0.58477711],
       [-0.29854743,  1.29706527, -1.18607043, ..., -0.33117661,
        -0.04808883, -0.58477711],
       ...,
       [-1.1603431 , -0.09955388, -0.72391627, ...,  0.70550789,
         0.54204194,  0.54162988],
       [-1.39015528,  0.65462046, -0.77526673, ...,  1.6773996 ,
         0.30598963, -0.20930812],
       [-1.33270223, -1.21684919,  1.02199944, ...,  0.51112954,
         0.01092425,  0.54162988]])

#### Checking class imbalance

In [10]:
df["quality"].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

here ther is class imbalance so handle it using smote  : oversampling 

In [11]:
from imblearn.over_sampling import SMOTE

In [12]:
## oversampling 
smote=SMOTE()
x_smote,y_smote=smote.fit_resample(new_data,y)


In [13]:
## checking new count of all classes records
from collections import Counter
Counter(y_smote)

Counter({5: 681, 6: 681, 7: 681, 4: 681, 8: 681, 3: 681})

In [14]:
## checking shape of data after over sampling 
print(x_smote.shape)
print(y_smote.shape)

(4086, 11)
(4086,)


### Now Dividing  this smote data(over sampled data)  into training and test data

In [15]:
x_train,x_test,y_train,y_test = train_test_split(x_smote,y_smote,test_size = 0.25, random_state= 42)

#### Getting shape of train and test data

In [16]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(3064, 11)
(3064,)
(1022, 11)
(1022,)


### SVM Classifier 

In [17]:
model=SVC() # taking default parameters
model.fit(x_train,y_train) ## training of model

SVC()

#### Predictions 

In [18]:
y_pred=model.predict(x_test)

### Model Evaluation

#### 1. Accuracy 

In [19]:
acc=accuracy_score(y_test,y_pred)
acc

0.7573385518590998

#### 2. Confusion Matrix

In [20]:
c=confusion_matrix(y_test,y_pred)
c

array([[165,   0,   0,   0,   0,   0],
       [ 13, 142,   9,   1,   0,   0],
       [ 12,  24, 115,  24,   9,   1],
       [  6,  18,  48,  77,  32,  12],
       [  0,   4,   4,  13, 108,  18],
       [  0,   0,   0,   0,   0, 167]], dtype=int64)

#### 3. F1 Score

In [21]:
f1=f1_score(y_test,y_pred,average="weighted")
f1

0.7417149167537872

As observed, the accuracy of the model is quite low. We need to implement the grid search approach to optimize the parameters to give the best accuracy.

#### Implementing Grid Search

In [22]:
param_grid={'gamma':[0.1,1,10,20,30,40],'C':[1,0.5,0.1,1.5,2,2.5]}

In [23]:
grid= GridSearchCV(SVC(),param_grid, verbose=3, n_jobs=-1)

In [24]:
grid.fit(x_train,y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [1, 0.5, 0.1, 1.5, 2, 2.5],
                         'gamma': [0.1, 1, 10, 20, 30, 40]},
             verbose=3)

In [25]:
grid.best_params_

{'C': 2.5, 'gamma': 1}

###  Now , Fit the model using optimal parameters of c and gamma

In [26]:
model_new=SVC(C=2 ,gamma=1)
model_new.fit(x_train,y_train)

SVC(C=2, gamma=1)

In [27]:
accuracy_score(y_test,model_new.predict(x_test))

0.87573385518591

### Now, our accuracy got increased using hyper parameters tuning of c and gamma we can play with other parameters as well 