In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
red = pd.read_csv('wineQualityReds.csv')
white = pd.read_csv('wineQualityWhites.csv')

In [4]:
red['type']= 'red'
white['type'] = 'white'

In [7]:
df = pd.concat([red,white])
df = df.drop(df.columns[0],axis=1)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 4897
Data columns (total 13 columns):
fixed.acidity           6497 non-null float64
volatile.acidity        6497 non-null float64
citric.acid             6497 non-null float64
residual.sugar          6497 non-null float64
chlorides               6497 non-null float64
free.sulfur.dioxide     6497 non-null float64
total.sulfur.dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6497 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
type                    6497 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 710.6+ KB


### 4 Classification

In order to make the classification more easier to understand, we want to create a new colum called `Review` to have 3 different levels of quality of wine.

* Bad : score 0,1,2,3
* Medium: score 4,5,6,7
* Good: score 8,9,10

In [9]:
reviews = []
for i in df['quality']:
    if i >= 0 and i <= 3:
        reviews.append('1')
    elif i >= 4 and i <= 7:
        reviews.append('2')
    elif i >= 8 and i <= 10:
        reviews.append('3')
df['Reviews'] = reviews

In [10]:
from collections import Counter
Counter(df["Reviews"])

Counter({'2': 6269, '3': 198, '1': 30})

It is clear that most of the wine is Medium quality. We will use this `Review` as our response Y to perform classification

Now, lets split the x and y variables

From the previous research, we know that `total sulfur dioxide` is high related to `free sulfur dioxide`. Therefore, to aviod multicollinearity, we just keep `total sulfur dioxide` in our analysis.

In [16]:
x = df.drop(columns=['quality','type','Reviews','free.sulfur.dioxide'],axis=1)
x.head(10)

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,34.0,0.9978,3.51,0.56,9.4
5,7.4,0.66,0.0,1.8,0.075,40.0,0.9978,3.51,0.56,9.4
6,7.9,0.6,0.06,1.6,0.069,59.0,0.9964,3.3,0.46,9.4
7,7.3,0.65,0.0,1.2,0.065,21.0,0.9946,3.39,0.47,10.0
8,7.8,0.58,0.02,2.0,0.073,18.0,0.9968,3.36,0.57,9.5
9,7.5,0.5,0.36,6.1,0.071,102.0,0.9978,3.35,0.8,10.5


In [14]:
y = df["Reviews"]
y.head(10)

0    2
1    2
2    2
3    2
4    2
5    2
6    2
7    2
8    2
9    2
Name: Reviews, dtype: object

Lets split the data into training and test data sets.

In [29]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [30]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(4547, 10)
(4547,)
(1950, 10)
(1950,)


#### 4.1 Logsitic Regression for Review Category

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [36]:
#Fit logistic Regression
logit_model = LogisticRegression()
logit_model.fit(x_train, y_train)
y_predict = logit_model.predict(x_test)



In [37]:
#print the confusion matrix and accuracy score to see the performance
logit_conf_m = confusion_matrix(y_test, y_predict)
score_logit = logit_model.score(x_test, y_test)

In [38]:
score_logit

0.9676923076923077

In [39]:
logit_conf_m

array([[   0,    6,    0],
       [   0, 1887,    0],
       [   0,   57,    0]], dtype=int64)

#### 4.2 SVM

In [40]:
from sklearn.svm import SVC

lets use linear kernel first

In [41]:
svc_classifier = SVC(kernel = "linear")
svc_classifier.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [42]:
y_pred_svc = svc_classifier.predict(x_test)

In [43]:
svc_conf_m = confusion_matrix(y_test, y_pred_svc)
svc_conf_m 

array([[   0,    6,    0],
       [   0, 1887,    0],
       [   0,   57,    0]], dtype=int64)

In [45]:
score_svc = svc_classifier.score(x_test, y_test)
score_svc

0.9676923076923077

lets use radial basis kernel 

In [47]:
svc_classifier2 = SVC(kernel = "rbf")
svc_classifier2.fit(x_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [48]:
y_pred_svc2 = svc_classifier2.predict(x_test)

In [49]:
svc_conf_m2 = confusion_matrix(y_test, y_pred_svc2)
svc_conf_m2 

array([[   0,    6,    0],
       [   0, 1886,    1],
       [   0,   55,    2]], dtype=int64)

In [50]:
score_svc = svc_classifier2.score(x_test, y_test)
score_svc

0.9682051282051282