In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
red = pd.read_csv('wineQualityReds.csv')
white = pd.read_csv('wineQualityWhites.csv')

In [4]:
red['type']= 'red'
white['type'] = 'white'

In [5]:
df = pd.concat([red,white])
df = df.drop(df.columns[0],axis=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 4897
Data columns (total 13 columns):
fixed.acidity           6497 non-null float64
volatile.acidity        6497 non-null float64
citric.acid             6497 non-null float64
residual.sugar          6497 non-null float64
chlorides               6497 non-null float64
free.sulfur.dioxide     6497 non-null float64
total.sulfur.dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6497 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
type                    6497 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 710.6+ KB


In [8]:
df['quality'].describe()

count    6497.000000
mean        5.818378
std         0.873255
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: quality, dtype: float64

### 4 Classification

In order to make the classification more easier to understand, we want to create a new colum called `Review` to have 3 different levels of quality of wine. Since the min of the score is 3, and max of the score is 8, we will do the following spliting:

* Bad : score 3-6
* Good: score 7-9

In [9]:
reviews = []
for i in df['quality']:
    if i >= 3 and i <= 6:
        reviews.append('0')
    elif i >= 7 and i <= 9:
        reviews.append('1')
df['Reviews'] = reviews

In [10]:
df['Reviews'].astype('category')

0       0
1       0
2       0
3       0
4       0
       ..
4893    0
4894    0
4895    0
4896    1
4897    0
Name: Reviews, Length: 6497, dtype: category
Categories (2, object): [0, 1]

In [11]:
from collections import Counter
Counter(df["Reviews"])

Counter({'0': 5220, '1': 1277})

It is clear that most of the wine is **Bad** quality. We will use this `Review` as our response Y to perform classification

Now, lets split the x and y variables

From the previous research, we know that `total sulfur dioxide` is high related to `free sulfur dioxide`. Therefore, to aviod multicollinearity, we just keep `total sulfur dioxide` in our analysis.

In [12]:
x = df.drop(columns=['quality','type','Reviews','free.sulfur.dioxide'],axis=1)
x.head(10)

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,34.0,0.9978,3.51,0.56,9.4
5,7.4,0.66,0.0,1.8,0.075,40.0,0.9978,3.51,0.56,9.4
6,7.9,0.6,0.06,1.6,0.069,59.0,0.9964,3.3,0.46,9.4
7,7.3,0.65,0.0,1.2,0.065,21.0,0.9946,3.39,0.47,10.0
8,7.8,0.58,0.02,2.0,0.073,18.0,0.9968,3.36,0.57,9.5
9,7.5,0.5,0.36,6.1,0.071,102.0,0.9978,3.35,0.8,10.5


In [13]:
y = df["Reviews"]
y.head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    1
8    1
9    0
Name: Reviews, dtype: object

Lets split the data into training and test data sets.

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [15]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(4547, 10)
(4547,)
(1950, 10)
(1950,)


#### 4.1 Logsitic Regression for Review Category

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [17]:
#Fit logistic Regression
logit_model = LogisticRegression()
logit_model.fit(x_train, y_train)
y_predict = logit_model.predict(x_test)



In [18]:
#print the confusion matrix and accuracy score to see the performance
logit_conf_m = confusion_matrix(y_test, y_predict)
score_logit = logit_model.score(x_test, y_test)

In [19]:
score_logit

0.8153846153846154

In [20]:
logit_conf_m

array([[1503,   68],
       [ 292,   87]], dtype=int64)

Logistic Regression manage to achieve a 82% accuracy level

#### 4.2 SVM for Reveiw Category

In [21]:
from sklearn.svm import SVC

lets use linear kernel first

In [22]:
svc_classifier = SVC(kernel = "linear")
svc_classifier.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [23]:
# use the trained model to do prediction
y_pred_svc = svc_classifier.predict(x_test)

In [24]:
# construct the confusion matrix
svc_conf_m = confusion_matrix(y_test, y_pred_svc)
svc_conf_m 

array([[1571,    0],
       [ 379,    0]], dtype=int64)

In [25]:
# calculate the accuracy score
score_svc_linear = svc_classifier.score(x_test, y_test)
score_svc_linear

0.8056410256410257

lets use radial basis kernel 

In [26]:
svc_classifier2 = SVC(kernel = "rbf")
svc_classifier2.fit(x_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [27]:
# use the trained model to do prediction
y_pred_svc2 = svc_classifier2.predict(x_test)

In [28]:
# construct the confusion matrix
svc_conf_m2 = confusion_matrix(y_test, y_pred_svc2)
svc_conf_m2 

array([[1536,   35],
       [ 332,   47]], dtype=int64)

In [29]:
# calculate the accuracy score
score_svc_rbf = svc_classifier2.score(x_test, y_test)
score_svc_rbf 

0.8117948717948718

#### 4.3 Decision Tree for Reveiw Category

#### 4.4 Random Forest for Reveiw Category

#### 4.5 Summary of Classification for Reveiw Category

In [30]:
print(f"The accuracy score for logistic regression is {score_logit}")
print(f"The accuracy score for linear SVM is {score_svc_linear}")
print(f"The accuracy score for radial basis SVM regression is {score_svc_rbf}")

The accuracy score for logistic regression is 0.8153846153846154
The accuracy score for linear SVM is 0.8056410256410257
The accuracy score for radial basis SVM regression is 0.8117948717948718


#### 4.6 Logistic Regression for Wine Type

Since there are two main types of the wine of our dataset: red wine and white wine. We can also perform classification regard the `type` (as our response Y)

In [31]:
# re-split the data for further analysis
x1 = x = df.drop(columns=['quality','type','Reviews','free.sulfur.dioxide'],axis=1)
y1 = df["type"]

In [32]:
Counter(df["type"])

Counter({'red': 1599, 'white': 4898})

In [37]:
from sklearn.model_selection import train_test_split
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size = 0.3, random_state = 42)

In [38]:
# fit a new logistic regression for type
logit_model1 = LogisticRegression()
logit_model1.fit(x1_train, y1_train)
y1_predict = logit_model1.predict(x1_test)



In [39]:
#print the confusion matrix and accuracy score to see the performance
logit_conf_m1 = confusion_matrix(y1_test, y1_predict)
score_logit1 = logit_model1.score(x1_test, y1_test)

In [40]:
print(logit_conf_m1)
print(score_logit1)

[[ 473   29]
 [  17 1431]]
0.9764102564102564


98% accuracy level! Logistic regression did a very decent performance to classify two type of wine for the test dataset.

#### 4.7 SVM for Wine Type

In [41]:
# fit linear kernel SVM first 
svc_classifier = SVC(kernel = "linear")
svc_classifier.fit(x1_train, y1_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [42]:
y3_pred_svc = svc_classifier.predict(x1_test)

In [43]:
svc_conf_m3 = confusion_matrix(y1_test, y3_pred_svc)
score_svc3 = svc_classifier.score(x1_test, y1_test)

In [44]:
print(svc_conf_m3)
print(score_svc3)

[[ 480   22]
 [  12 1436]]
0.9825641025641025


fit a radial basis kernel SVM 

In [50]:
svc_classifier2 = SVC(kernel = "rbf")
svc_classifier2.fit(x1_train, y1_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [51]:
y4_pred_svc = svc_classifier2.predict(x1_test)

In [52]:
svc_conf_m4 = confusion_matrix(y1_test, y4_pred_svc)
score_svc4 = svc_classifier.score(x1_test, y1_test)

In [53]:
print(svc_conf_m4)
print(score_svc4)

[[ 435   67]
 [  34 1414]]
0.9825641025641025


#### 4.8 Decision Tree for Wine Type

#### 4.9 Random Forest for Wine Type

#### 4.10 Summary of Classification for Wine Type

In [54]:
print(f"The accuracy score for logistic regression is {score_logit1}")
print(f"The accuracy score for linear SVM is {score_svc3}")
print(f"The accuracy score for radial basis SVM regression is {score_svc4}")

The accuracy score for logistic regression is 0.9764102564102564
The accuracy score for linear SVM is 0.9825641025641025
The accuracy score for radial basis SVM regression is 0.9825641025641025
