In [1]:
import requests
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
http_request = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv')

In [3]:
http_text = http_request.text.split("\n")

In [4]:
data_array = []
for lines in http_text:
    data_array.append(lines.split(";"))
col_names = []
for col in data_array[0]:
    col_names.append(col.strip('"'))

data_frame = pd.DataFrame(data=data_array[1:],columns=col_names).apply(pd.to_numeric)

In [5]:
data_frame

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5.0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6.0
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7.0
4897,6.0,0.21,0.38,0.8,0.020,22.0,98.0,0.98941,3.26,0.32,11.8,6.0


In [6]:
numrecs = data_frame.shape[0]
data_frame = data_frame.ix[0:numrecs-2,:]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  retval = getattr(retval, self.name)._getitem_axis(key, axis=i)


In [7]:
# Let us do a little pre-processing
label = data_frame["quality"]
predictors = data_frame[col_names[:-1]]

scaler_model = MinMaxScaler().fit_transform(predictors.values)
bins = np.linspace(0, 1, 10)
digitized = np.digitize(scaler_model, bins)


In [8]:
clf = svm.SVC()
clf.fit(X=digitized,y=label.values)  



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [9]:
score_knn = cross_val_score(clf, digitized, label.values, cv=4)
print("Cross Validation score : " + str(score_knn))
print("Cross Validation Mean score : " + str(score_knn.mean()))



Cross Validation score : [0.48655257 0.50938776 0.53513072 0.5294599 ]
Cross Validation Mean score : 0.5151327357734451


In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix(label.values, clf.predict(X=digitized))

array([[   2,    0,    6,   12,    0,    0,    0],
       [   0,   11,   89,   61,    2,    0,    0],
       [   0,    0,  869,  582,    6,    0,    0],
       [   0,    1,  316, 1785,   96,    0,    0],
       [   0,    0,   36,  574,  270,    0,    0],
       [   0,    0,    7,  120,   48,    0,    0],
       [   0,    0,    0,    2,    3,    0,    0]], dtype=int64)

In [11]:
# Let's try different kernel
clf = svm.SVC(kernel='poly')
clf.fit(X=digitized,y=label.values)  



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='poly', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [12]:
score_knn = cross_val_score(clf, digitized, label.values, cv=5)
print("Cross Validation score : " + str(score_knn))
print("Cross Validation Mean score : " + str(score_knn.mean()))



Cross Validation score : [0.47604485 0.48216106 0.53163265 0.48977505 0.54601227]
Cross Validation Mean score : 0.5051251772917944


In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix(label.values, clf.predict(X=digitized))

array([[  10,    0,    3,    7,    0,    0,    0],
       [   1,   31,   66,   63,    2,    0,    0],
       [   0,    2,  802,  648,    5,    0,    0],
       [   4,    1,  325, 1766,  102,    0,    0],
       [   0,    0,   35,  606,  238,    1,    0],
       [   0,    0,    1,  104,   43,   27,    0],
       [   0,    0,    0,    2,    3,    0,    0]], dtype=int64)

In [None]:
# Please vary and check for Accuracy and Confusion Matrix by varying (C, kernel, gamma)