In [None]:
import pandas as pd
import numpy as np

# Load the red and white wine-quality observations and stack them into a
# single frame, tagging each row with a numeric `color` flag (1 = red, 0 = white).
#
# The data directory is defined once so the notebook can be pointed at another
# location by editing a single line.
# NOTE(review): this is still an absolute, machine-specific path - consider
# replacing it with a path relative to the notebook.
DATA_DIR = "C:/Users/ag4488/Documents/Python/svm-classification-with-pytorch/data"

# read red wine set of observations
data_red = pd.read_csv(f"{DATA_DIR}/winequality-red.csv", sep=',')
data_red['color'] = 1 #redwine

print(data_red.shape)

# read white wine set of observations
data_white = pd.read_csv(f"{DATA_DIR}/winequality-white.csv", sep=',')
data_white['color'] = 0 #whitewine

print(data_white.shape)

# Stack the two sets in one. `pd.concat` is used instead of an outer `merge`:
# the goal is to append rows, not to join on every shared column.
# `ignore_index=True` gives the combined frame a fresh, unique 0..n-1 index.
# Note: rows keep their file order (red first, then white) rather than the
# key-sorted order an outer join produces, so a seeded random split downstream
# will pick different rows than before.
data = pd.concat([data_red, data_white], ignore_index=True)
fields = list(data.columns)
print(fields)

In [None]:
# Tally how many observations fall into each quality level
quality_counts = data["quality"].value_counts()

print("All 'quality level' counts")
print(quality_counts)

In [None]:
# Based on the quality counts shown above, only the well-populated ratings are
# kept; the sparsely populated ones (3, 4, 8 and 9) are discarded.
QUALITY_LEVELS_KEPT = [5, 6, 7]

# One boolean mask replaces four separate drop-by-index passes. It selects rows
# by value, so it does not rely on the index labels being unique, and it
# guarantees that only the listed ratings remain.
data = data[data["quality"].isin(QUALITY_LEVELS_KEPT)]

# show the counts of selected quality levels
print("Selected 'quality level' counts")
print(data["quality"].value_counts())

In [None]:
# Separate the target from the predictors:
#   y    -> the quality rating (actuals)
#   data -> every remaining column, i.e. color + features (observations)
y = data['quality']
data = data.drop('quality', axis=1)

# every remaining column is a candidate feature
fields = data.columns.tolist()
X = data.loc[:, fields]
print(fields)

A Pearson correlation was used to identify which features correlate with wine quality. It looks as if the higher the alcohol content, the higher the quality. Lower density and lower volatile acidity also correlate with better quality, as seen in the pairwise correlation chart below. Only the top 5 correlated features were carried over to the SVM models.

In [None]:
# Number of features carried forward to the model.
N_TOP_FEATURES = 5

# Pearson correlation of every candidate column with the quality rating,
# sorted from most negative to most positive.
# `data` is correlated directly (instead of `data[fields]`) so that re-running
# this cell after `fields` has been narrowed below still covers every column.
correlations = data.corrwith(y).sort_values()

# Retain the features with the largest absolute correlation to wine quality.
# The result is kept in ascending order of |correlation|, which fixes the
# column order of the feature matrix built from `fields`.
fields = correlations.abs().sort_values().iloc[-N_TOP_FEATURES:].index
print(fields) # the N_TOP_FEATURES features with the highest absolute correlation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the Pearson correlation between each feature and wine quality.
# (Author's reading of the chart: alcohol and density are the most correlated
# with quality.)
ax = correlations.plot.bar()

# correlations are bounded by [-1, 1], so pin the y-axis to that range
ax.set(
    ylim=[-1, 1],
    ylabel='pearson correlation',
)

We will now run the K-Nearest Neighbours (KNN) algorithm to create a "prediction model".
KNN relies on distances between observations, so it works best when features are scaled. If a model is sensitive to magnitudes, it's generally a good idea to scale the features so that one feature doesn't get more influence than another (in terms of scale).

In [None]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Scale the selected features.
#   DO_STANDARDSCALER = True  -> StandardScaler
#   DO_STANDARDSCALER = False -> MinMaxScaler
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling - consider fitting it on the
# training split only.
DO_STANDARDSCALER = True
X = data[fields]

if DO_STANDARDSCALER:
    scaler = StandardScaler().fit(X)
    # fitted parameters of the standard scaler
    print(scaler.mean_)
    print(scaler.scale_)
else:
    scaler = MinMaxScaler().fit(X)
    # fitted parameters of the min-max scaler
    print(scaler.data_max_)
    print(scaler.data_min_)

# The transform step is shared by both scalers. The original row index is
# carried over so that the scaled features stay label-aligned with `y`
# (the transform itself returns a bare array with no index).
X = pd.DataFrame(
    scaler.transform(X),
    columns=['%s_scaled' % fld for fld in fields],
    index=data.index,
)
print(X.columns) #scaled columns

In [None]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Hold out 30% of the observations for testing and train on the remaining 70%
# (the usual ratio); the fixed random_state keeps the split repeatable.
split_options = {"test_size": 0.3, "random_state": 10}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate KNN learning model (k=15)
N_NEIGHBORS = 15
knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)

# Fit the model on the training split
knn.fit(X_train, y_train)

# predict the wine rankings for the test data set
y_pred = knn.predict(X_test)

# Header row. The feature names are taken from the test frame itself so the
# listing stays correct if a different set of features is selected.
# (The previous `print(print(...))` also emitted a stray "None" line.)
feature_columns = list(X_test.columns)
print('predict\tactual\t' + ', '.join(feature_columns))

# One line per test observation: prediction, actual rating, scaled features.
# Iterating the three sequences in lock-step avoids rebuilding
# `list(y_pred)` / `list(y_test)` on every row.
for predicted, actual, features in zip(y_pred, y_test, X_test.to_numpy()):
    print(f"{predicted}\t{actual}\t" + ", ".join(str(value) for value in features))

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import label_binarize

# Evaluate the KNN model on the held-out test split.

# Predictions are computed up front so every metric below is based on the
# same, freshly computed `y_pred`.
y_pred = knn.predict(X_test)

# Precision, recall, f-score from the multi-class support function
precision, recall, fscore, _ = score(y_test, y_pred, average='weighted')

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)

# One-column summary table (column name 'Model', one row per metric)
metrics = pd.Series({'precision':precision, 'recall':recall,
                     'fscore':fscore, 'accuracy':accuracy},
                    name='Model').to_frame()

print(metrics)

print(y_pred)

# Last, the confusion matrix.
# The label set is made explicit (sorted union of actual and predicted
# ratings) so the same labels can be used for the heatmap ticks.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Normalise each row by the number of actual observations of that class.
# A rating that only occurs among the predictions has an all-zero row; it is
# left at 0 instead of producing NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm = np.divide(cm, row_totals,
               out=np.zeros(cm.shape, dtype=float),
               where=row_totals != 0)

ax = sns.heatmap(cm, annot=True, fmt='.2g',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='predicted quality', ylabel='actual quality')
plt.title('Confusion matrix of the KNN classifier (use Correlation)')
plt.tight_layout()

In [None]:
# Cross-check of the prediction accuracy using the classifier's own
# built-in `score` method
mean_accuracy = knn.score(X=X_test, y=y_test)
print(mean_accuracy)

We will now try to get the quality prediction for a new wine described by the following 5 parameters:
'color', 'chlorides', 'volatile acidity', 'density', 'alcohol'

Examples:
    red wine:          [1,0.114,0.78,0.9545,8.5]
    white wine:        [0,0.019,0.23,0.9745,9.2]

In [None]:
# Raw (unscaled) measurements for two new wines, one row per wine, in the
# column order given by the labels below.
X0 = [[1,0.114,0.78,0.9545,7],[0,0.032,0.23,0.9945,16.2]]
dfX = pd.DataFrame(
    X0,
    columns=['color', 'chlorides', 'volatile acidity', 'density', 'alcohol'],
    index=['red_wine_0','white_wine_0'],
)

print("===Wines (new data)=====================================================")
print(dfX)
print("\n===Predicted Quality====================================================")

inxs = dfX.index

# Scale with the scaler fitted earlier. Selecting `fields` puts the columns in
# the order the scaler was fitted with (and fails loudly if one is missing).
# The scaled values are wrapped in a frame carrying the column names the
# classifier was trained on, and kept under a separate name so `dfX` still
# holds the raw input.
dfX_scaled = pd.DataFrame(
    scaler.transform(dfX[list(fields)]),
    columns=X_train.columns,
    index=inxs,
)

y0_pred = knn.predict(dfX_scaled)
print("Predicted quality:")
# one line per wine, however many rows X0 contains
for wine_name, predicted_quality in zip(inxs, y0_pred):
    print(f"{wine_name}:\t{predicted_quality}")