# Importing required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook
import seaborn as sns

from sklearn import datasets #for loading the dataset
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier #KNN-algorithm model
from sklearn.metrics import confusion_matrix

# Loading the Dataset and Data Preprocessing

In [2]:
# Load scikit-learn's bundled wine dataset; later cells read its
# 'data', 'feature_names' and 'target' entries.
wine = datasets.load_wine()

In [3]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the feature names.
df = pd.DataFrame(
    data=wine['data'],
    columns=wine['feature_names'],
)
df


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [6]:
# Attach the class labels to the feature frame as a 'target' column (in place),
# then write a CSV snapshot to the current working directory.
# No bare `df` in the middle of the cell: only a cell's last expression is
# displayed, so such a line would be a no-op.
df['target'] = wine['target']
df.to_csv('file1.csv')  # default options: the row index is written as the first column

In [5]:
# Count missing values per column (isna is the method that isnull aliases).
df.isna().sum()

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
target                          0
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column,
# including the integer 'target' column.
df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,0.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,0.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,2.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0


# Splitting the DataFrame : 80% (X_train, Y_train) 20% (X_test, Y_test)

In [7]:
# Splitting into Features(X) and Target(Y)
# X is built as a new frame instead of aliasing df and popping from it:
# popping mutates df in place, so re-running the cell would raise KeyError
# and df would silently lose its 'target' column. With drop(), the cell is
# idempotent and df stays intact.
X = df.drop(columns='target')
Y = df['target']

In [8]:
# Report the dimensions of the feature matrix and the label vector.
for label, obj in (("X - Shape : ", X), ("Y - shape : ", Y)):
    print(label, obj.shape)

X - Shape :  (178, 13)
Y - shape :  (178,)


In [9]:
# Distinct class labels present in Y.
Y.unique()

array([0, 1, 2])

### Splitting into Train(80%) Test(20%) - After tuning for best fit

In [23]:
# Hold out 20% of the rows for testing and keep 80% for training;
# the fixed random_state makes the shuffle reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=55,
)

In [24]:
# Report the dimensions of each piece of the train/test split.
for label, obj in (
    ("X_train - Shape : ", X_train),
    ("Y_train - shape : ", Y_train),
    ("X_test - Shape : ", X_test),
    ("Y_test - shape : ", Y_test),
):
    print(label, obj.shape)

X_train - Shape :  (142, 13)
Y_train - shape :  (142,)
X_test - Shape :  (36, 13)
Y_test - shape :  (36,)


# Training the Model by KNN

### Tuning model sensitivity : n_neighbors

In [25]:
k_range = range(1, 30)
scores = []

# For every n_neighbors in 1..29 (range(1, 30) excludes 30), record the mean
# 5-fold cross-validated accuracy computed on the training split only.
# The hyper-parameter must not be chosen by scoring on (X_test, Y_test): that
# test set is used afterwards to report the final accuracy, and selecting k on
# it would make the reported number optimistically biased.
for k in k_range:
    candidate = KNeighborsClassifier(n_neighbors=k)
    fold_accuracies = cross_val_score(candidate, X_train, Y_train, cv=5)
    scores.append(fold_accuracies.mean())

In [26]:
# Scatter the recorded accuracy against each candidate neighbour count.
fig, ax = plt.subplots()
ax.scatter(k_range, scores)
ax.set_xlabel('K_count')
ax.set_ylabel('Model_accuracy')
ax.grid()
plt.show()

<IPython.core.display.Javascript object>

### Training the KNN Model

In [27]:
# Fit the final 3-neighbour classifier on the training split; fit() returns the
# estimator itself, so construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
knn

KNeighborsClassifier(n_neighbors=3)

In [28]:
# Accuracy of the trained classifier on the held-out test split.
knn.score(X_test, Y_test)

0.8333333333333334

## Saving the Trained Model

In [30]:
import pickle

filename = 'Wine Classification Model_KNN.train'

# Serialise the fitted classifier. The context manager guarantees the file
# handle is flushed and closed even if pickle.dump raises; a bare open() passed
# straight to dump() leaves the handle open until garbage collection.
with open(filename, 'wb') as model_file:
    pickle.dump(knn, model_file)

# Loading the saved model (only unpickle files you trust: pickle.load can
# execute arbitrary code):
# with open('Wine Classification Model_KNN.train', 'rb') as model_file:
#     loadMODEL = pickle.load(model_file)

## Making Predictions Using the Model

In [31]:
# Predict a class label for every row of the held-out test features;
# the confusion-matrix cell reads this array by name.
Predictions = knn.predict(X_test)
Predictions

array([0, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 2, 2, 0, 1, 2, 0, 0, 2, 0, 1, 2,
       0, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 1, 2, 2])

In [33]:
# Cross-tabulate the true test labels against the predicted ones.
cm = confusion_matrix(y_true=Y_test, y_pred=Predictions)
cm

array([[ 8,  0,  0],
       [ 0, 13,  2],
       [ 2,  2,  9]], dtype=int64)

In [35]:
# Render the confusion matrix as an annotated heatmap on an explicit 8x8 inch axes.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(cm, annot=True, ax=ax)
ax.set_title('Confusion_Matrix')
ax.set_ylabel('Truth')
ax.set_xlabel('Prediction')

<IPython.core.display.Javascript object>

Text(0.5, 58.7222222222222, 'Prediction')