In [38]:
from math import sqrt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
from time import time
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [39]:
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
wine_data = datasets.load_wine()

In [40]:
df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)
target=pd.DataFrame(data= wine_data['target'],
                     columns= ['wine_class'])

In [41]:
df_new = pd.concat([df, target], axis=1)

In [42]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
 13  wine_

In [43]:
df_new.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,wine_class
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [44]:
print(wine_data.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [45]:
print(wine_data.target_names)

['class_0' 'class_1' 'class_2']


In [46]:
print(df_new['wine_class'].unique())

[0 1 2]


In [47]:
df_new.shape

(178, 14)

In [48]:
from sklearn.model_selection import train_test_split


(trainData, testData, trainLabels, testLabels) = train_test_split(df, df_new['wine_class'], test_size=0.25) 


In [49]:
trainData.shape

(133, 13)

In [50]:
testData.shape

(45, 13)

In [51]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(trainData)

X_train = scaler.transform(trainData)
X_test = scaler.transform(testData)

In [52]:
X_train[0]

array([ 1.5824983 , -0.60433891,  1.19427289,  1.55833148, -0.07772989,
        0.86731527, -0.62471252,  1.25076709,  2.20475427,  3.27683688,
       -1.60566872, -0.8157889 , -0.26863038])

In [53]:
#Create KNN Classifier for 5
knn5 = KNeighborsClassifier(n_neighbors=10)

#Train the model using the training sets
knn5.fit(X_train, trainLabels)

#Predict the response for train dataset
y_pred = knn5.predict(X_train)

# Mode accuracy
m_acc = metrics.accuracy_score(trainLabels, y_pred)
print("Accuracy train:",m_acc)


#Predict the response for test dataset
y_pred = knn5.predict(X_test)

# Mode accuracy
m_acc = metrics.accuracy_score(testLabels, y_pred)

print("Accuracy test:",m_acc)



Accuracy train: 0.9774436090225563
Accuracy test: 0.9333333333333333


In [54]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()

results1 = model_selection.cross_val_score(nb_model, X_train, 
                                            trainLabels, cv=5,
                                            scoring='accuracy')

#Print cross validation score
print("Accuracy")
print(results1.mean(), results1.std())


nb_model.fit(X_train, trainLabels)
predictions = nb_model.predict(X_test)
print(accuracy_score(testLabels, predictions))

print(confusion_matrix(testLabels, predictions))

 
# show a final classification report demonstrating the accuracy of the classifier
# for each of the digits
print("EVALUATION ON TESTING DATA")
print(classification_report(testLabels, predictions))



Accuracy
0.9772079772079773 0.030631749414736422
1.0
[[16  0  0]
 [ 0 23  0]
 [ 0  0  6]]
EVALUATION ON TESTING DATA
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        23
           2       1.00      1.00      1.00         6

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [55]:
# now, let's take 10% of the training data and use that for validation
(trainData_new, valData, trainLabels_new, valLabels) = train_test_split(X_train, trainLabels,
test_size=0.1, random_state=84)

In [56]:
# initialize the values of k for our k-Nearest Neighbor classifier along with the
# list of accuracies for each value of k
kVals = range(1, 30, 2)
accuracies = []

 
# loop over various values of `k` for the k-Nearest Neighbor classifier
for k in range(1, 30, 2):
# train the k-Nearest Neighbor classifier with the current value of `k`
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(trainData_new, trainLabels_new)
 
    # evaluate the model and update the accuracies list
    score = model.score(valData, valLabels)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    accuracies.append(score)
 

#find the value of k that has the largest accuracy
i = int(np.argmax(accuracies))
print("k=%d achieved highest accuracy of %.2f%% on validation data" % (kVals[i],
    accuracies[i] * 100))

k=1, accuracy=92.86%
k=3, accuracy=100.00%
k=5, accuracy=92.86%
k=7, accuracy=100.00%
k=9, accuracy=100.00%
k=11, accuracy=92.86%
k=13, accuracy=92.86%
k=15, accuracy=92.86%
k=17, accuracy=85.71%
k=19, accuracy=92.86%
k=21, accuracy=85.71%
k=23, accuracy=92.86%
k=25, accuracy=92.86%
k=27, accuracy=92.86%
k=29, accuracy=92.86%
k=3 achieved highest accuracy of 100.00% on validation data


In [57]:
# re-train our classifier using the best k value and predict the labels of the
# test data
from sklearn.metrics import classification_report, confusion_matrix
model = KNeighborsClassifier(n_neighbors=kVals[i])
model.fit(X_train, trainLabels)
predictions = model.predict(X_test)

print(confusion_matrix(testLabels, predictions))
 
# show a final classification report demonstrating the accuracy of the classifier
# for each of the digits
print("EVALUATION ON TESTING DATA")
print(classification_report(testLabels, predictions))

[[16  0  0]
 [ 1 21  1]
 [ 0  0  6]]
EVALUATION ON TESTING DATA
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      0.91      0.95        23
           2       0.86      1.00      0.92         6

    accuracy                           0.96        45
   macro avg       0.93      0.97      0.95        45
weighted avg       0.96      0.96      0.96        45



In [58]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier() 
tuned_parameters = [{'n_neighbors': [1,2,4,6,8,10,12,14,16,20,22,25,27],'p': [1, 2, 3, 4]}]

clf = GridSearchCV(knn, tuned_parameters, cv=5)

clf.fit(X_train, trainLabels)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
print()



Best parameters set found on development set:

{'n_neighbors': 1, 'p': 1}

Grid scores on development set:

0.970 (+/-0.057) for {'n_neighbors': 1, 'p': 1}
0.932 (+/-0.073) for {'n_neighbors': 1, 'p': 2}
0.925 (+/-0.083) for {'n_neighbors': 1, 'p': 3}
0.940 (+/-0.060) for {'n_neighbors': 1, 'p': 4}
0.955 (+/-0.057) for {'n_neighbors': 2, 'p': 1}
0.932 (+/-0.058) for {'n_neighbors': 2, 'p': 2}
0.925 (+/-0.050) for {'n_neighbors': 2, 'p': 3}
0.932 (+/-0.031) for {'n_neighbors': 2, 'p': 4}
0.947 (+/-0.061) for {'n_neighbors': 4, 'p': 1}
0.940 (+/-0.076) for {'n_neighbors': 4, 'p': 2}
0.933 (+/-0.054) for {'n_neighbors': 4, 'p': 3}
0.940 (+/-0.037) for {'n_neighbors': 4, 'p': 4}
0.955 (+/-0.057) for {'n_neighbors': 6, 'p': 1}
0.955 (+/-0.032) for {'n_neighbors': 6, 'p': 2}
0.955 (+/-0.057) for {'n_neighbors': 6, 'p': 3}
0.955 (+/-0.057) for {'n_neighbors': 6, 'p': 4}
0.962 (+/-0.001) for {'n_neighbors': 8, 'p': 1}
0.955 (+/-0.032) for {'n_neighbors': 8, 'p': 2}
0.940 (+/-0.062) for {'n_nei

In [59]:
from sklearn import metrics
#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model accuracy on test
m_acc = metrics.accuracy_score(testLabels, y_pred)

print("Accuracy:",m_acc)

Accuracy: 0.9777777777777777


In [60]:
# re-train our classifier using the best k value and predict the labels of the
# test data
print(confusion_matrix(testLabels, y_pred))
 
# show a final classification report demonstrating the accuracy of the classifier
# for each of the digits
print("EVALUATION ON TESTING DATA")
print(classification_report(testLabels, y_pred))

[[16  0  0]
 [ 0 22  1]
 [ 0  0  6]]
EVALUATION ON TESTING DATA
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      0.96      0.98        23
           2       0.86      1.00      0.92         6

    accuracy                           0.98        45
   macro avg       0.95      0.99      0.97        45
weighted avg       0.98      0.98      0.98        45

