In [None]:
import numpy as np
import sklearn
from sklearn.datasets import load_wine
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.gaussian_process.kernels import RBF
import matplotlib.pyplot as pyplot
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

## Importing dataset

In [None]:
# Load the wine dataset and unpack its feature matrix and class labels.
wine = load_wine()
data, target = wine.data, wine.target

Scaling data

In [None]:
# Standardise every feature; fitting and transforming happen in one call.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/validation/test split further down, so test-set statistics take part
# in the scaling - confirm this is intended.
scaler = StandardScaler()
data = scaler.fit_transform(data)

Data and target examples

In [None]:
# Show one sample (after scaling) together with its class label.
sample_index = 130
print("Data ",data[sample_index],"\ntarget ",target[sample_index])

Counting the number of samples in each class (the classes are drawn with different colors in the plot below)

In [None]:
# Print how many samples carry each of the three class labels.
for class_label in (0, 1, 2):
    print(len(target[target == class_label]))

In [None]:
# Bold palette with one colour per class label; the boundary-plot helpers reuse it.
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Scatter the first two features, coloured by class.
first_feature, second_feature = data[:, 0], data[:, 1]
pyplot.scatter(first_feature, second_feature, c=target, cmap=cmap_bold)
pyplot.show()

### Splitting data in Train (50%), Validation (20%) and Test (30%) 

In [None]:
# Step 1: hold out 30% of the samples as the test set.
Data_rest, Data_test, Target_rest, Target_test = train_test_split(
    data, target, test_size=0.30, random_state=45)

# Step 2: take 2/7 of the remaining 70% (i.e. 20% of the total) as validation,
# which leaves 50% of the total for training.
Data_train, Data_validation, Target_train, Target_validation = train_test_split(
    Data_rest, Target_rest, test_size=2/7, random_state=41)

In [None]:
#plotting method
# Spacing between neighbouring nodes of the grid used to draw decision regions.
step_size = 0.02
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
def plot_boundaries(model, data_train, target_train, n_neighbors):
    """Plot the decision regions of a fitted classifier over the first two features.

    Parameters:
        model: fitted estimator whose ``predict`` accepts two-column input.
        data_train: 2-D array; only columns 0 and 1 are used.
        target_train: class labels used to colour the scattered samples.
        n_neighbors: value of k shown in the figure title.
    """
    first, second = data_train[:, 0], data_train[:, 1]

    # Build a grid that extends one unit beyond the data on every side.
    x_axis = np.arange(first.min() - 1, first.max() + 1, step_size)
    y_axis = np.arange(second.min() - 1, second.max() + 1, step_size)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)

    # Classify every grid node and fold the flat result back into the grid shape.
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    regions = model.predict(grid_points).reshape(grid_x.shape)

    pyplot.figure()
    pyplot.pcolormesh(grid_x, grid_y, regions, cmap=cmap_light)

    # Overlay the samples on top of the coloured regions.
    pyplot.scatter(first, second, c=target_train, cmap=cmap_bold)
    pyplot.xlim(grid_x.min(), grid_x.max())
    pyplot.ylim(grid_y.min(), grid_y.max())
    pyplot.title("3-Class classification (k = %i)" % (n_neighbors))

    pyplot.show()

## kNN

### Hyperparameters

Setting values for k

In [None]:
# Candidate neighbourhood sizes: the odd values 1, 3, 5 and 7.
K = list(range(1, 8, 2))

Plotting decision boundaries and predicting on validation split with different values for k

In [None]:
# Validation accuracy for each candidate k, in the same order as K.
accuracies = []

for k in K:
    # Fit kNN on the first two features of the training split.
    model = KNeighborsClassifier(k, weights='uniform')
    model.fit(Data_train[:, :2], Target_train)

    #Plotting boundaries
    plot_boundaries(model, Data_train, Target_train, k)

    #Predicting on validation split
    predictions_valid = model.predict(Data_validation[:, :2])
    # Accuracy is the fraction of predictions equal to the true label.
    # Compare first, then average, so every class contributes equally
    # (summing the matching label values would weight class 0 as 0 and class 2 as 2).
    score_valid = np.mean(predictions_valid == Target_validation)
    print("Result for validation split with k= ",k," ",score_valid*100,"%")
    accuracies.append(score_valid)

Plotting accuracies for each value of k

In [None]:
# Validation accuracy as a function of k.
pyplot.plot(K, accuracies, 'bo')
pyplot.title('Accuracy for each k')
pyplot.xlabel('k')
pyplot.ylabel('Accuracy')
# Put a tick exactly on every tested k.
pyplot.xticks(K)
# Render explicitly so the cell does not echo the repr of the last call.
pyplot.show()

Extracting best value for k, based on validation set

In [None]:
# Pick the k with the highest validation accuracy (the first one on ties).
best_k = K[int(np.argmax(accuracies))]
print("Best value is ",best_k)

### Evaluating on the test set

In [None]:
#Setting k to its best value
train_xy = Data_train[:, :2]
test_xy = Data_test[:, :2]

# Refit on the training split with the selected k, then predict the test set.
model = KNeighborsClassifier(n_neighbors=best_k, weights='uniform')
model.fit(train_xy, Target_train)

test_predictions = model.predict(test_xy)

In [None]:
# Test accuracy: fraction of test samples whose predicted class equals the true class.
# (The comparison is averaged directly; summing the matching label values would
# ignore correct class-0 predictions and double-count class-2 ones.)
score_test = np.mean(test_predictions == Target_test)
print("Result for test set with k= ",best_k," ",score_test*100,"%")

## SVM

In [None]:
def plot_boundaries_svm(model, data_train, target_train, c, l=-1000):
    """Plot the decision regions of a fitted SVM over the first two features.

    Parameters:
        model: fitted estimator whose ``predict`` accepts two-column input.
        data_train: 2-D array; only columns 0 and 1 are used.
        target_train: class labels used to colour the scattered samples.
        c: regularisation value C shown in the title.
        l: gamma value shown in the title; the default ``-1000`` is a sentinel
            meaning "no gamma", in which case only ``c`` is shown.
    """
    data_0, data_1 = np.meshgrid(np.arange(data_train[:, 0].min() - 1, data_train[:, 0].max() + 1 , step_size),
                     np.arange(data_train[:, 1].min()-1, data_train[:, 1].max()+1, step_size))
    predictions = model.predict(np.c_[data_0.ravel(), data_1.ravel()])

    # Put the result into a color plot
    predictions = predictions.reshape(data_0.shape)
    pyplot.figure()
    pyplot.pcolormesh(data_0, data_1, predictions, cmap=cmap_light)

    # Plot also the training points
    pyplot.scatter(data_train[:, 0], data_train[:, 1], c=target_train, cmap=cmap_bold)
    pyplot.xlim(data_0.min(), data_0.max())
    pyplot.ylim(data_1.min(), data_1.max())
    # %g keeps fractional values readable; %i would truncate C = 0.001, 0.01
    # and 0.1 all to "0", making those plots indistinguishable.
    if l == -1000:
        pyplot.title("3-Class classification (c = %g)" % (c))
    else:
        pyplot.title("3-Class classification (c = %g, l = %g)" % (c, l))

    pyplot.show()

### Hyperparameters

Setting possible values for C

In [None]:
# Candidate values for the SVM regularisation parameter C, one per decade from 1e-3 to 1e3.
C = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

In [None]:
# Validation accuracy for each candidate C, in the same order as C.
accuracies = []
for c in C:
    # Fit a linear SVM on the first two features of the training split.
    modelSVM = svm.LinearSVC(C=c)
    modelSVM.fit(Data_train[:,:2], Target_train)

    #Plotting boundaries
    plot_boundaries_svm(modelSVM, Data_train, Target_train, c)

    #Predicting on validation split
    predictions_valid = modelSVM.predict(Data_validation[:, :2])
    # Accuracy is the fraction of predictions equal to the true label; averaging
    # the comparison counts every class equally (summing the matching label
    # values would weight class 0 as 0 and class 2 as 2).
    score_valid = np.mean(predictions_valid == Target_validation)
    print("Result for validation split with c= ",c," ",score_valid*100,"%")
    accuracies.append(score_valid)

Plotting accuracies

In [None]:
# Validation accuracy as a function of C.
pyplot.plot(C, accuracies, 'bo')
# C spans six orders of magnitude; on a linear axis all small values collapse
# onto the left edge, so use a logarithmic x axis.
pyplot.xscale('log')
pyplot.title('Accuracy for each c')
pyplot.xlabel('c')
pyplot.ylabel('Accuracy')
# Render explicitly so the cell does not echo the repr of the last call.
pyplot.show()

Extracting best value for c, based on validation set

In [None]:
# Pick the C with the highest validation accuracy (the first one on ties).
best_c = C[int(np.argmax(accuracies))]
print("Best value is ",best_c)

### Evaluating on the test set

In [None]:
#Setting c to its best value
train_xy = Data_train[:, :2]

# Refit the linear SVM on the training split with the selected C.
modelSVM = svm.LinearSVC(C=best_c)
modelSVM.fit(train_xy, Target_train)

In [None]:
# Predict the test set from its first two features.
test_predictions = modelSVM.predict(Data_test[:, :2])
# Test accuracy: fraction of predictions equal to the true label. Averaging the
# comparison counts every class equally, unlike summing the matching label values.
score_test = np.mean(test_predictions == Target_test)
print("Result for test set with c = ",best_c," ",score_test*100,"%")

## RBF Kernel

### Hyperparameters

In [None]:
# Validation accuracy for each candidate C (RBF kernel, default gamma), in the same order as C.
accuracies = []
for c in C:
    # Fit an RBF-kernel SVM on the first two features of the training split.
    modelSVM = svm.SVC(C=c, kernel='rbf')
    modelSVM.fit(Data_train[:,:2], Target_train)

    #Plotting boundaries
    plot_boundaries_svm(modelSVM, Data_train, Target_train, c)

    #Predicting on validation split
    predictions_valid = modelSVM.predict(Data_validation[:, :2])
    # Accuracy is the fraction of predictions equal to the true label; averaging
    # the comparison counts every class equally (summing the matching label
    # values would weight class 0 as 0 and class 2 as 2).
    score_valid = np.mean(predictions_valid == Target_validation)
    print("Result for validation split with c= ",c," ",score_valid*100,"%")
    accuracies.append(score_valid)

In [None]:
# Pick the C with the highest validation accuracy (the first one on ties).
best_c = C[int(np.argmax(accuracies))]
print("Best value is ",best_c)

### Evaluating on the test set

In [None]:
# Refit the RBF-kernel SVM on the training split with the selected C.
train_xy = Data_train[:, :2]
modelSVM = svm.SVC(kernel='rbf', C=best_c)
modelSVM.fit(train_xy, Target_train)

In [None]:
# Predict the test set from its first two features.
test_predictions = modelSVM.predict(Data_test[:, :2])
# Test accuracy: fraction of predictions equal to the true label. Averaging the
# comparison counts every class equally, unlike summing the matching label values.
score_test = np.mean(test_predictions == Target_test)
print("Result for test set with c = ",best_c," ",score_test*100,"%")

### Grid search

Setting values for gamma (l)

In [None]:
# Candidate values for the RBF kernel's gamma (called "l" in this notebook), in decreasing order.
L = [0.7, 0.4, 0.1, 0.07, 0.04, 0.03, 0.01, 0.001, 0.0001]

In [None]:
# accuracies[a][b] is the validation accuracy obtained with C[a] and gamma L[b].
accuracies = []
for c in C:
    accuracies.append([])
    for l in L:
        modelSVM = svm.SVC(C=c, kernel='rbf', gamma=l)
        modelSVM.fit(Data_train[:,:2], Target_train)

        #Predicting on validation split
        predictions_valid = modelSVM.predict(Data_validation[:, :2])
        # Accuracy is the fraction of predictions equal to the true label;
        # averaging the comparison counts every class equally (summing the
        # matching label values would weight class 0 as 0 and class 2 as 2).
        score_valid = np.mean(predictions_valid == Target_validation)
        print("Result for validation split with c= ",c," and l= ",l," ",score_valid*100,"%")
        accuracies[-1].append(score_valid)

Searching for best values of c and l

In [None]:
# Locate the highest accuracy in the (C x L) grid. Ties resolve to the first
# cell in row-major order: lowest C index first, then lowest gamma index.
accuracy_grid = np.asarray(accuracies)
c_index, l_index = np.unravel_index(np.argmax(accuracy_grid), accuracy_grid.shape)
best_c = C[c_index]
best_l = L[l_index]
print("Best value for c = ",best_c)
print("Best value for l = ",best_l)

### Evaluating on the test set

In [None]:
# Refit on the training split with the selected (C, gamma) pair.
modelSVM = svm.SVC(C=best_c, kernel='rbf', gamma=best_l)
modelSVM.fit(Data_train[:,:2], Target_train)

test_predictions = modelSVM.predict(Data_test[:, :2])
# Test accuracy: fraction of predictions equal to the true label. Averaging the
# comparison counts every class equally, unlike summing the matching label values.
score_test = np.mean(test_predictions == Target_test)
print("Result for test set with c = ",best_c," and l = ",best_l," ",score_test*100,"%")

In [None]:
#Plotting boundaries
plot_boundaries_svm(
    modelSVM,
    Data_train,
    Target_train,
    c=best_c,
    l=best_l,
)

## K_Fold

### Merging train and validation split

In [None]:
# Stack the training and validation samples (rows) and their labels into one
# combined set, training samples first.
Data_train_valid = np.vstack((Data_train, Data_validation))
Target_train_valid = np.hstack((Target_train, Target_validation))

### Hyperparameters

Computing number of samples to take in each k-fold validation iteration

In [None]:
# Each validation fold holds 20% of the merged set, rounded down.
merged_size = len(Target_train_valid)
n_samples = int(20/100 * merged_size)
print(n_samples)

In [None]:
# accuracies[a][b] is the mean validation accuracy over the folds for C[a] and gamma L[b].
accuracies = []
# Number of folds; each validation fold has n_samples rows (20% of the merged set).
n_folds = 5
for c in C:
    accuracies.append([])
    for l in L:
        score_valids = []
        for i in range(n_folds):

            #Extracting validation samples
            # Fold-local names are used so the hold-out Data_validation /
            # Target_validation split defined earlier is not overwritten.
            starting_index = i*n_samples
            ending_index = starting_index + n_samples
            Data_fold_valid = Data_train_valid[starting_index:ending_index]
            Target_fold_valid = Target_train_valid[starting_index:ending_index]

            #Extracting remaining samples
            Data_train_tmp = np.concatenate((Data_train_valid[:starting_index], Data_train_valid[ending_index:]))
            Target_train_tmp = np.concatenate((Target_train_valid[:starting_index], Target_train_valid[ending_index:]))

            #Fitting train set
            modelSVM = svm.SVC(C=c, kernel='rbf', gamma=l)
            modelSVM.fit(Data_train_tmp[:,:2], Target_train_tmp)

            #Predicting on validation split
            predictions_valid = modelSVM.predict(Data_fold_valid[:, :2])
            # Accuracy is the fraction of predictions equal to the true label
            # (summing the matching label values would weight class 0 as 0 and class 2 as 2).
            score_valids.append(np.mean(predictions_valid == Target_fold_valid))

        #Computing mean of result for current values of hyperparameters
        score_valid = np.mean(score_valids)
        print("Result for validation split with c= ",c," and l= ",l," ",score_valid*100,"%")
        accuracies[-1].append(score_valid)

Searching for best values of c and l

In [None]:
# Locate the highest mean accuracy in the (C x L) grid. Ties resolve to the
# first cell in row-major order: lowest C index first, then lowest gamma index.
accuracy_grid = np.asarray(accuracies)
c_index, l_index = np.unravel_index(np.argmax(accuracy_grid), accuracy_grid.shape)
best_c = C[c_index]
best_l = L[l_index]
print("Best value for c = ",best_c)
print("Best value for l = ",best_l)

### Evaluating on the test set

In [None]:
# The hyperparameters were selected by cross-validation on the merged
# train+validation set, so the final model is refitted on that same merged set.
modelSVM = svm.SVC(C=best_c, kernel='rbf', gamma=best_l)
modelSVM.fit(Data_train_valid[:,:2], Target_train_valid)

test_predictions = modelSVM.predict(Data_test[:, :2])
# Test accuracy: fraction of predictions equal to the true label. Averaging the
# comparison counts every class equally, unlike summing the matching label values.
score_test = np.mean(test_predictions == Target_test)
print("Result for test set with c = ",best_c," and l = ",best_l," ",score_test*100,"%")

## Searching for best pair of features

Here we search for the most representative pair of features. Only pairs of adjacent columns (feature i and feature i+1) are evaluated, not every possible pair.

In [None]:
#plotting method
# Spacing between neighbouring nodes of the grid used to draw decision regions.
step_size = 0.02
# Create color maps
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
def plot_boundaries_kNN_bf(model, data_train, target_train, n_neighbors,i):
    """Plot the decision regions of a fitted classifier over features i and i+1.

    Parameters:
        model: fitted estimator whose ``predict`` accepts two-column input.
        data_train: 2-D array; only columns ``i`` and ``i+1`` are used.
        target_train: class labels used to colour the scattered samples.
        n_neighbors: value of k shown in the figure title.
        i: index of the first column of the feature pair.
    """
    first, second = data_train[:, i], data_train[:, i+1]

    # Build a grid that extends one unit beyond the data on every side.
    x_axis = np.arange(first.min() - 1, first.max() + 1, step_size)
    y_axis = np.arange(second.min() - 1, second.max() + 1, step_size)
    grid_x, grid_y = np.meshgrid(x_axis, y_axis)

    # Classify every grid node and fold the flat result back into the grid shape.
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    regions = model.predict(grid_points).reshape(grid_x.shape)

    pyplot.figure()
    pyplot.pcolormesh(grid_x, grid_y, regions, cmap=cmap_light)

    # Overlay the samples on top of the coloured regions.
    pyplot.scatter(first, second, c=target_train, cmap=cmap_bold)
    pyplot.xlim(grid_x.min(), grid_x.max())
    pyplot.ylim(grid_y.min(), grid_y.max())
    pyplot.title("3-Class classification (k = %i, i = %i)" % (n_neighbors, i))

    pyplot.show()

### kNN

In [None]:
# Candidate neighbourhood sizes: the odd values 1, 3, 5 and 7.
K = list(range(1, 8, 2))

In [None]:
# accuracies[i][b] is the mean cross-validated accuracy of kNN with K[b]
# neighbours, trained on the adjacent feature pair (i, i+1).
accuracies = []
# Number of folds; each validation fold has n_samples rows (20% of the merged set).
n_folds = 5

for i in range(len(Data_train_valid[0])-1):
    accuracies.append([])
    for k in K:
        score_valids = []
        for j in range(n_folds):

            #Extracting validation samples
            # Fold-local names are used so the hold-out Data_validation /
            # Target_validation split defined earlier is not overwritten.
            starting_index = j*n_samples
            ending_index = starting_index + n_samples
            Data_fold_valid = Data_train_valid[starting_index:ending_index]
            Target_fold_valid = Target_train_valid[starting_index:ending_index]

            #Extracting remaining samples
            Data_train_tmp = np.concatenate((Data_train_valid[:starting_index], Data_train_valid[ending_index:]))
            Target_train_tmp = np.concatenate((Target_train_valid[:starting_index], Target_train_valid[ending_index:]))

            model = KNeighborsClassifier(k, weights='uniform')
            model.fit(Data_train_tmp[:, i:i+2], Target_train_tmp)

            #Predicting on validation split
            predictions_valid = model.predict(Data_fold_valid[:, i:i+2])
            # Accuracy is the fraction of predictions equal to the true label
            # (summing the matching label values would weight class 0 as 0 and class 2 as 2).
            score_valids.append(np.mean(predictions_valid == Target_fold_valid))

        #Computing mean of result for current values of hyperparameters
        score_valid = np.mean(score_valids)
        accuracies[i].append(score_valid)

In [None]:
# Locate the highest mean accuracy in the (feature pair x K) table. Ties resolve
# to the first cell in row-major order: lowest pair index first, then lowest k index.
accuracy_table = np.asarray(accuracies)
pair_index, k_index = np.unravel_index(np.argmax(accuracy_table), accuracy_table.shape)
best_i = int(pair_index)
best_k = K[k_index]
print("Best pair: ",best_i," : ",best_i+1)
print("Best value for k = ",best_k)

Considering weights based on distance between test samples and k nearest train samples

In [None]:
#Setting k to its best value
# Columns of the selected adjacent feature pair.
best_pair_columns = slice(best_i, best_i + 2)

# Distance-weighted kNN fitted on the merged train+validation set.
model = KNeighborsClassifier(n_neighbors=best_k, weights='distance')
model.fit(Data_train_valid[:, best_pair_columns], Target_train_valid)

plot_boundaries_kNN_bf(model, Data_train_valid, Target_train_valid, best_k, best_i)

test_predictions = model.predict(Data_test[:, best_pair_columns])

In [None]:
# Test accuracy: fraction of test samples whose predicted class equals the true class.
# (The comparison is averaged directly; summing the matching label values would
# ignore correct class-0 predictions and double-count class-2 ones.)
score_test = np.mean(test_predictions == Target_test)
print("Result for test set with k= ",best_k," ",score_test*100,"%")

### SVM, RBF kernel

In [None]:
def plot_boundaries_svm_bf(model, data_train, target_train, c, i, l=-1000):
    """Plot the decision regions of a fitted SVM over features i and i+1.

    Parameters:
        model: fitted estimator whose ``predict`` accepts two-column input.
        data_train: 2-D array; only columns ``i`` and ``i+1`` are used.
        target_train: class labels used to colour the scattered samples.
        c: regularisation value C shown in the title.
        i: index of the first column of the feature pair.
        l: gamma value shown in the title; the default ``-1000`` is a sentinel
            meaning "no gamma", in which case it is omitted from the title.
    """
    data_0, data_1 = np.meshgrid(np.arange(data_train[:, i].min() - 1, data_train[:, i].max() + 1 , step_size),
                     np.arange(data_train[:, i+1].min()-1, data_train[:, i+1].max()+1, step_size))
    predictions = model.predict(np.c_[data_0.ravel(), data_1.ravel()])

    # Put the result into a color plot
    predictions = predictions.reshape(data_0.shape)
    pyplot.figure()
    pyplot.pcolormesh(data_0, data_1, predictions, cmap=cmap_light)

    # Plot also the training points
    pyplot.scatter(data_train[:, i], data_train[:, i+1], c=target_train, cmap=cmap_bold)
    pyplot.xlim(data_0.min(), data_0.max())
    pyplot.ylim(data_1.min(), data_1.max())
    # %g keeps fractional values readable; %i would truncate C = 0.001, 0.01
    # and 0.1 all to "0".
    if l == -1000:
        pyplot.title("3-Class classification (c = %g, i = %i)" % (c, i))
    else:
        pyplot.title("3-Class classification (c = %g, l = %g, i = %i)" % (c, l, i))

    pyplot.show()

In [None]:
# accuracies[j][a][b] is the mean cross-validated accuracy of an RBF SVM with
# C[a] and gamma L[b], trained on the adjacent feature pair (j, j+1).
accuracies = []
# Number of folds; each validation fold has n_samples rows (20% of the merged set).
n_folds = 5

for j in range(len(Data_train_valid[0])-1):
    accuracies.append([])
    for c in C:
        accuracies[j].append([])
        for l in L:
            score_valids = []
            for i in range(n_folds):

                #Extracting validation samples
                # Fold-local names are used so the hold-out Data_validation /
                # Target_validation split defined earlier is not overwritten.
                starting_index = i*n_samples
                ending_index = starting_index + n_samples
                Data_fold_valid = Data_train_valid[starting_index:ending_index]
                Target_fold_valid = Target_train_valid[starting_index:ending_index]

                #Extracting remaining samples
                Data_train_tmp = np.concatenate((Data_train_valid[:starting_index], Data_train_valid[ending_index:]))
                Target_train_tmp = np.concatenate((Target_train_valid[:starting_index], Target_train_valid[ending_index:]))

                #Fitting train set
                modelSVM = svm.SVC(C=c, kernel='rbf', gamma=l)
                modelSVM.fit(Data_train_tmp[:,j:j+2], Target_train_tmp)

                #Predicting on validation split
                predictions_valid = modelSVM.predict(Data_fold_valid[:, j:j+2])
                # Accuracy is the fraction of predictions equal to the true label
                # (summing the matching label values would weight class 0 as 0 and class 2 as 2).
                score_valids.append(np.mean(predictions_valid == Target_fold_valid))

            #Computing mean of result for current values of hyperparameters
            score_valid = np.mean(score_valids)
            accuracies[j][-1].append(score_valid)

In [None]:
# Locate the highest mean accuracy in the (feature pair x C x L) cube. Ties
# resolve to the first cell in row-major order: pair index, then C index, then gamma index.
accuracy_cube = np.asarray(accuracies)
pair_index, c_index, l_index = np.unravel_index(np.argmax(accuracy_cube), accuracy_cube.shape)
best_i = int(pair_index)
best_c = C[c_index]
best_l = L[l_index]
print("Best pair: ",best_i," : ",best_i+1)
print("Best value for c = ",best_c)
print("Best value for l = ",best_l)

In [None]:
# Fit the final RBF SVM with BOTH selected hyperparameters. gamma must be passed
# explicitly; otherwise SVC falls back to its default gamma and the model would
# not match the (c, l) pair reported in the plot title.
# The model is trained on the merged train+validation set, which is the set the
# cross-validated search ran on.
modelSVM = svm.SVC(C=best_c, kernel='rbf', gamma=best_l)
modelSVM.fit(Data_train_valid[:,best_i:best_i+2], Target_train_valid)

plot_boundaries_svm_bf(modelSVM, Data_train_valid, Target_train_valid, c=best_c, i=best_i, l=best_l)

In [None]:
# Predict the test set from the selected feature pair.
test_predictions = modelSVM.predict(Data_test[:, best_i:best_i+2])
# Test accuracy: fraction of predictions equal to the true label. Averaging the
# comparison counts every class equally, unlike summing the matching label values.
score_test = np.mean(test_predictions == Target_test)
print("Result for test set with c = ",best_c," ",score_test*100,"%")