In [15]:
import numpy as np 
import pandas as pd 


In [16]:
import tensorflow as tf
from tensorflow.keras import models,layers
import matplotlib.pyplot as plt
import zipfile

In [17]:
# Load the training table from the working directory and preview the first rows.
TRAIN_CSV = 'train.csv'
df_train = pd.read_csv(TRAIN_CSV)
df_train.head(5)

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [18]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit

In [19]:
def encode(df_train):
    """Split the raw training frame into features, integer labels and class names.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame holding a ``species`` column and an ``id`` column next to the
        feature columns.

    Returns
    -------
    tuple
        ``(features, labels, classes)``: the frame without ``species`` and
        ``id``, the encoded species value of every row, and the encoder's
        ``classes_`` converted to a plain list.
    """
    species_encoder = LabelEncoder()
    species_encoder.fit(df_train.species)

    # Encode before stripping the non-feature columns; drop() returns a new
    # frame, so the object passed in by the caller is not modified.
    encoded_species = species_encoder.transform(df_train.species)
    species_names = list(species_encoder.classes_)
    feature_frame = df_train.drop(columns=['species', 'id'])

    return feature_frame, encoded_species, species_names

# Rebinds `df_train` to the feature-only frame returned by encode(): the
# 'species' and 'id' columns are gone afterwards, so re-running this cell
# without first reloading train.csv fails when encode() looks up `species`.
df_train, labels, classes = encode(df_train)

In [20]:
# Feature matrix as a NumPy array, targets as the encoded label vector.
X, y = df_train.values, labels

# PCA for dimensionality reduction

In [21]:
from sklearn.decomposition import PCA

# Fit on the untouched feature table rather than on `X` itself, so re-running
# this cell does not apply PCA to data that has already been projected.
# n_components='mle' (used together with svd_solver='full') lets PCA pick the
# number of components itself.
# NOTE(review): the projection is fitted on every row, including rows that are
# later held out for validation/testing -- consider fitting on the training
# split only and calling pca.transform() on the other splits.
pca = PCA(n_components='mle', svd_solver='full')
X = pca.fit_transform(df_train.values)
n_components = pca.n_components_  # number of components PCA selected

# Splitting the dataset into training, validation and testing sets

In [22]:
# First split: hold out 20% of the rows, stratified by label, as the test set.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=5)
# n_splits=1 yields exactly one (train, test) index pair, so take it directly
# instead of looping over a single iteration.
train_index, test_index = next(split.split(X, y))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

# Second split: hold out 20% of the remaining rows as the validation set.
# Both index arrays are positions within X_train / y_train, not within X.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=5)
train_val_index, val_index = next(split.split(X_train, y_train))
# Naming note: the *_train_val arrays are the rows actually used for fitting
# (training minus validation); *_val is the validation hold-out.
X_train_val, X_val = X_train[train_val_index], X_train[val_index]
y_train_val, y_val = y_train[train_val_index], y_train[val_index]

# KNN classifier training

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, log_loss

# Baseline: 3-nearest-neighbours fitted on the training portion only.
KNN_clf = KNeighborsClassifier(3)
KNN_clf.fit(X_train_val, y_train_val)

# The "*_test_KNN" names are historical: everything in this cell is scored on
# the validation split (X_val, y_val).
predictions_test_KNN = KNN_clf.predict(X_val)
acc_test_KNN = accuracy_score(y_val, predictions_test_KNN)
print("Accuracy: {:.4%}".format(acc_test_KNN))

# predict_proba columns follow KNN_clf.classes_; pass that ordering to
# log_loss explicitly instead of letting it infer the label set from y_val,
# which breaks if some class is absent from the validation split.
predictions_prob_test_KNN = KNN_clf.predict_proba(X_val)
ll_test_KNN = log_loss(y_val, predictions_prob_test_KNN, labels=KNN_clf.classes_)
print("Log Loss: {}".format(ll_test_KNN))

Accuracy: 82.3899%
Log Loss: 2.1970284259393225


In [24]:
# Score the k=3 model on the held-out test split.
predictions_test_2 = KNN_clf.predict(X_test)
acc_test_2 = accuracy_score(y_true=y_test, y_pred=predictions_test_2)
test_accuracy_line = "Test data Accuracy: {:.4%}".format(acc_test_2)
print(test_accuracy_line)

Test data Accuracy: 84.3434%


# 5-fold cross-validation

In [25]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the k=3 classifier on the non-test rows.
# Scored with 'accuracy' so the metric matches the label printed below; for
# single-label multiclass targets micro-averaged F1 is the same number.
scores = cross_val_score(KNN_clf, X_train, y_train, cv=5, scoring='accuracy')
print(KNN_clf,scores)
# Reported spread is two standard deviations of the fold scores.
print("Cross Validation Avg Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

KNeighborsClassifier(n_neighbors=3) [0.86163522 0.79874214 0.87341772 0.79113924 0.89240506]
Cross Validation Avg Accuracy: 0.84 (+/- 0.08)


# Grid Search

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes: k = 1 .. 49.
k_range = [k for k in range(1, 50)]
param_grid = {'n_neighbors': k_range}

# One predefined fold instead of k-fold: fit on the train_val_index rows of
# X_train and score on the val_index rows.
cv = [(train_val_index, val_index)]
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.best_score_)

{'n_neighbors': 1}
0.8616352201257862


In [27]:
# GridSearchCV.predict delegates to the refitted best estimator.
y_pred = grid.predict(X_test)

In [16]:

# Test-set accuracy of the grid-search winner (predictions held in y_pred).
acc_test_G = accuracy_score(y_true=y_test, y_pred=y_pred)
print("G Test data Accuracy: {:.4%}".format(acc_test_G))

G Test data Accuracy: 90.9091%


In [18]:
# Class probabilities from the grid search's best estimator on the test split.
predictions_prob_test_KNN_G = grid.best_estimator_.predict_proba(X_test)
# Probability columns follow the estimator's classes_; hand that ordering to
# log_loss rather than letting it infer the label set from y_test, which
# breaks if some class is absent from the test split.
ll_test_KNN_G = log_loss(
    y_test,
    predictions_prob_test_KNN_G,
    labels=grid.best_estimator_.classes_,
)
print("Grid Log Loss: {}".format(ll_test_KNN_G))

Grid Log Loss: 3.2766957626470368


In [17]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on X_train using the k chosen by the grid search.
best_k = grid.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)
scores = cross_val_score(knn, X_train, y_train, cv=5)

print('Cross-validation scores:', scores)
print('Average cross-validation score:', scores.mean())

Cross-validation scores: [0.91823899 0.87421384 0.89873418 0.84810127 0.91772152]
Average cross-validation score: 0.8914019584427992


In [None]:
from sklearn.metrics import classification_report

# Per-species precision/recall/F1 on the test split for the predictions in
# y_pred (the "_RF" suffix is historical; y_pred comes from the KNN grid search).
# `labels` pins report row i to classes[i] even if some species never shows up
# in y_test or y_pred, and zero_division=0 reports 0.0 for species with no
# predicted samples without emitting a warning for each of them.
report_RF = classification_report(
    y_test,
    y_pred,
    labels=list(range(len(classes))),
    target_names=classes,
    zero_division=0,
)
print(report_RF)

                              precision    recall  f1-score   support

             Acer_Capillipes       1.00      1.00      1.00         2
             Acer_Circinatum       1.00      0.50      0.67         2
                   Acer_Mono       1.00      1.00      1.00         2
                 Acer_Opalus       1.00      1.00      1.00         2
               Acer_Palmatum       1.00      1.00      1.00         2
                 Acer_Pictum       1.00      1.00      1.00         2
             Acer_Platanoids       1.00      1.00      1.00         2
                 Acer_Rubrum       0.67      1.00      0.80         2
              Acer_Rufinerve       0.67      1.00      0.80         2
            Acer_Saccharinum       1.00      1.00      1.00         2
               Alnus_Cordata       1.00      1.00      1.00         2
          Alnus_Maximowiczii       1.00      1.00      1.00         2
                 Alnus_Rubra       0.67      1.00      0.80         2
           Alnus_Si

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
