In [3]:
import numpy as np 
import pandas as pd 


In [4]:
import tensorflow as tf
from tensorflow.keras import models,layers
import matplotlib.pyplot as plt
import zipfile

In [5]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit

# Label Encoding 

In [7]:
def encode(df_train):
    le = LabelEncoder().fit(df_train.species)
    labels = le.transform(df_train.species) 
    
    classes = list(le.classes_) 
    
    
    df_train = df_train.drop(['species','id'],axis = 1) 

    return df_train, labels, classes

df_train, labels, classes = encode(df_train)

In [8]:
X = df_train.values
y = labels

# PCA for dimensionality reduction

In [9]:
from sklearn.decomposition import PCA
pca = PCA(n_components='mle', svd_solver='full')
X=pca.fit_transform(X)
n_components = pca.n_components_

In [10]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=5)
split.get_n_splits(X, y)

1

#Spliting the dataset into Training, Validation and Testing

In [11]:
for train_index, test_index in split.split(X, y):
    
   
    X_train, X_test = X[train_index], X[test_index]
    
    y_train, y_test = y[train_index], y[test_index]

In [12]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=5)
for train_val_index, val_index in split.split(X_train, y_train):
    

    X_train_val, X_val = X_train[train_val_index], X_train[val_index]
    y_train_val, y_val = y_train[train_val_index], y_train[val_index] 


#Random Forest classifier training and testing

In [13]:
from sklearn.metrics import accuracy_score, log_loss
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier()
clf.fit(X_train_val, y_train_val) 


predictions_test = clf.predict(X_val)  
acc_test = accuracy_score(y_val, predictions_test)
print("Validation Accuracy : {:.4%}".format(acc_test))

predictions_prob_test = clf.predict_proba(X_val) 
ll_test_rf = log_loss(y_val, predictions_prob_test)
print("Validation Log Loss: {}".format(ll_test_rf))

Validation Accuracy : 83.0189%
Validation Log Loss: 1.7496383282929306


# 5 Fold cross validation 

In [14]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_micro')
print(clf,scores)
print("Cross Validation Avg Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

RandomForestClassifier() [0.90566038 0.89937107 0.89240506 0.85443038 0.91139241]
Cross Validation Avg Accuracy: 0.89 (+/- 0.04)


In [None]:
predictions_test_2 = clf.predict(X_test)  
acc_test_2 = accuracy_score(y_test, predictions_test_2)
print("Test data Accuracy: {:.4%}".format(acc_test_2))

Test data Accuracy: 87.8788%


# Bayesian Optimization for hyperparameter tuning

In [None]:
from skopt import gp_minimize
from skopt import space
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Define the objective function
def objective(params):
    n_estimators = params[0]
    max_depth = params[1]
    min_samples_split = params[2]
    
    # Initialize the random forest classifier with the given hyperparameters
    rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)
    
    # Compute the F1 score of the random forest classifier on the validation set
    f1 = cross_val_score(rf, X_train, y_train, cv=5, scoring='f1_macro').mean()
    
    return -f1

# Define the search space
search_space = [space.Integer(10, 170, name='n_estimators'),
                space.Integer(2, 34, name='max_depth'),
                space.Integer(2, 12, name='min_samples_split')]

# Run the optimization
result = gp_minimize(objective, search_space, n_calls=50)

# Print the best hyperparameters and the corresponding F1 score
print("Best hyperparameters: ", result.x)
rf_best = RandomForestClassifier(n_estimators=result.x[0], max_depth=result.x[1], min_samples_split=result.x[2])
rf_best.fit(X_train_val, y_train_val)
y_pred = rf_best.predict(X_val)
f1_best = f1_score(y_val, y_pred, average='macro')
print("Best F1 score: ", f1_best)




Best hyperparameters:  [170, 32, 2]
Best F1 score:  0.8269360269360269


In [None]:
acc_val_B = accuracy_score(y_val, y_pred)
print("BO Validation data Accuracy: {:.4%}".format(acc_val_B))

BO Validation data Accuracy: 84.9057%


In [None]:
y_pred_Test_B = rf_best.predict(X_test)
acc_test_B = accuracy_score(y_test, y_pred_Test_B)
print("BO Test data Accuracy: {:.4%}".format(acc_test_B))

BO Test data Accuracy: 91.4141%


In [None]:
from sklearn.model_selection import cross_val_score
scores_BO = cross_val_score(rf_best, X_train, y_train, cv=5, scoring='f1_micro')
print(rf_best,scores_BO)
print("Cross Validation Avg Accuracy: %0.2f (+/- %0.2f)" % (scores_BO.mean(), scores_BO.std() * 2))

RandomForestClassifier(max_depth=32, n_estimators=170) [0.91194969 0.90566038 0.91139241 0.90506329 0.90506329]
Cross Validation Avg Accuracy: 0.91 (+/- 0.01)


#Classification report

In [None]:
from sklearn.metrics import classification_report
report_RF = classification_report(y_test, y_pred_Test_B, target_names=classes)
print(report_RF)

                              precision    recall  f1-score   support

             Acer_Capillipes       1.00      1.00      1.00         2
             Acer_Circinatum       1.00      0.50      0.67         2
                   Acer_Mono       1.00      1.00      1.00         2
                 Acer_Opalus       1.00      1.00      1.00         2
               Acer_Palmatum       1.00      1.00      1.00         2
                 Acer_Pictum       1.00      1.00      1.00         2
             Acer_Platanoids       1.00      1.00      1.00         2
                 Acer_Rubrum       0.40      1.00      0.57         2
              Acer_Rufinerve       1.00      1.00      1.00         2
            Acer_Saccharinum       1.00      0.50      0.67         2
               Alnus_Cordata       0.67      1.00      0.80         2
          Alnus_Maximowiczii       1.00      1.00      1.00         2
                 Alnus_Rubra       0.67      1.00      0.80         2
           Alnus_Si

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
