In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

In [2]:
# Import the training dataset

train = pd.read_csv('Data/train.csv',
                   low_memory = False)
y_train = train['Kingdom']
X_train = train.drop(columns = ['Kingdom', 'Unnamed: 0',
                               'SpeciesID', 'Ncodons',
                               'SpeciesName'],
                    axis = 1)

In [3]:
X_train

Unnamed: 0,DNAtype,UUU,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,0,0.03973,0.01880,0.01578,0.01327,0.02783,0.01448,0.00900,0.00578,0.03791,...,0.00070,0.01091,0.00801,0.03688,0.01897,0.01998,0.01909,0.00079,0.00003,0.00005
1,0,0.02127,0.01761,0.01895,0.02301,0.01881,0.00674,0.00777,0.01097,0.03407,...,0.00347,0.00955,0.00256,0.04814,0.01055,0.06094,0.01556,0.00161,0.00034,0.00081
2,0,0.00580,0.03205,0.00065,0.01229,0.00534,0.02608,0.00148,0.06283,0.00564,...,0.02555,0.00205,0.00486,0.01776,0.03614,0.03089,0.03183,0.00034,0.00043,0.00223
3,0,0.01502,0.02743,0.00338,0.00785,0.00789,0.02120,0.00411,0.04687,0.01119,...,0.00780,0.01263,0.01389,0.01958,0.03045,0.02116,0.04998,0.00018,0.00063,0.00158
4,2,0.07059,0.01961,0.03922,0.02745,0.02157,0.00588,0.01569,0.00784,0.05098,...,0.00980,0.02255,0.00588,0.02745,0.00588,0.03529,0.01176,0.00000,0.00000,0.00196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9764,0,0.00677,0.03625,0.00242,0.01353,0.02078,0.05607,0.00532,0.01788,0.00967,...,0.00725,0.00290,0.00290,0.01740,0.02658,0.01643,0.02707,0.00000,0.00000,0.00048
9765,1,0.03968,0.04586,0.01499,0.00441,0.02734,0.06349,0.03968,0.01675,0.05026,...,0.00000,0.00000,0.00000,0.00265,0.02646,0.01235,0.00088,0.00176,0.00088,0.02205
9766,0,0.01432,0.03411,0.00211,0.00884,0.02063,0.03158,0.00758,0.02063,0.02189,...,0.00211,0.00674,0.00505,0.02611,0.02568,0.01979,0.03579,0.00211,0.00084,0.00000
9767,0,0.02347,0.01578,0.01000,0.01830,0.01953,0.01129,0.01218,0.02150,0.02034,...,0.00592,0.01640,0.00640,0.02511,0.02715,0.03368,0.02551,0.00204,0.00048,0.00102


In [29]:
# Initialize the Random Forest model

rf = RandomForestClassifier()

# Define the grid search parameters

params = {'n_estimators': [100, 150],
         'max_depth': [5, 7],
         'verbose':[0]}

# Initialize the grid search object

grid = GridSearchCV(rf, params)

# Fit the GridSearchCV object

grid.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 7], 'n_estimators': [100, 150],
                         'verbose': [0]})

In [10]:
# Check the best score from the grid search cross-validation

print(f'Best score from the grid search: {grid.best_score_}')

# Check the best parameters from the grid search cross-validation

print(f'The best parameters were: {grid.best_params_}')

Best score from the grid search: 0.7826801377928925
The best parameters were: {'max_depth': 7, 'n_estimators': 150, 'verbose': 1}


In [15]:
# Load the testing data into dataframe

test = pd.read_csv('Data/test.csv')
y_test = test['Kingdom']
X_test = test.drop(columns = ['Kingdom', 'Unnamed: 0',
                               'SpeciesID', 'Ncodons',
                               'SpeciesName'],
                  axis = 1)

# Make predictions using the grid search fitted params

preds = grid.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    0.1s finished


In [28]:
# Check the metric scores

print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(f'Recall: {recall_score(y_test, preds, average = "weighted")}')
print(f'Precision: {precision_score(y_test, preds, average = "weighted", zero_division = 1)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, preds)}')

Accuracy: 0.802272029474977
Recall: 0.802272029474977
Precision: 0.8281696354899317
Confusion Matrix: 
 [[  0   5   0   0   0   0   6   0   0   7   0]
 [  0 718   2   0   0   0  12   0   0  15   0]
 [  0  56 118   0   0   0  54   0   0 120   5]
 [  0   0   1 100   0   0   2   0   0   9  30]
 [  0  46   0   0   0   0   1   0   0   4   0]
 [  0   3   0   0   0   0   0   0   0   1   0]
 [  0  11   1   0   0   0 536   0   0  78   1]
 [  0   0   0   7   0   0   2   1   0   5  28]
 [  0   0   0  16   0   0   1   0  10   7  15]
 [  0  18   0   0   0   0  21   0   0 689   1]
 [  0   1   1   1   0   0  11   0   0  39 441]]
