# Baseline classification model

This is the baseline model for the project.  It is a random forest model coupled with a basic grid search.  The scores for this baseline model are as follows:  <br>
Accuracy: 0.794 <br>
Recall: 0.794 <br>
Precision: 0.821 <br>

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

In [2]:
# Import the training dataset

train = pd.read_csv('Data/train.csv',
                   low_memory = False)
y_train = train['Kingdom']
X_train = train.drop(columns = ['Kingdom', 'Unnamed: 0',
                               'SpeciesID', 'Ncodons',
                               'SpeciesName'],
                    axis = 1)

In [3]:
X_train

Unnamed: 0,DNAtype,UUU,UUC,UUA,UUG,CUU,CUC,CUA,CUG,AUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
0,0,0.03518,0.01379,0.02855,0.01564,0.02837,0.01821,0.00813,0.00778,0.04234,...,0.00362,0.00893,0.00283,0.03235,0.01016,0.04189,0.01229,0.00194,0.00080,0.00133
1,2,0.06000,0.03200,0.03400,0.02800,0.02300,0.00200,0.01900,0.00800,0.04700,...,0.00400,0.01800,0.00800,0.02700,0.00800,0.04600,0.01000,0.00000,0.00000,0.00200
2,0,0.03044,0.01872,0.01767,0.01817,0.02890,0.01387,0.01117,0.01049,0.03032,...,0.00061,0.01252,0.00608,0.03185,0.01921,0.02406,0.02160,0.00104,0.00037,0.00018
3,1,0.03519,0.02463,0.02609,0.01914,0.02109,0.01279,0.01376,0.01024,0.03371,...,0.00872,0.01729,0.00935,0.02493,0.00973,0.03027,0.01473,0.00190,0.00139,0.00177
4,0,0.01982,0.02294,0.00915,0.02284,0.02213,0.01338,0.00714,0.00895,0.02324,...,0.00382,0.01197,0.01247,0.02978,0.01660,0.03612,0.03209,0.00231,0.00020,0.00161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9755,0,0.00677,0.03625,0.00242,0.01353,0.02078,0.05607,0.00532,0.01788,0.00967,...,0.00725,0.00290,0.00290,0.01740,0.02658,0.01643,0.02707,0.00000,0.00000,0.00048
9756,1,0.04499,0.01969,0.06355,0.00900,0.02250,0.01012,0.03375,0.00619,0.05568,...,0.00056,0.00056,0.00000,0.00900,0.00787,0.01744,0.00169,0.00225,0.00056,0.02812
9757,0,0.02657,0.02523,0.01039,0.01821,0.02509,0.01565,0.01066,0.00958,0.01713,...,0.00324,0.01214,0.00944,0.03170,0.02657,0.02752,0.03332,0.00094,0.00027,0.00148
9758,0,0.02347,0.01578,0.01000,0.01830,0.01953,0.01129,0.01218,0.02150,0.02034,...,0.00592,0.01640,0.00640,0.02511,0.02715,0.03368,0.02551,0.00204,0.00048,0.00102


In [4]:
# Initialize the Random Forest model

rf = RandomForestClassifier()

# Define the grid search parameters

params = {'n_estimators': [100, 150],
         'max_depth': [5, 7],
         'verbose':[0]}

# Initialize the grid search object

grid = GridSearchCV(rf, params)

# Fit the GridSearchCV object

grid.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 7], 'n_estimators': [100, 150],
                         'verbose': [0]})

In [5]:
# Check the best score from the grid search cross-validation

print(f'Best score from the grid search: {grid.best_score_}')

# Check the best parameters from the grid search cross-validation

print(f'The best parameters were: {grid.best_params_}')

Best score from the grid search: 0.7860655737704918
The best parameters were: {'max_depth': 7, 'n_estimators': 150, 'verbose': 0}


In [6]:
# Load the testing data into dataframe

test = pd.read_csv('Data/test.csv')
y_test = test['Kingdom']
X_test = test.drop(columns = ['Kingdom', 'Unnamed: 0',
                               'SpeciesID', 'Ncodons',
                               'SpeciesName'],
                  axis = 1)

# Make predictions using the grid search fitted params

preds = grid.predict(X_test)

In [7]:
# Check the metric scores

print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(f'Recall: {recall_score(y_test, preds, average = "weighted")}')
print(f'Precision: {precision_score(y_test, preds, average = "weighted", zero_division = 1)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, preds)}')

Accuracy: 0.7808850645359557
Recall: 0.7808850645359557
Precision: 0.8120556665282921
Confusion Matrix: 
 [[  0  14   0   0   0   0   7   0   0   7   0]
 [  0 680   2   0   0   0   7   0   0  33   0]
 [  0  52 109   0   0   0  58   0   0 115  10]
 [  0   1   2 102   0   0   2   0   0   7  28]
 [  0  45   0   0   0   0   5   0   0  10   0]
 [  0   5   0   0   0   0   0   0   0   1   0]
 [  0  26   0   0   0   0 529   0   0  97   0]
 [  0   1   0   6   0   0   1   2   0   8  32]
 [  0   0   0  20   0   0   2   0   9   5  14]
 [  0  13   0   0   0   0  31   0   0 677   1]
 [  0   1   1   0   0   0  11   0   0  32 433]]
