# Random Forest Classification Model

Random forest model performed using a standard data science pipeline for all preprocessing and feature engineering steps as well as model fitting and grid search.  

The top scores achieved for the model are as follows: <br>

Accuracy:  <br>
Recall: <br>
Precision:  <br>

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

import functions
from functions import get_AA, translation

In [2]:
# Import the training dataset

train = pd.read_csv('Data/train.csv',
                   low_memory = False)
y_train = train['Kingdom']
X_train = train.drop(columns = ['Kingdom'],
                    axis = 1)

In [3]:
X_train.columns

Index(['Unnamed: 0', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName', 'UUU',
       'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA',
       'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU',
       'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC',
       'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC',
       'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG',
       'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG',
       'UAA', 'UAG', 'UGA'],
      dtype='object')

In [4]:
# Create a list of codon columns and categorical columns for preprocessing in pipeline

# Codons columns
columns = X_train.columns.tolist()
codons = columns
unwanted = ['Unnamed: 0', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName']
for ele in unwanted:
    codons.remove(ele)

# Amino acid columns
amino_acids = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu',
       'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Stop',
       'Thr', 'Trp', 'Tyr', 'Val']

# Categorical columns

categorical_cols = ['DNAtype']

# Numeric columns

numeric_cols = codons
for aa in amino_acids:
    numeric_cols.append(aa)

#### Feature engineering 

In [5]:
# Define the custom transformers for feature engineering

AA_transformer = FunctionTransformer(get_AA)  # Get the amino acid frequencies for each organism


# Instantiate the feature engineering pipeline

engineering = Pipeline([('amino', AA_transformer)])

####  Preprocessing

In [6]:
print(numeric_cols)

['UUU', 'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA', 'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU', 'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC', 'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG', 'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG', 'UAA', 'UAG', 'UGA', 'Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Stop', 'Thr', 'Trp', 'Tyr', 'Val']


In [7]:
# define preprocessing steps for categorical features

cat_transform = Pipeline([('ohc', OneHotEncoder())])

# Define preprocessing steps for numerical features

num_transform = Pipeline([('scaler', RobustScaler())])

# Instantiate the column transformer for preprocessing

preprocessing = ColumnTransformer([('cat', cat_transform, categorical_cols),
                                  ('num', num_transform, numeric_cols)])

#### Modelling Pipeline

In [8]:
# Instantiate a pipeline with random forest model 

pipeline = Pipeline([('engineering', engineering),
                    ('preprocessing', preprocessing),
                    ('model', RandomForestClassifier())])

In [9]:
# Set the parameters to be used with grid search

params = {'model__n_estimators': [75, 100, 125, 150, 200],
         'model__max_depth': [5, 6, 7],
         'model__criterion': ['gini', 'entropy']}


grid_search = GridSearchCV(pipeline,
                          param_grid = params,
                          cv = 10,
                          verbose = 2)

In [10]:
# Fit the grid search

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total time=   1.4s
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total time=   1.3s
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total time=   1.3s
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total time=   1.4s
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total time=   1.3s
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total time=   1.3s
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total time=   1.4s
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total time=   1.3s
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total time=   1.3s
[CV] END model__criterion=gini, model__max_depth=5, model__n_estimators=75; total 

300 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Applications/anaconda3/envs/codon/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Applications/anaconda3/envs/codon/lib/python3.7/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Applications/anaconda3/envs/codon/lib/python3.7/site-packages/sklearn/ensemble/_forest.py", line 328, in fit
    X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
  File "/Applications/anaconda3/envs/codon/l

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [None]:
# Import the test dataset

train = pd.read_csv('Data/test.csv',
                   low_memory = False)
y_train = train['Kingdom']
X_train = train.drop(columns = ['Kingdom'],
                    axis = 1)

In [None]:
# Make predictions using the fitted model

preds = grid.predict(X_test)