# Random Forest Classification Model

Random forest model performed using a standard data science pipeline for all preprocessing and feature engineering steps as well as model fitting and grid search.  

The top scores achieved for the model are as follows: <br>

Accuracy:  <br>
Recall: <br>
Precision:  <br>

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

import functions
from functions import get_AA, translation

In [2]:
# Import the training dataset

train = pd.read_csv('Data/train.csv',
                   low_memory = False)
y_train = train['Kingdom']
X_train = train.drop(columns = ['Kingdom'],
                    axis = 1)

In [3]:
X_train.columns

Index(['Unnamed: 0', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName', 'UUU',
       'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA',
       'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU',
       'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC',
       'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC',
       'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG',
       'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG',
       'UAA', 'UAG', 'UGA'],
      dtype='object')

In [4]:
# Create a list of codon columns and categorical columns for preprocessing in pipeline

# Codons columns
columns = X_train.columns.tolist()
codons = columns
unwanted = ['Unnamed: 0', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName']
for ele in unwanted:
    codons.remove(ele)

# Amino acid columns
amino_acids = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu',
       'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Stop',
       'Thr', 'Trp', 'Tyr', 'Val']

# Categorical columns

categorical_cols = ['DNAtype']

# Numeric columns

numeric_cols = []

for codon in codons:
    numeric_cols.append(codon)
for aa in amino_acids:
    numeric_cols.append(aa)

In [5]:
def get_AA(df):
    
    """
    Produce new columns representing amino acid frequency for each observation.
    
    Inputs: 
    df - Dataframe containing codon frequencies that we wish get amino acid frequencies for
    columns - A list of column labels representing codons
    
    Output: 
    
    A new dataframe containing both the codon frequencies and the amino acid frequencies
    
    """

    # Define a list of codons to use for the translation
    
    columns = ['UUU', 'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA',
       'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU',
       'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC',
       'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC',
       'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG',
       'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG',
       'UAA', 'UAG', 'UGA']
    
    # Append 'SpeciesName' to the columns list
    
    columns.append('SpeciesName')
    
    # Build a new dataframe with the where the columns are labelled by the amino acid they represent 
    # rather than the codon they represent
    df_AA = df[columns].rename(translation,
                     axis = 1)

    # Group columns representing the same amino acid by their sums
    df_AA = df_AA.groupby(lambda x: x,
                         axis = 1).sum()
    
    # Merge df_AA with df
    df = df.merge(df_AA,
            on = 'SpeciesName',
                 how = 'right')
    return df

#### Feature engineering 

In [6]:
# Define the custom transformers for feature engineering

AA_transformer = FunctionTransformer(get_AA)  # Get the amino acid frequencies for each organism


# Instantiate the feature engineering pipeline

engineering = Pipeline([('amino', AA_transformer)])

####  Preprocessing

In [7]:
print(numeric_cols)

['UUU', 'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA', 'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU', 'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC', 'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC', 'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG', 'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG', 'UAA', 'UAG', 'UGA', 'Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu', 'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Stop', 'Thr', 'Trp', 'Tyr', 'Val']


In [8]:
# define preprocessing steps for categorical features

cat_transform = Pipeline([('ohc', OneHotEncoder(handle_unknown = 'ignore'))])

# Define preprocessing steps for numerical features

num_transform = Pipeline([('scaler', RobustScaler())])

# Instantiate the column transformer for preprocessing

preprocessing = ColumnTransformer([('cat', cat_transform, categorical_cols),
                                  ('num', num_transform, numeric_cols)])

#### Modelling Pipeline

In [9]:
# Instantiate a pipeline with random forest model 

pipeline = Pipeline([('engineering', engineering),
                    ('preprocessing', preprocessing),
                    ('model', RandomForestClassifier())])

In [10]:
# Set the parameters to be used with grid search

params = {'model__n_estimators': [75, 100, 150, 200],
         'model__max_depth': [5, 6, 7],
         'model__criterion': ['entropy', 'gini']}


grid_search = GridSearchCV(pipeline,
                          param_grid = params,
                          cv = 10,
                          verbose = 2)

In [11]:
X_train.shape

(9760, 69)

In [12]:
# Fit the grid search

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits
[CV] END model__criterion=entropy, model__max_depth=5, model__n_estimators=75; total time=   4.7s
[CV] END model__criterion=entropy, model__max_depth=5, model__n_estimators=75; total time=   4.6s
[CV] END model__criterion=entropy, model__max_depth=5, model__n_estimators=75; total time=   4.6s
[CV] END model__criterion=entropy, model__max_depth=5, model__n_estimators=75; total time=   4.6s
[CV] END model__criterion=entropy, model__max_depth=5, model__n_estimators=75; total time=   4.6s
[CV] END model__criterion=entropy, model__max_depth=5, model__n_estimators=75; total time=   4.6s
[CV] END model__criterion=entropy, model__max_depth=5, model__n_estimators=75; total time=   4.7s
[CV] END model__criterion=entropy, model__max_depth=5, model__n_estimators=75; total time=   4.5s
[CV] END model__criterion=entropy, model__max_depth=5, model__n_estimators=75; total time=   4.6s
[CV] END model__criterion=entropy, model__max_depth=5, 

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('engineering',
                                        Pipeline(steps=[('amino',
                                                         FunctionTransformer(func=<function get_AA at 0x7fe899fb9680>))])),
                                       ('preprocessing',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('ohc',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['DNAtype']),
                                                                        ('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          RobustScale

In [13]:
grid_search.best_score_

0.8090163934426229

In [14]:
# Import the test dataset

train = pd.read_csv('Data/test.csv',
                   low_memory = False)
y_train = train['Kingdom']
X_train = train.drop(columns = ['Kingdom'],
                    axis = 1)

In [15]:
# Make predictions using the fitted model

preds = grid.predict(X_test)

NameError: name 'grid' is not defined