# Support Vector Machines Classifier

SVM classifier model using a pipeline for all preprocessing and feature engineering steps alongside model fitting within a grid search.

A feature union between select K best and PCA was attempted in the pipeline, but resulted in a significant decrease in all model metrics.

Trained model currently performing as follows: <br>

Accuracy:  0.949<br>
Recall: 0.949<br>
Precision:  0.949<br>

In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

import matplotlib.pyplot as plt

import functions
from functions import get_AA, translation

In [2]:
# Import the training dataset

train = pd.read_csv('Data/train.csv',
                   low_memory = False)
y_train = train['Kingdom']
X_train = train.drop(columns = ['Kingdom'],
                    axis = 1)

In [3]:
# Create a list of codon columns and categorical columns for preprocessing in pipeline

# Codons columns
columns = X_train.columns.tolist()
codons = columns
unwanted = ['Unnamed: 0', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName']
for ele in unwanted:
    codons.remove(ele)

# Amino acid columns
amino_acids = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu',
       'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Stop',
       'Thr', 'Trp', 'Tyr', 'Val']

# Categorical columns

categorical_cols = ['DNAtype']

# Numeric columns

numeric_cols = []

for codon in codons:
    numeric_cols.append(codon)
for aa in amino_acids:
    numeric_cols.append(aa)

In [4]:
def get_AA(df):
    
    """
    Produce new columns representing amino acid frequency for each observation.
    
    Inputs: 
    df - Dataframe containing codon frequencies that we wish get amino acid frequencies for
    columns - A list of column labels representing codons
    
    Output: 
    
    A new dataframe containing both the codon frequencies and the amino acid frequencies
    
    """

    # Define a list of codons to use for the translation
    
    columns = ['UUU', 'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA',
       'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU',
       'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC',
       'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC',
       'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG',
       'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG',
       'UAA', 'UAG', 'UGA']
    
    # Append 'SpeciesName' to the columns list
    
    columns.append('SpeciesName')
    
    # Build a new dataframe with the where the columns are labelled by the amino acid they represent 
    # rather than the codon they represent
    df_AA = df[columns].rename(translation,
                     axis = 1)

    # Group columns representing the same amino acid by their sums
    df_AA = df_AA.groupby(lambda x: x,
                         axis = 1).sum()
    
    # Merge df_AA with df
    df = df.merge(df_AA,
            on = 'SpeciesName',
                 how = 'right')
    return df

In [5]:
# Define the custom transformers for feature engineering

AA_transformer = FunctionTransformer(get_AA)  # Get the amino acid frequencies for each organism


# Instantiate the feature engineering pipeline

engineering = Pipeline([('amino', AA_transformer)])

In [6]:
# define preprocessing steps for categorical features

cat_transform = Pipeline([('ohc', OneHotEncoder(handle_unknown = 'ignore'))])

# Define preprocessing steps for numerical features

num_transform = Pipeline([('scaler', RobustScaler())])

# Instantiate the column transformer for preprocessing

preprocessing = ColumnTransformer([('cat', cat_transform, categorical_cols),
                                  ('num', num_transform, numeric_cols)])

In [7]:
# Instantiate a pipeline with support vector classifier

pipeline = Pipeline([('engineering', engineering),
                    ('preprocessing', preprocessing),
#                      ('features', feature_union),
                    ('model', SVC())])

In [8]:
# Set the parameters to be used with grid search

params = {'model__C': [.1, 1, 3],
         'model__kernel': ['linear', 'rbf'],
          'model__break_ties': [True, False]
         }


grid_search = GridSearchCV(pipeline,
                          param_grid = params,
                          cv = 10,
                          verbose = 3)

In [9]:
X_train.shape

(9760, 69)

In [10]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END model__C=0.1, model__break_ties=True, model__kernel=linear;, score=0.900 total time=   3.3s
[CV 2/10] END model__C=0.1, model__break_ties=True, model__kernel=linear;, score=0.863 total time=   3.1s
[CV 3/10] END model__C=0.1, model__break_ties=True, model__kernel=linear;, score=0.893 total time=   2.9s
[CV 4/10] END model__C=0.1, model__break_ties=True, model__kernel=linear;, score=0.875 total time=   3.0s
[CV 5/10] END model__C=0.1, model__break_ties=True, model__kernel=linear;, score=0.866 total time=   2.8s
[CV 6/10] END model__C=0.1, model__break_ties=True, model__kernel=linear;, score=0.880 total time=   3.3s
[CV 7/10] END model__C=0.1, model__break_ties=True, model__kernel=linear;, score=0.887 total time=   3.5s
[CV 8/10] END model__C=0.1, model__break_ties=True, model__kernel=linear;, score=0.892 total time=   3.0s
[CV 9/10] END model__C=0.1, model__break_ties=True, model__kernel=linear;, score=0.899 to

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('engineering',
                                        Pipeline(steps=[('amino',
                                                         FunctionTransformer(func=<function get_AA at 0x7fef1e49bb00>))])),
                                       ('preprocessing',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('ohc',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['DNAtype']),
                                                                        ('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          RobustScale

In [11]:
print(f"Best parameters: {grid_search.best_params_}")
print(f'Best score: {grid_search.best_score_}')

Best parameters: {'model__C': 3, 'model__break_ties': False, 'model__kernel': 'rbf'}
Best score: 0.9375


In [12]:
# Import the test dataset

test = pd.read_csv('Data/test.csv',
                   low_memory = False)
y_test = test['Kingdom']
X_test = test.drop(columns = ['Kingdom'],
                    axis = 1)

In [13]:
# Make predictions using the fitted model

preds = grid_search.predict(X_test)

In [14]:
# Check the metric scores

print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(f'Recall: {recall_score(y_test, preds, average = "weighted")}')
print(f'Precision: {precision_score(y_test, preds, average = "weighted", zero_division = 1)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, preds)}')

Accuracy: 0.9373079287031346
Recall: 0.9373079287031346
Precision: 0.9372122420100695
Confusion Matrix: 
 [[ 17  11   0   0   0   0   0   0   0   0   0]
 [  1 703   7   0   5   0   2   0   0   4   0]
 [  0   5 299   1   0   0  17   0   0  18   4]
 [  0   0   0 132   0   0   1   3   2   0   4]
 [  1  14   1   0  42   0   0   0   0   2   0]
 [  0   6   0   0   0   0   0   0   0   0   0]
 [  0   8  15   0   0   0 615   0   0  13   1]
 [  0   0   0   8   0   0   0  34   0   2   6]
 [  0   0   0  13   0   0   0   0  34   0   3]
 [  0   2   6   1   1   0   6   0   0 705   1]
 [  0   0   6   1   0   0   1   0   0   1 469]]
