# Keras Deep Neural Network Classifier

NN classifier model using a pipeline for all preprocessing and feature engineering steps

Trained model currently performing as follows: <br>

Accuracy:  0.942<br>
Recall: 0.942<br>
Precision:  0.942<br>

In [1]:
import warnings
warnings.simplefilter("ignore")

In [86]:
import pandas as pd

import tensorflow
import keras

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, f1_score

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

import functions
from functions import get_AA, translation

In [3]:
# Import training set

train = pd.read_csv('Data/train.csv',
                   low_memory = False)

# Separate target feature
y_train = train['Kingdom']

# Drop target from training data
X_train = train.drop(columns = ['Kingdom'],
                    axis = 1)

In [4]:
# Create a list of codon columns and categorical columns for preprocessing in pipeline

# Codons columns
columns = X_train.columns.tolist()
codons = columns
unwanted = ['Unnamed: 0', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName']
for ele in unwanted:
    codons.remove(ele)

# Amino acid columns
amino_acids = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu',
       'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Stop',
       'Thr', 'Trp', 'Tyr', 'Val']

# Categorical columns

categorical_cols = ['DNAtype']

# Numeric columns

numeric_cols = []

for codon in codons:
    numeric_cols.append(codon)
for aa in amino_acids:
    numeric_cols.append(aa)

In [5]:
def get_AA(df):
    
    """
    Produce new columns representing amino acid frequency for each observation.
    
    Inputs: 
    df - Dataframe containing codon frequencies that we wish get amino acid frequencies for
    columns - A list of column labels representing codons
    
    Output: 
    
    A new dataframe containing both the codon frequencies and the amino acid frequencies
    
    """

    # Define a list of codons to use for the translation
    
    columns = ['UUU', 'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA',
       'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU',
       'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC',
       'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC',
       'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG',
       'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG',
       'UAA', 'UAG', 'UGA']
    
    # Append 'SpeciesName' to the columns list
    
    columns.append('SpeciesName')
    
    # Build a new dataframe with the where the columns are labelled by the amino acid they represent 
    # rather than the codon they represent
    df_AA = df[columns].rename(translation,
                     axis = 1)

    # Group columns representing the same amino acid by their sums
    df_AA = df_AA.groupby(lambda x: x,
                         axis = 1).sum()
    
    # Merge df_AA with df
    df = df.merge(df_AA,
            on = 'SpeciesName',
                 how = 'right')
    return df

### Building our Neural Network

In [6]:
# Define the function to create our KerasClassifier model

def create_NN():
    
    # Model architecture
    model = Sequential()
    model.add(Dense(144, input_dim = 96, activation = 'relu'))
    model.add(Dropout(0.5))
    model.add(Dense(72, activation = 'relu'))
    model.add(Dropout(0.25))
    model.add(Dense(36, activation = 'relu'))
#     model.add(Dropout(0.2))
    model.add(Dense(18, activation = 'relu'))
    model.add(Dense(11, activation = 'softmax'))
    
    #Model compilation
    model.compile(loss = 'categorical_crossentropy',
                 optimizer = 'adam',
                 metrics = ['accuracy'])
    
    return model

In [7]:
# Define the custom transformers for feature engineering

AA_transformer = FunctionTransformer(get_AA)  # Get the amino acid frequencies for each organism


# Instantiate the feature engineering pipeline

engineering = Pipeline([('amino', AA_transformer)])

In [8]:
# define preprocessing steps for categorical features

cat_transform = Pipeline([('ohc', OneHotEncoder(handle_unknown = 'ignore'))])

# Define preprocessing steps for numerical features

num_transform = Pipeline([('scaler', RobustScaler())])

# Instantiate the column transformer for preprocessing

preprocessing = ColumnTransformer([('cat', cat_transform, categorical_cols),
                                  ('num', num_transform, numeric_cols)])

In [9]:
# Instantiate a pipeline with keras classifier model

pipeline = Pipeline([('engineering', engineering),
                    ('preprocessing', preprocessing),
                    ('model', KerasClassifier(build_fn = create_NN,
                                             epochs = 250,
                                             batch_size = 2048,
                                             verbose = 1))])

In [10]:
history = pipeline.fit(X_train, y_train)




2022-01-13 13:38:52.919447: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA


Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

### Model Evaluation

In [11]:
# Import the test dataset

test = pd.read_csv('Data/test.csv',
                   low_memory = False)
y_test = test['Kingdom']
X_test = test.drop(columns = ['Kingdom'],
                    axis = 1)

In [12]:
# Make predictions on the test set using the CNN

preds = history.predict(X_test)



In [89]:
# Check the metric scores

labels = y_train.unique().tolist()
print(labels)

print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(f'Recall: {recall_score(y_test, preds, average = "weighted")}')
print(f'Precision: {precision_score(y_test, preds, average = "weighted", zero_division = 1)}')
print(f'F1 Score: {f1_score(y_test, preds, average = "weighted")}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, preds, labels = y_train.unique().tolist())}')

['bacteria', 'plant', 'virus', 'invertebrate', 'archaea', 'vertebrate', 'primate', 'bacteriophage', 'rodent', 'mammal', 'plasmid']
Accuracy: 0.9326982175783651
Recall: 0.9326982175783651
Precision: 0.932214555297477
F1 Score: 0.9307379376615177
Confusion Matrix: 
 [[696   4   2  11   2   0   0   7   0   0   0]
 [  8 618   6  18   0   2   0   0   0   0   0]
 [  3  10 699   8   1   0   0   0   0   1   0]
 [  3  14  18 300   0   7   1   0   1   0   0]
 [  4   0   0   0  24   0   0   0   0   0   0]
 [  0   3   2   3   0 466   1   0   0   3   0]
 [  0   0   1   0   0   4  32   0   1  12   0]
 [ 13   1   3   1   1   0   0  41   0   0   0]
 [  0   0   0   0   0   4   1   0  30  15   0]
 [  0   0   0   0   0   5   5   0   3 129   0]
 [  5   0   0   0   0   0   0   1   0   0   0]]


In [71]:
# Create the confusion matrix as a dataframe

matrix = pd.DataFrame(confusion_matrix(y_test, preds, labels = labels))

In [72]:
# Replace the integer values in the column and index with their kingdom names

num_to_val = {0: 'bacteria',
 1: 'plant',
 2: 'virus',
 3: 'invertebrate',
 4: 'archaea',
 5: 'vertebrate',
 6: 'primate',
 7: 'bacteriophage',
 8: 'rodent',
 9: 'mammal',
 10:'plasmid'}

matrix.rename(mapper = num_to_val,
             inplace = True,
             axis = 0) # Index
matrix.rename(mapper = num_to_val,
             inplace = True,
             axis = 1) # Columns


# make a list of kingdom columns
kingdoms = matrix.columns

In [90]:
# Make a column with the number of correctly identified samples
true_positives = []
for i in range(len(columns)):
    true_positives.append(matrix[kingdoms[i]].iloc[i])
matrix['true positives'] = true_positives

# Get the total number of labels assigned per kingdom
matrix['total positives'] = matrix[kingdoms].sum(axis = 0)

# Get the number of false negatives
matrix['false negatives'] = matrix[kingdoms].sum(axis = 1) - matrix['true positives']

# Get the number of false positives
matrix['false positives'] = (matrix.sum(axis = 0) - matrix['true positives']).astype(int)

# Calculate the recall score for each kingdom
matrix['recall'] = (matrix['true positives']/(matrix['true positives'] + matrix['false negatives'])).round(2) # Round to 2 decimal places

# Calculate the precision score for each kingdom
matrix['precision'] = (matrix['true positives']/(matrix['true positives'] + matrix['false positives'])).round(2)# Round to 2 decimal places



IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [85]:
matrix

Unnamed: 0,bacteria,plant,virus,invertebrate,archaea,vertebrate,primate,bacteriophage,rodent,mammal,plasmid,true positives,total positives,false negatives,false positives,recall,precision
bacteria,696,4,2,11,2,0,0,7,0,0,0,696,732,26,36.0,0.96,0.95
plant,8,618,6,18,0,2,0,0,0,0,0,618,650,34,32.0,0.95,0.95
virus,3,10,699,8,1,0,0,0,0,1,0,699,731,23,32.0,0.97,0.96
invertebrate,3,14,18,300,0,7,1,0,1,0,0,300,341,44,41.0,0.87,0.88
archaea,4,0,0,0,24,0,0,0,0,0,0,24,28,4,4.0,0.86,0.86
vertebrate,0,3,2,3,0,466,1,0,0,3,0,466,488,12,22.0,0.97,0.95
primate,0,0,1,0,0,4,32,0,1,12,0,32,40,18,8.0,0.64,0.8
bacteriophage,13,1,3,1,1,0,0,41,0,0,0,41,49,19,8.0,0.68,0.84
rodent,0,0,0,0,0,4,1,0,30,15,0,30,35,20,5.0,0.6,0.86
mammal,0,0,0,0,0,5,5,0,3,129,0,129,160,13,31.0,0.91,0.81
