In [1]:
import warnings
# Suppress every warning raised from here on (library deprecation notices included).
# NOTE(review): this also hides warnings that could point at real problems - consider narrowing the filter.
warnings.simplefilter("ignore")

In [2]:
import pandas as pd

import tensorflow
import keras

from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier

import functions
from functions import get_AA, translation

Using TensorFlow backend.


In [3]:
# Import training set
train = pd.read_csv('Data/train.csv', low_memory = False)

# Split the frame: 'Kingdom' is the target, every other column is a feature
y_train = train['Kingdom']
X_train = train.drop(columns = 'Kingdom')

In [4]:
# Create a list of codon columns and categorical columns for preprocessing in pipeline

# Metadata columns of X_train that are not codon frequencies
unwanted = ['Unnamed: 0', 'DNAtype', 'SpeciesID', 'Ncodons', 'SpeciesName']

# Codons columns: every remaining column of X_train, in its original order.
# Built as a new list so that `columns` keeps the complete column list
# instead of being emptied of the metadata names through a shared reference.
columns = X_train.columns.tolist()
codons = [col for col in columns if col not in unwanted]

# Amino acid columns
amino_acids = ['Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Gln', 'Glu',
       'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Stop',
       'Thr', 'Trp', 'Tyr', 'Val']

# Categorical columns

categorical_cols = ['DNAtype']

# Numeric columns: the codon columns followed by the amino acid columns

numeric_cols = codons + amino_acids

In [5]:
def get_AA(df):
    
    """
    Produce new columns representing amino acid frequency for each observation.
    
    Note: this definition shadows the `get_AA` imported from `functions`.
    
    Inputs: 
    df - Dataframe containing codon frequencies that we wish get amino acid frequencies for.
         It must hold one column for each of the 64 codon labels listed in `codon_cols` below.
    
    Output: 
    
    A new dataframe containing every column of df followed by one column per label that
    `translation` assigns to the codons, each holding the row-wise sum of the codon columns
    sharing that label. The rows, their order and the index of df are preserved, and df
    itself is not modified.
    
    """

    # Define a list of codons to use for the translation
    
    codon_cols = ['UUU', 'UUC', 'UUA', 'UUG', 'CUU', 'CUC', 'CUA', 'CUG', 'AUU', 'AUC', 'AUA',
       'AUG', 'GUU', 'GUC', 'GUA', 'GUG', 'GCU', 'GCC', 'GCA', 'GCG', 'CCU',
       'CCC', 'CCA', 'CCG', 'UGG', 'GGU', 'GGC', 'GGA', 'GGG', 'UCU', 'UCC',
       'UCA', 'UCG', 'AGU', 'AGC', 'ACU', 'ACC', 'ACA', 'ACG', 'UAU', 'UAC',
       'CAA', 'CAG', 'AAU', 'AAC', 'UGU', 'UGC', 'CAU', 'CAC', 'AAA', 'AAG',
       'CGU', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG', 'GAU', 'GAC', 'GAA', 'GAG',
       'UAA', 'UAG', 'UGA']
    
    # Build a new dataframe where the columns are labelled by the amino acid they represent 
    # rather than the codon they represent
    df_AA = df[codon_cols].rename(columns = translation)

    # Group columns representing the same amino acid by their sums.
    # Grouping the transposed frame on its index gives the same column-wise grouping
    # without groupby(axis = 1), which recent pandas versions deprecate.
    df_AA = df_AA.T.groupby(level = 0).sum().T
    
    # Attach the amino acid columns row by row. df_AA shares df's index, so no key is needed;
    # merging on 'SpeciesName' would multiply rows whenever a species name is repeated.
    return pd.concat([df, df_AA],
                     axis = 1)

In [38]:
# Define the function to create our KerasClassifier model

def create_NN(input_dim = 96, n_classes = 11):
    
    """
    Build and compile the dense neural network used as the KerasClassifier build function.
    
    Inputs: 
    input_dim - Number of features fed to the first layer. Defaults to 96, the value this
                notebook was trained with.
    n_classes - Number of units in the softmax output layer. Defaults to 11, the value this
                notebook was trained with.
    
    Output: 
    
    A compiled Sequential model with four relu hidden layers (144, 72, 36 and 18 units),
    dropout after the first two hidden layers, and a softmax output layer.
    
    """
    
    # Model architecture
    model = Sequential()
    model.add(Dense(144, input_dim = input_dim, activation = 'relu'))
    model.add(Dropout(0.5))
    model.add(Dense(72, activation = 'relu'))
    model.add(Dropout(0.25))
    model.add(Dense(36, activation = 'relu'))
    model.add(Dense(18, activation = 'relu'))
    model.add(Dense(n_classes, activation = 'softmax'))
    
    #Model compilation
    model.compile(loss = 'categorical_crossentropy',
                 optimizer = 'adam',
                 metrics = ['accuracy'])
    
    return model

In [31]:
# Define the custom transformers for feature engineering

# Wrap get_AA so the amino acid frequencies are computed as a pipeline step
AA_transformer = FunctionTransformer(func = get_AA)

# Instantiate the feature engineering pipeline (a single 'amino' step)
engineering = Pipeline(steps = [('amino', AA_transformer)])

In [32]:
# Preprocessing for categorical features: one-hot encode, ignoring categories unseen during fit
cat_transform = Pipeline(steps = [('ohc', OneHotEncoder(handle_unknown = 'ignore'))])

# Preprocessing for numerical features: robust scaling
num_transform = Pipeline(steps = [('scaler', RobustScaler())])

# Column transformer routing each column group to its preprocessing pipeline
preprocessing = ColumnTransformer(
    transformers = [
        ('cat', cat_transform, categorical_cols),
        ('num', num_transform, numeric_cols),
    ]
)

In [33]:
# Chain feature engineering, preprocessing and the Keras classifier into one pipeline
pipeline = Pipeline(
    steps = [
        ('engineering', engineering),
        ('preprocessing', preprocessing),
        ('model', KerasClassifier(build_fn = create_NN,
                                  epochs = 250,
                                  batch_size = 2048,
                                  verbose = 2)),
    ]
)

In [34]:
# Fit the whole pipeline (feature engineering, preprocessing, then the Keras model) on the training set.
# NOTE(review): Pipeline.fit returns the fitted pipeline itself, so `history` is the pipeline
# (it is used for .predict further down), not a Keras training-history object.
history = pipeline.fit(X_train, y_train)

Epoch 1/250
 - 0s - loss: 2.3444 - accuracy: 0.1949
Epoch 2/250
 - 0s - loss: 2.1004 - accuracy: 0.3112
Epoch 3/250
 - 0s - loss: 1.9391 - accuracy: 0.3696
Epoch 4/250
 - 0s - loss: 1.8090 - accuracy: 0.4198
Epoch 5/250
 - 0s - loss: 1.6800 - accuracy: 0.4567
Epoch 6/250
 - 0s - loss: 1.5447 - accuracy: 0.5259
Epoch 7/250
 - 0s - loss: 1.4035 - accuracy: 0.5853
Epoch 8/250
 - 0s - loss: 1.2674 - accuracy: 0.6220
Epoch 9/250
 - 0s - loss: 1.1516 - accuracy: 0.6558
Epoch 10/250
 - 0s - loss: 1.0627 - accuracy: 0.6690
Epoch 11/250
 - 0s - loss: 0.9919 - accuracy: 0.6980
Epoch 12/250
 - 0s - loss: 0.9228 - accuracy: 0.7170
Epoch 13/250
 - 0s - loss: 0.8719 - accuracy: 0.7311
Epoch 14/250
 - 0s - loss: 0.8156 - accuracy: 0.7468
Epoch 15/250
 - 0s - loss: 0.7627 - accuracy: 0.7658
Epoch 16/250
 - 0s - loss: 0.7342 - accuracy: 0.7698
Epoch 17/250
 - 0s - loss: 0.6929 - accuracy: 0.7791
Epoch 18/250
 - 0s - loss: 0.6576 - accuracy: 0.7916
Epoch 19/250
 - 0s - loss: 0.6353 - accuracy: 0.7968
Ep

In [35]:
# Import the test dataset
test = pd.read_csv('Data/test.csv', low_memory = False)

# Split the frame: 'Kingdom' is the target, every other column is a feature
y_test = test['Kingdom']
X_test = test.drop(columns = 'Kingdom')

In [36]:
# Make predictions on the test set using the fitted pipeline
# (`history` is the object returned by pipeline.fit; the model is a dense network built from Dense/Dropout layers)

preds = history.predict(X_test)

In [37]:
# Check the metric scores
# Weighted-average recall is equal to accuracy, which is why the two printed values match.
# zero_division = 1 scores the precision of a class that is never predicted as 1 instead of 0;
# in the recorded run the sixth confusion-matrix column is all zeros, so that class is affected.

print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(f'Recall: {recall_score(y_test, preds, average = "weighted")}')
print(f'Precision: {precision_score(y_test, preds, average = "weighted", zero_division = 1)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, preds)}')

Accuracy: 0.9419176398279041
Recall: 0.9419176398279041
Precision: 0.941863035075082
Confusion Matrix: 
 [[ 21   6   0   0   0   0   0   0   0   1   0]
 [  1 702   8   0   1   0   8   0   0   2   0]
 [  0   3 311   1   0   0  12   1   0  11   5]
 [  0   0   0 129   0   0   0   2   6   1   4]
 [  1  14   0   0  41   0   0   0   0   4   0]
 [  0   5   0   0   1   0   0   0   0   0   0]
 [  0   8  12   0   0   0 623   0   0   8   1]
 [  0   0   0   9   0   0   0  35   0   1   5]
 [  0   0   0  14   0   0   0   0  35   0   1]
 [  1   2   9   0   1   0   8   0   0 701   0]
 [  0   0   9   0   0   0   1   0   0   1 467]]
