# Final Models

This notebook trains and evaluates the final leave-one-protein-out (LOPO) models: for each protein, a model is trained on all other proteins and evaluated on the held-out one.

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import scale
from tensorflow.keras import optimizers, Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [1]:
def label_type(row, lower=0.95, upper=1.05):
    """Convert a continuous scaled effect into a categorical label.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row as a pandas Series)
        Must provide a numeric value under the key ``'scaled_effect'``.
    lower : float, default 0.95
        Values strictly below this threshold are labelled ``'Deleterious'``.
    upper : float, default 1.05
        Values strictly above this threshold are labelled ``'Beneficial'``.

    Returns
    -------
    str
        ``'Deleterious'``, ``'Beneficial'`` or ``'Neutral'``. Values inside the
        closed interval ``[lower, upper]`` are ``'Neutral'``.
    """
    effect = row['scaled_effect']
    if effect < lower:
        return 'Deleterious'
    if effect > upper:
        return 'Beneficial'
    # NaN compares False against both thresholds, so a missing effect also
    # lands here and is labelled 'Neutral'.
    return 'Neutral'
    
def lopo_train_test_split(protein, curr_data):
    """Build a leave-one-protein-out (LOPO) train/test split.

    Rows whose ``protein`` column equals ``protein`` form the test set; all
    other rows form the training set.

    Parameters
    ----------
    protein
        Value compared against ``curr_data['protein']`` to select the held-out
        rows.
    curr_data : pandas.DataFrame
        Must contain the columns ``protein``, ``pdb``, ``resnum`` and ``type``.
        Every remaining column is used as a feature.

    Returns
    -------
    x_train, y_train, x_test, y_test
        ``x_*`` are the feature matrices returned by ``scale``; ``y_*`` are the
        one-hot label matrices returned by ``to_categorical``.
    """
    id_columns = ['protein', 'pdb', 'resnum']
    is_held_out = curr_data['protein'] == protein
    train_data = curr_data[~is_held_out].drop(id_columns, axis=1)
    test_data = curr_data[is_held_out].drop(id_columns, axis=1)

    # One encoder for both splits, fit on the full label column. Fitting a
    # second encoder on the test labels alone would renumber the classes
    # whenever the held-out protein lacks one of them, so the test one-hot
    # columns would no longer line up with the training ones.
    encoder = LabelEncoder()
    encoder.fit(curr_data['type'])
    n_classes = len(encoder.classes_)

    # Fixing num_classes keeps both one-hot matrices the same width even when
    # the highest-numbered class is absent from a split.
    y_train = to_categorical(encoder.transform(train_data['type']),
                             num_classes=n_classes)
    y_test = to_categorical(encoder.transform(test_data['type']),
                            num_classes=n_classes)

    # NOTE(review): each split is standardised with its own mean/std. Confirm
    # this is intended rather than reusing the training statistics on the
    # test set.
    # NOTE(review): every column other than the id columns and `type` becomes
    # a feature. If the frame passed in still carries `scaled_effect` (the
    # column `type` is derived from), the label leaks into the features.
    x_train = scale(train_data.drop(['type'], axis=1))
    x_test = scale(test_data.drop(['type'], axis=1))

    return x_train, y_train, x_test, y_test

def nn_model(input_dim=969, hidden_units=400, n_hidden_layers=17, n_classes=3,
             learning_rate=0.1):
    """Build and compile the fully-connected softmax classifier.

    With the default arguments the network is the one this notebook has always
    used: 969 inputs, 17 ReLU hidden layers of 400 units, and a 3-way softmax
    output, optimised with Adam.

    Parameters
    ----------
    input_dim : int, default 969
        Number of input features.
    hidden_units : int, default 400
        Width of every hidden layer.
    n_hidden_layers : int, default 17
        Number of ReLU hidden layers.
    n_classes : int, default 3
        Number of softmax output units.
    learning_rate : float, default 0.1
        Learning rate handed to the Adam optimizer.

    Returns
    -------
    tf.keras.Model
        Compiled model. ``evaluate`` reports loss, categorical accuracy,
        precision and recall, in that order.
    """
    inputs = Input(shape=(input_dim,))
    x = inputs
    for _ in range(n_hidden_layers):
        x = Dense(hidden_units, activation=tf.nn.relu)(x)
    outputs = Dense(n_classes, activation=tf.nn.softmax)(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    # NOTE(review): the default learning rate of 0.1 is kept for backward
    # compatibility, but it is far above Keras' documented Adam default
    # (0.001). Confirm it is intentional for a network this deep.
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizers.Adam(learning_rate=learning_rate),
                  metrics=[tf.keras.metrics.CategoricalAccuracy(),
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model

In [2]:
# Read the merged dataset, derive the categorical label for every row, and
# collect the distinct protein identifiers that the LOPO loop iterates over.
data_path = 'data/merged.csv'
data = pd.read_csv(data_path)
data['type'] = data.apply(label_type, axis=1)
proteins = list(data['protein'].unique())

NameError: name 'pd' is not defined

In [None]:
# Train and evaluate one model per held-out protein, saving a confusion-matrix
# heatmap for each.

# Tick labels in label-index order. This assumes indices 0/1/2 correspond to
# the alphabetically sorted class names, which is how LabelEncoder numbers them.
class_names = ['Beneficial', 'Deleterious', 'Neutral']
cmatrix_dir = 'img/finalmodels-cmatrix/'
# savefig does not create missing directories.
os.makedirs(cmatrix_dir, exist_ok=True)

for protein in proteins:
    # NOTE(review): `data_final` is not defined in any visible cell, so this
    # line raises NameError on a fresh kernel. Restore the cell that builds it
    # (or confirm which frame should be passed here).
    x_train, y_train, x_test, y_test = lopo_train_test_split(protein, data_final)

    # Monitoring Statement
    print("Current Protein: " + protein)
    curr_model = nn_model()
    curr_model.fit(x_train, y_train, epochs=10, batch_size=32, verbose=1)
    loss, acc, prec, rec = curr_model.evaluate(x_test, y_test)
    print(loss, acc, prec, rec)

    # predict() takes features only; its second positional argument is
    # batch_size, so the labels must not be passed here.
    predictions = curr_model.predict(x_test)
    y_true = np.argmax(y_test, axis=1)
    y_pred = np.argmax(predictions, axis=1)

    # Pin the label set so the matrix is always len(class_names) square, even
    # when a class is absent from both y_true and y_pred for this protein.
    cmatrix = confusion_matrix(y_true, y_pred,
                               labels=list(range(len(class_names))))

    fig, ax = plt.subplots()
    sns.heatmap(cmatrix, xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title('{}'.format(protein))
    # confusion_matrix puts true classes on rows and predictions on columns.
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    fig.savefig(cmatrix_dir + protein + '-finalmodels-cmatrix.png', dpi=300)
    plt.show()
    # Release the figure so open figures do not accumulate across proteins.
    plt.close(fig)
    
    