# Convolutional Neural Network (1x1/2 width/strides, 8 filters; 5x5/2 width/strides, 32 filters)
To run this notebook, install the [hyperas](https://github.com/maxpumperla/hyperas) dependency and run each cell. This notebook trains the feed forward neural network on the training data, and evaluates it on the test dataset. It saves the model into the `\weights` directory.

In [None]:
# Install dependencies
%pip install hyperas

In [2]:
import h5py
from os.path import join,exists
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense, MaxPool2D, Dropout
from tensorflow.keras.optimizers import RMSprop
from hyperas.distributions import choice
from hyperas import optim
from keras.callbacks import ModelCheckpoint
from keras.constraints import maxnorm
from random import randint
from keras import backend as K
import os
import numpy as np
from hyperopt import Trials, STATUS_OK, tpe
import tensorflow as tf
from sklearn.utils import shuffle
from sklearn.metrics import roc_curve,roc_auc_score
K.set_image_data_format('channels_first')

In [3]:
# Changes to working directory
os.chdir('/content/drive/My Drive/github/NNanobody/regression')

In [8]:
def create_model(X_train, Y_train, X_test, Y_test):
    """
    create_model compiles and fits the model for five epochs in order for tuning 
    of hyperparameters.

    :param X_train: Independent variable of training data (one hot encoded CDR3 sequences)
    :param Y_train: Dependent variable of training data (enrichment of CDR3 sequences)
    :param X_test: Independent variable of test data (one hot encoded CDR3 sequences)
    :param Y_test: Dependent variable of test data (enrichment of CDR3 sequences)
    :return: Returns the validation loss, hyperopt status, and the model itself after training
    """
    
    W_maxnorm = 3 # Normalization of weights to prevent overfit
    DROPOUT = {{choice([0.3,0.5,0.7])}} # Dropout choices for hyperparameters optimization

    # Construct CNN. Uses leaky ReLU activation function to fix "dying ReLU" problem and speed up training.
    model = Sequential()
    model.add(Conv2D(8, (1, 1), padding='same', input_shape=(20, 1, 20),activation='relu',kernel_constraint=maxnorm(W_maxnorm)))
    model.add(MaxPool2D(pool_size=(1, 1),strides=(1,1)))
    model.add(Conv2D(64, (1, 5), padding='same', activation='relu',kernel_constraint=maxnorm(W_maxnorm)))
    model.add(MaxPool2D(pool_size=(1, 2),strides=(1,2)))
    model.add(Flatten())
    model.add(Dense(16,activation='relu',kernel_constraint=maxnorm(W_maxnorm)))
    model.add(Dropout(DROPOUT))
    model.add(Dense(2,kernel_constraint=maxnorm(W_maxnorm),activation='softmax'))


    myoptimizer = RMSprop(learning_rate={{choice([1e-1,0.01,0.001,0.0001,1e-5])}}, rho=0.9, epsilon=1e-06) # RMSProp optimizer for the model. Learning rate choices for hyperparameter optimization.
    mylossfunc = 'binary_crossentropy'
    model.compile(loss=mylossfunc, optimizer=myoptimizer, metrics=['accuracy']) # Compile model
    result = model.fit(X_train, Y_train, batch_size=100, epochs=5,validation_split=0.1,verbose=False) # Train model for 5 epochs (hyperparameter optimization)

    val_loss = np.amax(result.history['val_loss']) # Save validation loss to return

    return {'loss': val_loss, 'status': STATUS_OK,'model':model}

In [5]:
def train_data():
  data_train = h5py.File('./data/train.h5.batch1','r')
  X_train = np.array(data_train['data'])
  Y_train = np.array(data_train['label'])
  X_train_shuffled, Y_train_shuffled = shuffle(X_train, Y_train)

  data_test = h5py.File('./data/test.h5.batch1', 'r')
  X_test = np.array(data_test['data'])
  Y_test = np.array(data_test['label'])
  X_test_shuffled, Y_test_shuffled = shuffle(X_test, Y_test)
  return X_train_shuffled, Y_train_shuffled, X_test_shuffled, np.fliplr(Y_test_shuffled)

In [9]:
def data():
  """
  data is a helper function that returns the data necessary for training and validation.

  :return: Returns the shuffled training data and testing data
  """
  
  data_train = h5py.File('./data/Hold out Top 4%/train.h5.batch1', 'r') # Load embedded data from .h5 file
  X_train = np.array(data_train['data'])
  Y_train = np.array(data_train['label'])
  X_train_shuffled, Y_train_shuffled = shuffle(X_train, Y_train) # Shuffle training data

  data_test = h5py.File('./data/Test set Regression/test.h5.batch1', 'r')
  X_test = np.array(data_test['data'])
  Y_test = np.array(data_test['label'])
  return X_train_shuffled, Y_train_shuffled, X_test, Y_test

100%|██████████| 10/10 [03:28<00:00, 20.82s/it, best loss: 0.1543642282485962]


In [None]:
# Hyperparameter optimization according to the hyperas documentation. Evaluates 10 times with the tpe suggested search method.
best_run, best_model = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=10,
                                          trials=Trials(),
                                          notebook_name='seq_32_32',
                                          verbose=False)

In [10]:
X_train, Y_train, X_test, Y_test = data() # Load all data
best_model.fit(X_train, Y_train, batch_size=100, epochs=20,validation_split=0.1,verbose=True) # Train the hyperparameter optimized model

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7faca007f810>

In [12]:
best_model.save('./seq_emb_32x1_16')

INFO:tensorflow:Assets written to: ./seq_emb_32x1_16/assets
