In [1]:
from sandbox import *

In [2]:
# Presumably the seed meant to make runs reproducible.
# NOTE(review): no call in the visible notebook cells passes `random_state` on — confirm it is actually consumed.
random_state = 42

# Data loading

In this analysis we use only the first 100k of the approximately 550k samples provided in the human_dna train and validation splits (more than that seems to crash the kernel of our hand-in Jupyter notebook).

In a first step we combine the train and validation splits, since we do a 5-fold cross-validation of our models. Then we one-hot encode the sequence data, resulting in a training dataset with input shape (sample number, sequence length, nucleotides).

In [3]:
# Read the train and validation splits with the same column layout and stack them
# into one frame; header=0 replaces each file's own header row with the names given here.
df_train = pd.concat([
    pd.read_csv(split_path, header=0, names=['seq', 'label'])
    for split_path in ('exercise_data/human_dna_train_split.csv',
                       'exercise_data/human_dna_validation_split.csv')
])
# Inputs (sequences) and targets (labels) as separate Series.
x_train, y_train = df_train['seq'], df_train['label']

In [4]:
# TRAINING DATASET

# Keep only the first 100000 entries; inputs and labels are cut at the same
# position so they stay aligned. The result is then passed to prep_data_cnn.
x_train, y_train = x_train[:100000], y_train[:100000]
x_prep, y_prep = prep_data_cnn(x_train, y_train)

# Shape checks (disabled):
# print(x_prep.shape)
# print(y_prep.shape)
# print((np.asarray(x_prep)).shape)

In [5]:
# TESTING DATASETS
# Both files are read with header=0, so their own header row is replaced by the
# names given here. The hidden split is read with a single `seq` column (no labels).
df_test = pd.read_csv('exercise_data/human_dna_test_split.csv', header=0, names =['seq', 'label'])
df_test_hidden = pd.read_csv('exercise_data/human_dna_test_hidden_split.csv', header=0, names =['seq'])
# Take an independent copy of the labels. The recorded output of the prep cell shows
# a SettingWithCopyWarning raised from `y[y==-1] = 0`, i.e. the labels are modified in
# place; working on a copy keeps df_test untouched and avoids the warning.
y_test = df_test.label.copy()
x_test = df_test.seq
x_testh = df_test_hidden.seq

In [6]:
# Run the test inputs and labels through the same prep_data_cnn helper used for the training data.
# NOTE: the recorded output of this cell is a pandas SettingWithCopyWarning pointing at
# `y[y==-1] = 0`, i.e. the label Series passed in here is modified in place.
x_prep_test, y_prep_test = prep_data_cnn(x_test, y_test)

# print(x_prep_test.shape)
# print(y_prep_test.shape)
# print((np.asarray(x_prep_test)).shape)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y==-1] = 0


In [7]:
# The hidden test split has no labels, so only the sequences are passed and
# use_y=False is set; a single value is returned instead of an (x, y) pair.
x_prep_testh = prep_data_cnn(x_testh, use_y=False)

# print(x_prep_testh.shape)

# Analysis / Methods

Using the one-hot encoded data we here investigate three neural network architectures on their capability to predict splice sites (binary problem described in the project_4 description).

A bit of detail on the 3 architectures we provide:

## 'cnn'

This model uses 2 convolutional blocks followed by 3 feed-forward layer blocks. Each convolutional block consists of a convolutional layer with 32 filters and kernel size 5, followed by max pooling. The one-hot encoded input is passed through the convolutional blocks, flattened, and then processed by the feed-forward layer blocks, whose linear layers decrease in number of nodes: 256, 128 and then 85. Between the linear layers we apply the ReLU activation function and use dropout of 0.5.


## 'cnn2'

This model uses 2 convolutional blocks with a decreasing number of filters: the first block uses 32 filters and the second block uses 16 filters, both with a kernel size of 3. We then apply max pooling and flatten the result. This is followed by a single dense layer with 128 nodes and ReLU activation, and then an output layer with 2 nodes and softmax activation.


## 'rnn'

This is a simple recurrent neural network architecture that uses LSTM cells (32 units) followed by a feed-forward layer block with 3 dense layers, ReLU activation function and dropout as described in 'cnn'. The hidden state of the LSTM layer is used as input to the feed-forward layer block.
Finally we use 2 output nodes with softmax activation to predict.



In [8]:
# Settings is a list of lists containing our evaluation settings.
# We optimize modeltype, batch_size and learning_rate.
def save(cv, run='elegans'):
    """Pickle ``cv`` into the file ``<run>_data.p``.

    Args:
        cv: Any picklable object (here: the cross-validation results).
        run: Prefix of the output file name; ``'_data.p'`` is appended to it.
    """
    # Prefer the C-accelerated module on Python 2, fall back to the
    # standard module on Python 3 where cPickle does not exist.
    try:
        import cPickle as serializer
    except ImportError:  # python 3.x
        import pickle as serializer
    target = run + '_data.p'
    with open(target, 'wb') as handle:
        serializer.dump(cv, handle, protocol=serializer.HIGHEST_PROTOCOL)
        
def load(run='elegans'):
    """Read back the object pickled in ``<run>_data.p``.

    Args:
        run: Prefix of the file name; ``'_data.p'`` is appended to it.

    Returns:
        The unpickled object stored in that file.
    """
    # Prefer the C-accelerated module on Python 2, fall back to the
    # standard module on Python 3 where cPickle does not exist.
    try:
        import cPickle as serializer
    except ImportError:  # python 3.x
        import pickle as serializer
    source = run + '_data.p'
    # Unpickling executes code embedded in the file: only load files you created yourself.
    with open(source, 'rb') as handle:
        return serializer.load(handle)

def get_settings():
    """Build the grid of evaluation settings.

    Returns:
        A list of ``[model type, learning rate, batch size]`` lists, one per
        combination, ordered by model type, then learning rate, then batch size.
    """
    model_types = ['cnn', 'cnn2', 'rnn']
    learning_rates = [0.01, 0.001]
    batch_sizes = [64]
    return [
        [model, rate, batch]
        for model in model_types
        for rate in learning_rates
        for batch in batch_sizes
    ]

In [9]:
# Build the list of learning settings. Note that this list is considerably shorter
# than the one used for our evaluation on the elegans set: with leonhard down we
# are only able to optimize on our local machine.

settings = get_settings()
# print(settings)

In [10]:
# Full run: cross-validate every entry of `settings` with k=5 folds and 3 epochs per fit,
# also passing the labelled test set and (test_hidden=True) the hidden test set.
# Only 3 epochs are used here because there is more data and no cluster is available.
# The kernel often dies when the full dataset is used; in that case run the next
# cell for individual runs starting at idx = 0.
cv = cross_validation(x_prep, y_prep, settings, 
                      x_test=x_prep_test, y_test=y_prep_test, 
                      x_testh=x_prep_testh, 
                      k=5, epochs=3, test_hidden=True)

(100000,)
weights for classes: {0: 0.5015699138302888, 1: 159.7444089456869}
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [1]:
# Persist the cross-validation results; with run='human' the save() helper defined
# earlier in this notebook writes them to 'human_data.p'.
# NOTE(review): the recorded output of this cell is the message of IPython's %save magic,
# which suggests save() (and cv) were not defined in the kernel when it ran (execution
# count restarted at 1) — re-run the definition and cross-validation cells first.
save(cv, run='human')

'run=human)' was not found in history, as a file, url, nor in the user namespace.


In [2]:
# Select the setting with the lowest mean loss across its cross-validation folds.
#
# `cv` is a flat list with one entry per (setting, fold) run; consecutive groups of
# `n_folds` entries belong to the same setting. Fields read from each entry:
#   [1] the setting ([model type, learning rate, batch size])
#   [2] the validation loss of that run
#   [3] predictions on the test set
#   [5] predictions on the hidden test set
n_folds = 5  # must match the k passed to cross_validation

best = float('inf')  # lowest mean validation loss seen so far
best_s = None
best_pred_test = None
best_pred_testh = None

# Only complete groups of n_folds runs are considered; a trailing partial group is ignored.
for start in range(0, len(cv) - n_folds + 1, n_folds):
    folds = cv[start:start + n_folds]
    mean_loss = sum(fold[2] for fold in folds) / n_folds
    # Compare the mean over all folds (previously only the last fold's loss was
    # compared against the stored mean, so the selection was inconsistent).
    if mean_loss < best:
        print("new best:", mean_loss)
        best = mean_loss
        best_s = folds[-1][1]
        # The predictions kept are those of the last fold of the winning setting.
        best_pred_test = folds[-1][3]
        best_pred_testh = folds[-1][5]
print("Best setting was {} ([model type, learning rate, batch size])".format(best_s))

NameError: name 'cv' is not defined

Using the best model (average of cross validation)!

# Discussion

In [None]:
# cv = load(run='human')

In [27]:
def get_score(cv, run=0):
    """Return the score field of one cross-validation run.

    Args:
        cv: Sequence of run entries, each indexable at position 4.
        run: Index of the run to look up (defaults to the first one).

    Returns:
        Element 4 of ``cv[run]``; in the results printed in this notebook that
        element is a list of four numbers (presumably the test metrics — confirm
        against cross_validation).
    """
    entry = cv[run]
    return entry[4]

[[0, ['cnn', 0.01, 64], 0.019022755324840546, array([0, 0, 0, ..., 0, 0, 0]), [0.996879968799688, 0.0, 0.0, 0.0], array([0, 0, 0, ..., 0, 0, 0])], [1, ['cnn', 0.01, 64], 0.022709164768457413, array([0, 0, 0, ..., 0, 0, 0]), [0.996879968799688, 0.0, 0.0, 0.0], array([0, 0, 0, ..., 0, 0, 0])], [2, ['cnn', 0.01, 64], 0.0211052093654871, array([0, 0, 0, ..., 0, 0, 0]), [0.996879968799688, 0.0, 0.0, 0.0], array([0, 0, 0, ..., 0, 0, 0])], [3, ['cnn', 0.01, 64], 0.022699421271681786, array([0, 0, 0, ..., 0, 0, 0]), [0.996879968799688, 0.0, 0.0, 0.0], array([0, 0, 0, ..., 0, 0, 0])], [4, ['cnn', 0.01, 64], 0.021435614675283432, array([0, 0, 0, ..., 0, 0, 0]), [0.996879968799688, 0.0, 0.0, 0.0], array([0, 0, 0, ..., 0, 0, 0])], [5, ['cnn', 0.001, 64], 0.020988604053854942, array([0, 0, 0, ..., 0, 0, 0]), [0.996879968799688, 0.0, 0.0, 0.0], array([0, 0, 0, ..., 0, 0, 0])], [6, ['cnn', 0.001, 64], 0.028085269033908844, array([0, 0, 0, ..., 0, 0, 0]), [0.996879968799688, 0.0, 0.0, 0.0], array([0, 

In [11]:
# Dummy check: print the bincount of each run's test predictions (cv[i][3]) to see
# whether only the majority class is predicted.
# NOTE: the loop is disabled by being wrapped in a string literal, so running this
# cell only echoes that string; remove the triple quotes to execute it.

"""
for i in range(len(cv)):
    sample = cv[i][3]
    print(sample)
    print(np.bincount(sample))
"""

'\nfor i in range(len(cv)):\n    sample = cv[i][3]\n    print(sample)\n    print(np.bincount(sample))\n'

We found that our model 'cnn2' performed best out of the 3 models we investigated. However, due to a lack of computational resources we could not investigate bigger models (which may make a big difference, for example for the hidden states of RNNs!).

In [None]:
# Save the best model's predictions, one file per test set.
# Previously both arrays were written to 'humans_test_nn.npy', so the hidden-test
# predictions silently overwrote the test predictions.
# NOTE(review): anything that expected the hidden-test predictions in
# 'humans_test_nn.npy' must now read 'humans_test_hidden_nn.npy'.
np.save('humans_test_nn.npy', best_pred_test)
np.save('humans_test_hidden_nn.npy', best_pred_testh)