In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import plotly.express as px
from tensorflow.keras import layers
import tensorflow as tf
from sklearn.model_selection import train_test_split


from sklearn.manifold import TSNE

In [27]:
df = pd.read_json('./train.json', lines=True).drop('index', axis=1)
test = pd.read_json('./test.json', lines=True).drop('index', axis=1)

In [28]:
def pandas_list_to_array(df):
    """
    Input: dataframe of shape (x, y), containing list of length l
    Return: np.array of shape (x, l, y)
    """
    
    return np.transpose(
        np.array(df.values.tolist()),
        (0, 2, 1)
    )

def preprocess_inputs(df, token2int, cols=['sequence', 'structure', 'predicted_loop_type']):
    return pandas_list_to_array(
        df[cols].applymap(lambda seq: [token2int[x] for x in seq])
    )

# preprocessing

In [29]:
# codon mapping
def unique_codons(series):
    result = series.apply(lambda x: [x[i:i+3] for i in range(0,len(x),3)])
    return np.unique(np.array(result.to_list()).flatten())

def create_codons(df, mapper, cols=['sequence', 'structure', 'predicted_loop_type']):
    for col in cols:
        df[col] = df[col].apply(lambda x: [mapper[x[i:i+3]] for i in range(0,len(x),3)])
        

In [33]:
u_seqs = unique_codons(df.sequence)
u_structs = unique_codons(df.structure)
u_loops = unique_codons(df.predicted_loop_type)

tokens = np.concatenate([u_seqs, u_structs, u_loops])

token_dict = dict(zip(tokens,range(0,len(tokens),1)))

create_codons(df, token_dict)

In [35]:
df.head()

Unnamed: 0,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,deg_error_Mg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
0,id_001f94081,"[41, 0, 40, 29, 13, 5, 41, 34, 51, 34, 50, 45,...","[85, 83, 65, 66, 85, 85, 72, 75, 74, 68, 85, 6...","[93, 94, 128, 125, 97, 97, 128, 116, 130, 119,...",6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...","[0.26130000000000003, 0.38420000000000004, 0.1...","[0.2631, 0.28600000000000003, 0.0964, 0.1574, ...","[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000..."
1,id_0049f53ba,"[41, 0, 10, 39, 26, 44, 51, 26, 39, 32, 63, 26...","[85, 83, 65, 65, 65, 65, 65, 65, 65, 68, 84, 7...","[93, 94, 128, 128, 128, 128, 128, 128, 128, 11...",0.193,0,107,68,"[2.8272, 2.8272, 2.8272, 4.7343, 2.5676, 2.567...","[73705.3985, 73705.3985, 73705.3985, 73705.398...","[10.1986, 9.2418, 5.0933, 5.0933, 5.0933, 5.09...","[16.6174, 13.868, 8.1968, 8.1968, 8.1968, 8.19...","[15.4857, 7.9596, 13.3957, 5.8777, 5.8777, 5.8...","[0.0, 0.0, 0.0, 2.2965, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.947, 4.4523, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[4.8511, 4.0426, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[7.6692, 0.0, 10.9561, 0.0, 0.0, 0.0, 0.0, 0.0..."
2,id_006f36f57,"[41, 2, 58, 53, 36, 2, 29, 10, 55, 3, 10, 3, 2...","[85, 83, 65, 77, 85, 83, 65, 77, 68, 85, 72, 8...","[93, 94, 128, 108, 102, 103, 128, 114, 118, 97...",8.8,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...","[0.1365, 0.2237, 0.1812, 0.1333, 0.1148, 0.160...","[0.17020000000000002, 0.178, 0.111, 0.091, 0.0...","[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499..."
3,id_0082d463b,"[41, 0, 39, 26, 39, 26, 39, 25, 0, 10, 39, 26,...","[85, 85, 65, 65, 65, 65, 65, 68, 85, 80, 72, 7...","[93, 93, 128, 128, 128, 128, 128, 118, 97, 101...",0.104,0,107,68,"[3.5229, 6.0748, 3.0374, 3.0374, 3.0374, 3.037...","[73705.3985, 73705.3985, 73705.3985, 73705.398...","[11.8007, 12.7566, 5.7733, 5.7733, 5.7733, 5.7...","[121286.7181, 121286.7182, 121286.7181, 121286...","[15.3995, 8.1124, 7.7824, 7.7824, 7.7824, 7.78...","[0.0, 2.2399, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.0, -0.5083, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[3.4248, 6.8128, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, -0.8365, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[7.6692, -1.3223, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
4,id_0087940f4,"[41, 0, 52, 13, 13, 52, 52, 13, 3, 13, 61, 51,...","[85, 83, 65, 65, 77, 65, 65, 65, 67, 65, 65, 6...","[93, 94, 128, 128, 91, 128, 128, 128, 116, 128...",0.423,0,107,68,"[1.665, 2.1728, 2.0041, 1.2405, 0.620200000000...","[4.2139, 3.9637000000000002, 3.2467, 2.4716, 1...","[3.0942, 3.015, 2.1212, 2.0552, 0.881500000000...","[2.6717, 2.4818, 1.9919, 2.5484999999999998, 1...","[1.3285, 3.6173, 1.3057, 1.3021, 1.1507, 1.150...","[0.8267, 2.6577, 2.8481, 0.40090000000000003, ...","[2.1058, 3.138, 2.5437000000000003, 1.0932, 0....","[4.7366, 4.6243, 1.2068, 1.1538, 0.0, 0.0, 0.7...","[2.2052, 1.7947000000000002, 0.7457, 3.1233, 0...","[0.0, 5.1198, -0.3551, -0.3518, 0.0, 0.0, 0.0,..."


In [36]:
measured_cols = ['reactivity_error', 'deg_error_Mg_pH10', 'deg_error_pH10', 'deg_error_Mg_50C',
            'deg_error_50C', 'reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
for col in measured_cols:
    df[col] = df[col].apply(lambda x: np.array(x))


df['corrected_reactivity'] = df.reactivity - df.reactivity_error
df['corrected_deg_Mg_pH10'] = df.deg_Mg_pH10 - df.deg_error_Mg_pH10
df['corrected_deg_pH10'] = df.deg_pH10 - df.deg_error_pH10
df['corrected_deg_50C'] = df.deg_50C - df.deg_error_50C
df['corrected_deg_Mg_50C'] = df.deg_Mg_50C - df.deg_error_Mg_50C

# visualizations

In [37]:
pred_cols = ['corrected_reactivity','corrected_deg_Mg_pH10','corrected_deg_pH10',
             'corrected_deg_50C','corrected_deg_Mg_50C']

# index = 1

# for col in pred_cols:
#     plt.plot(df[col].iloc[index])
#     plt.title(f'{col} Mean: {np.mean(df[col].iloc[index])}')
#     plt.show()
    
# plt.figure(figsize=(20,10))
# for col in pred_cols:
#     plt.plot(df[col].iloc[index],label=col)
# plt.legend()

# modeling

In [39]:
df = df.query("signal_to_noise >= 1")

train_inputs = pandas_list_to_array(df[['sequence', 'structure', 'predicted_loop_type']])
train_labels = pandas_list_to_array(df[pred_cols])

x_train, x_val, y_train, y_val = train_test_split(train_inputs, train_labels, 
                                                  test_size=.25, random_state=0, stratify=df.SN_filter)

In [46]:
def MCRMSE(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

def gru_layer(hidden_dim, dropout):
    return layers.Bidirectional(layers.GRU(
        hidden_dim, activation='relu', dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))


def build_model(seq_len, pred_len, embed_size):

    # seq_len=107
    # pred_len=68
    dropout=0.25
    sp_dropout=0.2
    embed_dim=200
    hidden_dim=272
    n_layers=3

    inputs = layers.Input(shape=(seq_len, 3))

    embed = layers.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)

    reshaped = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3])
    )

    hidden = layers.SpatialDropout1D(sp_dropout)(reshaped)

    for x in range(n_layers):
        hidden = gru_layer(hidden_dim, dropout)(hidden)

    # Since we are only making predictions on the first part of each sequence, 
    # we have to truncate it
#     truncated = hidden[:, :pred_len]
    out = layers.Dense(5, activation='relu')(hidden)

    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE)

#     model.summary()
    
    return model

In [47]:
train_inputs.shape

(2097, 36, 3)

In [49]:
train_labels.shape

(2097, 68, 5)

In [51]:
%%time

model = build_model(36, 68, len(token_dict))

history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=100,
    epochs=50,
    verbose=2,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5')
    ]
)

Train on 1572 samples, validate on 525 samples
Epoch 1/50


ValueError: Dimensions must be equal, but are 68 and 36 for 'loss/dense_3_loss/sub' (op: 'Sub') with input shapes: [?,68,5], [?,36,5].

In [None]:
fig = px.line(
    history.history, y=['loss', 'val_loss'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='Training History')
fig.show()

In [None]:
# Caveat: The prediction format requires the output to be the same length as the input,
# although it's not the case for the training data.
model_public = build_model(seq_len=107, pred_len=107)
model_private = build_model(seq_len=130, pred_len=130)

model_public.load_weights('model.h5')
model_private.load_weights('model.h5')

public_df = test.query("seq_length == 107")
private_df = test.query("seq_length == 130")

public_inputs = preprocess_inputs(public_df, map_dict)
private_inputs = preprocess_inputs(private_df, map_dict)

public_preds = model_public.predict(public_inputs)
private_preds = model_private.predict(private_inputs)

In [None]:
preds_ls = []

for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_ls.append(single_df)

preds_df = pd.concat(preds_ls)
preds_df.head()

In [None]:
preds_df.columns = [i.replace('corrected_','') for i in preds_df.columns]

In [None]:
preds_df.to_csv('./submission_two.csv', index=False)

In [None]:
preds_df.shape