In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import plotly.express as px
# from tensorflow.keras import layers
# import tensorflow as tf
from sklearn.model_selection import train_test_split

from sklearn.manifold import TSNE

In [50]:
df = pd.read_json('./train.json', lines=True).drop('index', axis=1)
test = pd.read_json('./test.json', lines=True).drop('index', axis=1)

In [51]:
def pandas_list_to_array(df):
    """
    Input: dataframe of shape (x, y), containing list of length l
    Return: np.array of shape (x, l, y)
    """
    
    return np.transpose(
        np.array(df.values.tolist()),
        (0, 2, 1)
    )

def preprocess_inputs(df, token2int, cols=['sequence', 'structure', 'predicted_loop_type']):
    return pandas_list_to_array(
        df[cols].applymap(lambda seq: [token2int[x] for x in seq])
    )

# preprocessing

In [52]:
# codon mapping
def unique_codons(series):
    result = series.apply(lambda x: [x[i:i+3] for i in range(0,len(x),3)])
    return np.unique(np.array(result.to_list()).flatten())

def create_codons(df, mapper, cols=['sequence', 'structure', 'predicted_loop_type']):
    for col in cols:
        df[col] = df[col].apply(lambda x: [mapper[x[i:i+3]] for i in range(0,len(x),3)])
        

In [53]:
map_dict = {'(': 0, ')': 1, '.': 2, 'A': 3, 'C': 4, 'G': 5, 'U': 6, 'B': 7,
              'E': 8, 'H': 9, 'I': 10, 'M': 11, 'S': 12, 'X': 13}

input_cols = ['sequence', 'structure', 'predicted_loop_type']

for col in input_cols:
    df[col] = df[col].apply(lambda x: [map_dict[i] for i in x])

u_seqs = unique_codons(df.sequence)
u_structs = unique_codons(df.structure)
u_loops = unique_codons(df.predicted_loop_type)

tokens = np.concatenate([u_seqs, u_structs, u_loops])

tokens = [int(''.join([str(i) for i in j])) for j in tokens]

token_dict = dict(zip(tokens,range(0,len(tokens),1)))

# create_codons(df, token_dict)

  after removing the cwd from sys.path.


In [61]:
token_dict

{333: 0,
 334: 1,
 335: 2,
 336: 3,
 34: 4,
 343: 5,
 344: 6,
 345: 7,
 346: 8,
 353: 9,
 354: 10,
 355: 11,
 356: 12,
 363: 13,
 364: 14,
 365: 15,
 366: 16,
 433: 17,
 434: 18,
 435: 19,
 436: 20,
 443: 21,
 444: 22,
 445: 23,
 446: 24,
 453: 25,
 454: 26,
 455: 27,
 456: 28,
 463: 29,
 464: 30,
 465: 31,
 466: 32,
 533: 33,
 534: 34,
 535: 35,
 536: 36,
 543: 37,
 544: 38,
 545: 39,
 546: 40,
 553: 41,
 554: 42,
 555: 43,
 556: 44,
 563: 45,
 564: 46,
 565: 47,
 566: 48,
 633: 49,
 634: 50,
 635: 51,
 636: 52,
 643: 53,
 644: 54,
 645: 55,
 646: 56,
 653: 57,
 654: 58,
 655: 59,
 656: 60,
 663: 61,
 664: 62,
 665: 63,
 666: 64,
 0: 65,
 2: 66,
 20: 67,
 22: 82,
 100: 69,
 102: 70,
 110: 71,
 111: 72,
 112: 73,
 120: 74,
 121: 75,
 122: 76,
 200: 77,
 202: 78,
 210: 79,
 211: 80,
 212: 81,
 220: 83,
 221: 84,
 222: 85,
 777: 86,
 7712: 87,
 7127: 88,
 7129: 89,
 71210: 90,
 71212: 91,
 88: 92,
 888: 93,
 8812: 94,
 81210: 95,
 81212: 96,
 999: 97,
 9912: 98,
 9127: 99,
 91210: 100,
 

In [63]:
test = pandas_list_to_array(df[['sequence', 'structure', 'predicted_loop_type']])

In [65]:
for i in test:
    break

In [74]:
test.shape

(2400, 107, 3)

In [60]:
measured_cols = ['reactivity_error', 'deg_error_Mg_pH10', 'deg_error_pH10', 'deg_error_Mg_50C',
            'deg_error_50C', 'reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
for col in measured_cols:
    df[col] = df[col].apply(lambda x: np.array(x))


df['corrected_reactivity'] = df.reactivity - df.reactivity_error
df['corrected_deg_Mg_pH10'] = df.deg_Mg_pH10 - df.deg_error_Mg_pH10
df['corrected_deg_pH10'] = df.deg_pH10 - df.deg_error_pH10
df['corrected_deg_50C'] = df.deg_50C - df.deg_error_50C
df['corrected_deg_Mg_50C'] = df.deg_Mg_50C - df.deg_error_Mg_50C

# visualizations

In [37]:
pred_cols = ['corrected_reactivity','corrected_deg_Mg_pH10','corrected_deg_pH10',
             'corrected_deg_50C','corrected_deg_Mg_50C']

# index = 1

# for col in pred_cols:
#     plt.plot(df[col].iloc[index])
#     plt.title(f'{col} Mean: {np.mean(df[col].iloc[index])}')
#     plt.show()
    
# plt.figure(figsize=(20,10))
# for col in pred_cols:
#     plt.plot(df[col].iloc[index],label=col)
# plt.legend()

# modeling

In [39]:
df = df.query("signal_to_noise >= 1")

train_inputs = pandas_list_to_array(df[['sequence', 'structure', 'predicted_loop_type']])
train_labels = pandas_list_to_array(df[pred_cols])

x_train, x_val, y_train, y_val = train_test_split(train_inputs, train_labels, 
                                                  test_size=.25, random_state=0, stratify=df.SN_filter)

In [52]:
def MCRMSE(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

def gru_layer(hidden_dim, dropout):
    return layers.Bidirectional(layers.GRU(
        hidden_dim, activation='relu', dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))


def build_model(seq_len, pred_len, embed_size):

    # seq_len=107
    # pred_len=68
    dropout=0.25
    sp_dropout=0.2
    embed_dim=200
    hidden_dim=272
    n_layers=3

    inputs = layers.Input(shape=(seq_len, 3))

    embed = layers.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)

    reshaped = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3])
    )

    hidden = layers.SpatialDropout1D(sp_dropout)(reshaped)

    for x in range(n_layers):
        hidden = gru_layer(hidden_dim, dropout)(hidden)

    # Since we are only making predictions on the first part of each sequence, 
    # we have to truncate it
#     truncated = hidden[:, :pred_len]
    out = layers.Dense(5, activation='relu')(hidden)

    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE)

#     model.summary()
    
    return model

In [53]:
train_inputs.shape

(2097, 36, 3)

In [56]:
train_labels.shape

(2097, 68, 5)

In [55]:
%%time

model = build_model(107, 68, len(token_dict))

history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=100,
    epochs=50,
    verbose=2,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5')
    ]
)

Train on 1572 samples, validate on 525 samples
Epoch 1/50


ValueError: Dimensions must be equal, but are 68 and 36 for 'loss/dense_4_loss/sub' (op: 'Sub') with input shapes: [?,68,5], [?,36,5].

In [None]:
fig = px.line(
    history.history, y=['loss', 'val_loss'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='Training History')
fig.show()

In [None]:
# Caveat: The prediction format requires the output to be the same length as the input,
# although it's not the case for the training data.
model_public = build_model(seq_len=107, pred_len=107)
model_private = build_model(seq_len=130, pred_len=130)

model_public.load_weights('model.h5')
model_private.load_weights('model.h5')

public_df = test.query("seq_length == 107")
private_df = test.query("seq_length == 130")

public_inputs = preprocess_inputs(public_df, map_dict)
private_inputs = preprocess_inputs(private_df, map_dict)

public_preds = model_public.predict(public_inputs)
private_preds = model_private.predict(private_inputs)

In [None]:
preds_ls = []

for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_ls.append(single_df)

preds_df = pd.concat(preds_ls)
preds_df.head()

In [None]:
preds_df.columns = [i.replace('corrected_','') for i in preds_df.columns]

In [None]:
preds_df.to_csv('./submission_two.csv', index=False)

In [None]:
preds_df.shape