In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import plotly.express as px
from tensorflow.keras import layers
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
# Read the line-delimited JSON files and discard their 'index' column.
df = pd.read_json('./train.json', lines=True).drop(columns='index')
test = pd.read_json('./test.json', lines=True).drop(columns='index')

In [3]:
# Display the test frame (pandas renders a truncated head/tail view).
test

Unnamed: 0,id,sequence,structure,predicted_loop_type,seq_length,seq_scored
0,id_00073f8be,GGAAAAGUACGACUUGAGUACGGAAAACGUACCAACUCGAUUAAAA...,......((((((((((.(((((.....))))))))((((((((......,EEEEEESSSSSSSSSSBSSSSSHHHHHSSSSSSSSSSSSSSSSHHH...,107,68
1,id_000ae4237,GGAAACGGGUUCCGCGGAUUGCUGCUAAUAAGAGUAAUCUCUAAAU...,.....((((..((((((...(((((.....((((....)))).......,EEEEESSSSIISSSSSSIIISSSSSIIIIISSSSHHHHSSSSIIII...,130,91
2,id_00131c573,GGAAAACAAAACGGCCUGGAAGACGAAGGAAUUCGGCGCGAAGGCC...,...........((.(((.(.(..((..((..((((...))))..))...,EEEEEEEEEEESSISSSISISIISSIISSIISSSSHHHSSSSIISS...,107,68
3,id_00181fd34,GGAAAGGAUCUCUAUCGAAGGAUAGAGAUCGCUCGCGACGGCACGA...,......((((((((((....))))))))))((((((..((.(((.....,EEEEEESSSSSSSSSSHHHHSSSSSSSSSSSSSSSSIISSISSSHH...,107,68
4,id_0020473f7,GGAAACCCGCCCGCGCCCGCCCGCGCUGCUGCCGUGCCUCCUCUCC...,.....(((((((((((((((((((((((((((((((((((((((((...,EEEEESSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS...,130,91
...,...,...,...,...,...,...
3629,id_ff691b7e5,GGAAACUAGCCAUGGGCAGGUUGAAGGUUGGGUGACACUAACUGGA...,........((((((((..((((...((((((......))))))......,EEEEEEEESSSSSSSSMMSSSSIIISSSSSSHHHHHHSSSSSSIII...,130,91
3630,id_ff9bf3581,GGAAAUAGCGCCAUAGCCGAUUAUUAUAGGCAAUUUUAGCGAUUUA...,.......(((((...(((..........))).......(((........,EEEEEEESSSSSMMMSSSHHHHHHHHHHSSSMMMMMMMSSSHHHHH...,130,91
3631,id_ffc8f96a8,GGAAAGAUGUUCUGAUGAACAUCGGCUGUUCUAGCUUUCAUCUAUC...,.....(((((((....)))))))(((.(((((((((((((((((((...,EEEEESSSSSSSHHHHSSSSSSSSSSBSSSSSSSSSSSSSSSSSSS...,130,91
3632,id_ffd7e8cc1,GGAAACCGUUAACCUGCAUCUUCAUGUUAUCGCUUGCGACAGCAAC...,...............................((((((............,EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEESSSSSSIIIIIIIII...,130,91


In [4]:
def pandas_list_to_array(df):
    """
    Stack a dataframe whose cells hold equal-length lists into one 3-D array.

    Input: dataframe of shape (x, y), each cell containing a list of length l
    Return: np.array of shape (x, l, y)
    """
    # The nested lists form an (x, y, l) array; swap the last two axes so the
    # list position comes before the column.
    stacked = np.array(df.values.tolist())
    return stacked.transpose(0, 2, 1)

def preprocess_inputs(df, token2int, cols=['sequence', 'structure', 'predicted_loop_type']):
    """
    Integer-encode the string columns of `df` and stack them into one array.

    Input:
        df: dataframe whose `cols` columns hold one string per row
        token2int: mapping from a single character to its integer id
        cols: columns to encode (the default list is only read, never mutated)
    Return: the result of pandas_list_to_array on the encoded columns, i.e. an
        np.array of shape (rows, string length, len(cols)) when every string
        in the selected columns has the same length
    Raises: KeyError if a character is missing from `token2int`
    """
    def encode(seq):
        return [token2int[x] for x in seq]

    selected = df[cols]

    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and now
    # emits a FutureWarning; prefer the new name and fall back on older pandas.
    # The check is made on the class, so a column that happens to be named
    # 'map' cannot be mistaken for the method.
    if hasattr(pd.DataFrame, 'map'):
        encoded = selected.map(encode)
    else:
        encoded = selected.applymap(encode)

    return pandas_list_to_array(encoded)

# preprocessing

In [5]:
# Token vocabulary: each character's integer id is its position in this string
# ('(' -> 0, ')' -> 1, '.' -> 2, 'A' -> 3, ..., 'X' -> 13).
map_dict = {token: code for code, token in enumerate('().ACGUBEHIMSX')}

# Per-position measurement columns (errors first, then the measured values).
measured_cols = [
    'reactivity_error',
    'deg_error_Mg_pH10',
    'deg_error_pH10',
    'deg_error_Mg_50C',
    'deg_error_50C',
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

# Turn each cell's list into a numpy array so whole rows can be subtracted.
for col in measured_cols:
    df[col] = df[col].apply(np.array)

# "Corrected" label = measured value minus its reported error, per position.
for target, error in [
    ('reactivity', 'reactivity_error'),
    ('deg_Mg_pH10', 'deg_error_Mg_pH10'),
    ('deg_pH10', 'deg_error_pH10'),
    ('deg_50C', 'deg_error_50C'),
    ('deg_Mg_50C', 'deg_error_Mg_50C'),
]:
    df[f'corrected_{target}'] = df[target] - df[error]

# visualizations

In [6]:
# Label columns, in the order used for the last axis of the label array and
# for the columns of the prediction frame.
pred_cols = [
    f'corrected_{target}'
    for target in ('reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_50C', 'deg_Mg_50C')
]

# index = 1

# for col in pred_cols:
#     plt.plot(df[col].iloc[index])
#     plt.title(f'{col} Mean: {np.mean(df[col].iloc[index])}')
#     plt.show()
    
# plt.figure(figsize=(20,10))
# for col in pred_cols:
#     plt.plot(df[col].iloc[index],label=col)
# plt.legend()

# modeling

In [8]:
# Keep only the rows whose signal_to_noise is at least 1.
df = df.query("signal_to_noise >= 1")

# Integer-encoded inputs and stacked label arrays for the remaining rows.
train_inputs = preprocess_inputs(df, map_dict)
train_labels = pandas_list_to_array(df[pred_cols])

# 75/25 train/validation split with a fixed seed, stratified on SN_filter.
x_train, x_val, y_train, y_val = train_test_split(
    train_inputs,
    train_labels,
    test_size=0.25,
    stratify=df['SN_filter'],
    random_state=0,
)

In [9]:
def MCRMSE(y_true, y_pred):
    """
    Mean column-wise root mean squared error.

    The squared error is averaged over axis 1, the square root is taken, and
    the result is averaged over its remaining non-batch axis, giving one loss
    value per sample. Assumes both tensors are (batch, positions, targets),
    which is what build_model outputs.
    """
    residual = y_true - y_pred
    per_column_mse = tf.reduce_mean(tf.square(residual), axis=1)
    per_column_rmse = tf.sqrt(per_column_mse)
    return tf.reduce_mean(per_column_rmse, axis=1)

def gru_layer(hidden_dim, dropout):
    """
    Build a bidirectional GRU layer that returns the full output sequence.

    hidden_dim: number of units in each direction
    dropout: dropout rate passed to the GRU
    """
    recurrent = layers.GRU(
        units=hidden_dim,
        activation='relu',
        kernel_initializer='orthogonal',
        dropout=dropout,
        return_sequences=True,
    )
    return layers.Bidirectional(recurrent)


def build_model(seq_len, pred_len, embed_dim=200, hidden_dim=272, n_layers=3,
                dropout=0.25, sp_dropout=0.2):
    """
    Build and compile the embedding + stacked bidirectional-GRU model.

    Input:
        seq_len: number of positions in each input sequence
        pred_len: number of leading positions to predict (<= seq_len)
        embed_dim: embedding size for each of the 3 integer features
        hidden_dim: GRU units per direction
        n_layers: number of stacked bidirectional GRU layers
        dropout: dropout rate inside each GRU
        sp_dropout: spatial dropout rate applied to the embedded inputs
    Return: compiled tf.keras.Model mapping (batch, seq_len, 3) integer inputs
        to (batch, pred_len, 5) predictions, trained with the MCRMSE loss

    The keyword defaults are the values that used to be hard-coded here, so
    build_model(seq_len, pred_len) builds the same architecture as before.
    The model summary is printed as a side effect.
    """
    embed_size = len(map_dict)
    n_features = 3  # sequence, structure and predicted loop type per position

    inputs = layers.Input(shape=(seq_len, n_features))

    # (batch, seq_len, 3) -> (batch, seq_len, 3, embed_dim)
    embed = layers.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)

    # Concatenate the three feature embeddings of each position. A Keras
    # Reshape layer is used rather than a raw tf.reshape call because raw
    # TensorFlow ops cannot be applied to symbolic Keras tensors on newer
    # Keras versions.
    reshaped = layers.Reshape((seq_len, n_features * embed_dim))(embed)

    hidden = layers.SpatialDropout1D(sp_dropout)(reshaped)

    for _ in range(n_layers):
        hidden = gru_layer(hidden_dim, dropout)(hidden)

    # Since we are only making predictions on the first part of each sequence,
    # we have to truncate it
    truncated = hidden[:, :pred_len]

    # Five outputs per position, one per label column.
    # NOTE(review): relu clamps predictions to >= 0, while the corrected
    # labels (value - error) can be negative -- consider a linear output.
    out = layers.Dense(5, activation='relu')(truncated)

    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE)

    model.summary()

    return model

In [None]:
%%time

# Size the network from the training arrays themselves: inputs are
# (samples, seq_len, features) and labels are (samples, pred_len, targets).
model = build_model(seq_len=x_train.shape[1], pred_len=y_train.shape[1])

history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=100,
    epochs=50,
    verbose=2,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        # Only overwrite the checkpoint when val_loss improves. Without
        # save_best_only the file is rewritten every epoch, so the weights
        # loaded for inference would be those of the last epoch rather than
        # the best one.
        tf.keras.callbacks.ModelCheckpoint(
            'model.h5', monitor='val_loss', save_best_only=True
        )
    ]
)

Model: "functional_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 107, 3)]          0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 107, 3, 200)       2800      
_________________________________________________________________
tf_op_layer_Reshape_5 (Tenso [(None, 107, 600)]        0         
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 107, 600)          0         
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 107, 544)          1426368   
_________________________________________________________________
bidirectional_16 (Bidirectio (None, 107, 544)          1334976   
_________________________________________________________________
bidirectional_17 (Bidirectio (None, 107, 544)        

In [None]:
# Training and validation loss per epoch.
axis_labels = {'index': 'epoch', 'value': 'MCRMSE'}
fig = px.line(
    history.history,
    y=['loss', 'val_loss'],
    labels=axis_labels,
    title='Training History',
)
fig.show()

In [None]:
# Caveat: the submission needs a prediction for every position of each test
# sequence, whereas training only predicted a prefix of each sequence. So one
# network is built per test sequence length, with pred_len equal to seq_len.
model_public = build_model(seq_len=107, pred_len=107)
model_private = build_model(seq_len=130, pred_len=130)

# Both networks load the checkpoint written during training.
for inference_model in (model_public, model_private):
    inference_model.load_weights('model.h5')

# Sequences of length 107.
public_df = test.query("seq_length == 107")
public_inputs = preprocess_inputs(public_df, map_dict)
public_preds = model_public.predict(public_inputs)

# Sequences of length 130.
private_df = test.query("seq_length == 130")
private_inputs = preprocess_inputs(private_df, map_dict)
private_preds = model_private.predict(private_inputs)

In [None]:
preds_ls = []

# The loop variable is deliberately not named `df`: that name is bound to the
# training frame, and rebinding it here would silently break any earlier cell
# that is re-run afterwards.
for test_df, preds in [(public_df, public_preds), (private_df, private_preds)]:
    # preds holds one (positions, targets) array per row of test_df.
    for uid, single_pred in zip(test_df.id, preds):
        single_df = pd.DataFrame(single_pred, columns=pred_cols)
        # One row per sequence position, keyed as '<sequence id>_<position>'.
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_ls.append(single_df)

preds_df = pd.concat(preds_ls)
preds_df.head()

In [None]:
# Strip the 'corrected_' prefix from the column names.
preds_df.columns = preds_df.columns.str.replace('corrected_', '', regex=False)

In [None]:
# Write the submission file without the dataframe index.
preds_df.to_csv(path_or_buf='./submission_two.csv', index=False)

In [None]:
# Show (rows, columns) of the submission frame.
preds_df.shape