In [11]:
# Dataframe
import json
import pandas as pd
import numpy as np
# Visualization
import plotly.express as px
# Deeplearning
import tensorflow.keras.layers as L
import tensorflow as tf
# Sklearn
from sklearn.model_selection import train_test_split
#Setting seeds
tf.random.set_seed(2023)
np.random.seed(2023)


In [12]:
target = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C', 'deg_pH10', 'deg_50C']
Model_Train = True # True if you want to Train model which take 1 hour to train.


In [14]:
# Error metric is MCRMSE
def MCRMSE(y_true, y_pred):
  colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
  return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

In [17]:
data_dir = "stanford-covid-vaccine/"
train = pd.read_json(data_dir + "train.json", lines=True)
test = pd.read_json(data_dir + "test.json", lines=True)
sample_df = pd.read_csv(data_dir + "sample_submission.csv")

In [18]:
train.head(2)
print('Train shapes: ', train.shape)
print('Test shapes: ', test.shape)

Train shapes:  (2400, 19)
Test shapes:  (3634, 7)


In [19]:
fig = px.histogram(
    train,
    "signal_to_noise",
    nbins=25,
    title='signal_to_noise distribution',
    width=800,
    height=400
)
fig.show()

In [20]:
train = train.query("signal_to_noise >= 1")

In [21]:
train.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,deg_error_Mg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
0,0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...","[0.26130000000000003, 0.38420000000000004, 0.1...","[0.2631, 0.28600000000000003, 0.0964, 0.1574, ...","[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000..."
2,2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,8.8,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...","[0.1365, 0.2237, 0.1812, 0.1333, 0.1148, 0.160...","[0.17020000000000002, 0.178, 0.111, 0.091, 0.0...","[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499..."
5,5,id_00ab2d761,GGAAAGCGCCGCGGCGGUAGCGGCAGCGAGGAGCGCUACCAAGGCA...,.....(.(((((.(((((((((...........)))))))..(((....,EEEEESISSSSSISSSSSSSSSHHHHHHHHHHHSSSSSSSMMSSSH...,4.136,1,107,68,"[0.1942, 0.2041, 0.1626, 0.1213, 0.10590000000...","[0.2726, 0.2984, 0.21660000000000001, 0.1637, ...","[0.3393, 0.2728, 0.2005, 0.1703, 0.1495, 0.134...","[0.165, 0.20520000000000002, 0.179, 0.1333, 0....","[0.2864, 0.24710000000000001, 0.2222, 0.1903, ...","[0.7642, 1.6641, 1.0622, 0.5008, 0.4107, 0.133...","[0.9559000000000001, 1.9442, 1.0114, 0.5105000...","[1.9554, 2.1298, 1.0403, 0.609, 0.5486, 0.386,...","[0.22460000000000002, 1.7281, 1.381, 0.6623, 0...","[0.5882000000000001, 1.1786, 0.9704, 0.6035, 0..."
6,6,id_00abef1d7,GGAAAACAAUUGCAUCGUUAGUACGACUCCACAGCGUAAGCUGUGG...,.........((((((((......((((((((((((....)))))))...,EEEEEEEEESSSSSSSSIIIIIISSSSSSSSSSSSHHHHSSSSSSS...,2.485,1,107,68,"[0.422, 0.5478000000000001, 0.4749000000000000...","[0.4801, 0.7943, 0.42160000000000003, 0.397300...","[0.9822000000000001, 1.272, 0.6940000000000001...","[0.5827, 0.7555000000000001, 0.5949, 0.4511, 0...","[0.9306000000000001, 1.0496, 0.5844, 0.7796000...","[0.895, 2.3377, 2.2305, 2.003, 1.9006, 1.0373,...","[0.46040000000000003, 3.6695, 0.78550000000000...","[2.7711, 7.365, 1.6924000000000001, 1.43840000...","[1.073, 2.8604000000000003, 1.9936, 1.0273, 1....","[2.0964, 3.3688000000000002, 0.6399, 2.1053, 1..."
7,7,id_00b436dec,GGAAAUCAUCGAGGACGGGUCCGUUCAGCACGCGAAAGCGUCGUGA...,.....(((((((((((..(((((((((..((((....))))..)))...,EEEEESSSSSSSSSSSIISSSSSSSSSIISSSSHHHHSSSSIISSS...,1.727,1,107,68,"[0.4843, 0.5233, 0.4554, 0.43520000000000003, ...","[0.8719, 1.0307, 0.6649, 0.34500000000000003, ...","[0.7045, 0.7775000000000001, 0.5662, 0.4561, 0...","[0.384, 0.723, 0.4766, 0.30260000000000004, 0....","[0.7429, 0.9137000000000001, 0.480400000000000...","[1.1576, 1.5137, 1.3382, 1.5622, 1.2121, 0.295...","[1.6912, 5.2652, 2.3901, 0.45890000000000003, ...","[1.8641, 2.3767, 1.149, 1.0132, 0.9876, 0.0, 0...","[0.49060000000000004, 4.6339, 1.95860000000000...","[1.2852000000000001, 2.5460000000000003, 0.234..."


In [22]:
test.head()

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,seq_length,seq_scored
0,0,id_00073f8be,GGAAAAGUACGACUUGAGUACGGAAAACGUACCAACUCGAUUAAAA...,......((((((((((.(((((.....))))))))((((((((......,EEEEEESSSSSSSSSSBSSSSSHHHHHSSSSSSSSSSSSSSSSHHH...,107,68
1,1,id_000ae4237,GGAAACGGGUUCCGCGGAUUGCUGCUAAUAAGAGUAAUCUCUAAAU...,.....((((..((((((...(((((.....((((....)))).......,EEEEESSSSIISSSSSSIIISSSSSIIIIISSSSHHHHSSSSIIII...,130,91
2,2,id_00131c573,GGAAAACAAAACGGCCUGGAAGACGAAGGAAUUCGGCGCGAAGGCC...,...........((.(((.(.(..((..((..((((...))))..))...,EEEEEEEEEEESSISSSISISIISSIISSIISSSSHHHSSSSIISS...,107,68
3,3,id_00181fd34,GGAAAGGAUCUCUAUCGAAGGAUAGAGAUCGCUCGCGACGGCACGA...,......((((((((((....))))))))))((((((..((.(((.....,EEEEEESSSSSSSSSSHHHHSSSSSSSSSSSSSSSSIISSISSSHH...,107,68
4,4,id_0020473f7,GGAAACCCGCCCGCGCCCGCCCGCGCUGCUGCCGUGCCUCCUCUCC...,.....(((((((((((((((((((((((((((((((((((((((((...,EEEEESSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSSS...,130,91


In [23]:
fig = px.histogram(
    test,
    "seq_length",
    nbins=25,
    title='sequence_length distribution',
    width=800,
    height=400
)
fig.show()

In [24]:
public_df = test.query("seq_length == 107")
private_df = test.query("seq_length == 130")

In [25]:
token2int = {x: i for i, x in enumerate("().ACGUBEHIMSX")}
token2int

{'(': 0,
 ')': 1,
 '.': 2,
 'A': 3,
 'C': 4,
 'G': 5,
 'U': 6,
 'B': 7,
 'E': 8,
 'H': 9,
 'I': 10,
 'M': 11,
 'S': 12,
 'X': 13}

In [26]:
def dataframe_to_array(df):
   return np.transpose(np.array(df.values.tolist()), (0, 2, 1))

In [27]:
def dataframe_label_encoding(
    df, token2int, cols=["sequence", "structure", "predicted_loop_type"]
):
    return dataframe_to_array(
        df[cols].applymap(lambda seq: [token2int[x] for x in seq])
    ) ## tokenization of Sequence, Structure, Predicted loop


In [28]:
train_inputs = dataframe_label_encoding(train, token2int) ## Label encoding
train_labels = dataframe_to_array(train[target]) ## dataframe to 3D array to

In [29]:
train_labels, train_inputs

(array([[[ 0.3297,  0.7556,  0.3581,  2.3375,  0.6382],
         [ 1.5693,  2.983 ,  2.9683,  3.506 ,  3.4773],
         [ 1.1227,  0.2526,  0.2589,  0.3008,  0.9988],
         ...,
         [ 0.2937,  0.3336,  0.4812,  0.5142,  0.3287],
         [ 0.2362,  0.6491,  0.7026,  0.7681,  0.5301],
         [ 0.5731,  0.6898,  0.4254,  1.172 ,  0.8472]],
 
        [[ 0.4482,  0.2504,  0.5163,  2.243 ,  0.9501],
         [ 1.4822,  1.4021,  1.6823,  2.9361,  1.7975],
         [ 1.1819,  0.9804,  1.0426,  1.0553,  1.4991],
         ...,
         [ 0.6449,  1.6819,  1.464 ,  1.0621,  0.7789],
         [ 0.04  ,  0.5367,  0.4201,  0.4391,  0.3348],
         [ 0.5446,  1.24  ,  0.8575,  1.0076,  0.5827]],
 
        [[ 0.7642,  0.9559,  0.2246,  1.9554,  0.5882],
         [ 1.6641,  1.9442,  1.7281,  2.1298,  1.1786],
         [ 1.0622,  1.0114,  1.381 ,  1.0403,  0.9704],
         ...,
         [ 0.1107,  0.6401,  0.6563,  0.727 ,  0.3136],
         [ 0.2261,  0.573 ,  0.5931,  0.8425,  0.7776],


In [30]:
x_train, x_val, y_train, y_val = train_test_split(
    train_inputs, train_labels, test_size=0.1, random_state=34, stratify=train.SN_filter
)

In [31]:
public_inputs = dataframe_label_encoding(public_df, token2int)
private_inputs = dataframe_label_encoding(private_df, token2int)

In [32]:
def build_model(
    embed_size, # Length of unique tokens
    seq_len=107, # public dataset seq_len
    pred_len=68, # pred_len for public data
    dropout=0.5, # trying best dropout (general)
    sp_dropout=0.2, # Spatial Dropout
    embed_dim=200, # embedding dimension
    hidden_dim=256, # hidden layer units
):
    inputs = L.Input(shape=(seq_len, 3))
    embed = L.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)
    reshaped = tf.reshape(
        embed, shape=(-1, embed.shape[1], embed.shape[2] * embed.shape[3])
    )
    hidden = L.SpatialDropout1D(sp_dropout)(reshaped)
     # 3X BiGRU layers
    hidden = L.Bidirectional(
        L.GRU(
            hidden_dim,
            dropout=dropout,
            return_sequences=True,
            kernel_initializer="orthogonal",
        )
    )(hidden)
    hidden = L.Bidirectional(
        L.GRU(
            hidden_dim,
            dropout=dropout,
            return_sequences=True,
            kernel_initializer="orthogonal",
        )
    )(hidden)
    hidden = L.Bidirectional(
        L.GRU(
            hidden_dim,
            dropout=dropout,
            return_sequences=True,
            kernel_initializer="orthogonal",
        )
    )(hidden)
    # Since we are only making predictions on the first part of each sequence,
    # we have to truncate it
    truncated = hidden[:, :pred_len]
    out = L.Dense(5, activation="linear")(truncated)
    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(optimizer="Adam", loss=MCRMSE) # loss function as of Eval Metric
    return model

In [33]:
model = build_model(
    embed_size=len(token2int) ## embed_size = 14
)  ## unique token in sequence, structure, predicted_loop_type
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 107, 3)]          0         
                                                                 
 embedding (Embedding)       (None, 107, 3, 200)       2800      
                                                                 
 tf.reshape (TFOpLambda)     (None, 107, 600)          0         
                                                                 
 spatial_dropout1d (Spatial  (None, 107, 600)          0         
 Dropout1D)                                                      
                                                                 
 bidirectional (Bidirection  (None, 107, 512)          1317888   
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 107, 512)          118272

In [34]:
if Model_Train:
    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        batch_size=64,
        epochs=40,
        verbose=2,
        callbacks=[
            tf.keras.callbacks.ReduceLROnPlateau(patience=5),
            tf.keras.callbacks.ModelCheckpoint("Model/model.h5"),
        ],
    )

Epoch 1/40
30/30 - 22s - loss: 0.4582 - val_loss: 0.3816 - lr: 0.0010 - 22s/epoch - 735ms/step
Epoch 2/40



You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.



30/30 - 2s - loss: 0.3888 - val_loss: 0.3631 - lr: 0.0010 - 2s/epoch - 75ms/step
Epoch 3/40
30/30 - 2s - loss: 0.3669 - val_loss: 0.3450 - lr: 0.0010 - 2s/epoch - 75ms/step
Epoch 4/40
30/30 - 2s - loss: 0.3510 - val_loss: 0.3337 - lr: 0.0010 - 2s/epoch - 78ms/step
Epoch 5/40
30/30 - 2s - loss: 0.3418 - val_loss: 0.3233 - lr: 0.0010 - 2s/epoch - 76ms/step
Epoch 6/40
30/30 - 2s - loss: 0.3330 - val_loss: 0.3225 - lr: 0.0010 - 2s/epoch - 74ms/step
Epoch 7/40
30/30 - 2s - loss: 0.3258 - val_loss: 0.3095 - lr: 0.0010 - 2s/epoch - 75ms/step
Epoch 8/40
30/30 - 2s - loss: 0.3153 - val_loss: 0.2988 - lr: 0.0010 - 2s/epoch - 75ms/step
Epoch 9/40
30/30 - 2s - loss: 0.3076 - val_loss: 0.2963 - lr: 0.0010 - 2s/epoch - 76ms/step
Epoch 10/40
30/30 - 2s - loss: 0.3011 - val_loss: 0.2914 - lr: 0.0010 - 2s/epoch - 77ms/step
Epoch 11/40
30/30 - 2s - loss: 0.2940 - val_loss: 0.2813 - lr: 0.0010 - 2s/epoch - 75ms/step
Epoch 12/40
30/30 - 2s - loss: 0.2880 - val_loss: 0.2755 - lr: 0.0010 - 2s/epoch - 75ms/s

In [36]:
if Model_Train:
    fig = px.line(
        history.history,
        y=["loss", "val_loss"],
        labels={"index": "epoch", "value": "MCRMSE"},
        title="Training History",
    )
    fig.show()

In [37]:
model_public = build_model(seq_len=107, pred_len=107, embed_size=len(token2int))
model_private = build_model(seq_len=130, pred_len=130, embed_size=len(token2int))
model_public.load_weights("Model/model.h5")
model_private.load_weights("Model/model.h5")


In [38]:
public_preds = model_public.predict(public_inputs)
private_preds = model_private.predict(private_inputs)
private_preds.shape



(3005, 130, 5)

In [40]:
preds_ls = []
for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]
        single_df = pd.DataFrame(single_pred, columns=target)
        single_df["id_seqpos"] = [f"{uid}_{x}" for x in range(single_df.shape[0])]
        preds_ls.append(single_df)
preds_df = pd.concat(preds_ls)

In [41]:
preds_df.head()

Unnamed: 0,reactivity,deg_Mg_pH10,deg_Mg_50C,deg_pH10,deg_50C,id_seqpos
0,0.676972,0.59088,0.527546,2.047105,0.70769,id_00073f8be_0
1,2.258359,3.187475,3.372074,4.470756,3.012699,id_00073f8be_1
2,1.505049,0.604384,0.732632,0.654704,0.678325,id_00073f8be_2
3,1.397385,1.17047,1.728659,1.232965,1.776389,id_00073f8be_3
4,0.964195,0.592551,0.870513,0.522977,0.901311,id_00073f8be_4


In [43]:
submission = sample_df[["id_seqpos"]].merge(preds_df, on=["id_seqpos"])
submission.to_csv("Submission/submission.csv", index=False)
submission.head()

Unnamed: 0,id_seqpos,reactivity,deg_Mg_pH10,deg_Mg_50C,deg_pH10,deg_50C
0,id_00073f8be_0,0.676972,0.59088,0.527546,2.047105,0.70769
1,id_00073f8be_1,2.258359,3.187475,3.372074,4.470756,3.012699
2,id_00073f8be_2,1.505049,0.604384,0.732632,0.654704,0.678325
3,id_00073f8be_3,1.397385,1.17047,1.728659,1.232965,1.776389
4,id_00073f8be_4,0.964195,0.592551,0.870513,0.522977,0.901311
