In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import plotly.express as px
from tensorflow.keras import layers
import tensorflow as tf
from sklearn.model_selection import train_test_split

from sklearn.manifold import TSNE

In [2]:
df = pd.read_json('./train.json', lines=True).drop('index', axis=1)
df = df.query("signal_to_noise >= 1")
test = pd.read_json('./test.json', lines=True).drop('index', axis=1)

In [3]:
def pandas_list_to_array(df):
    """
    Input: dataframe of shape (x, y), containing list of length l
    Return: np.array of shape (x, l, y)
    """
    
    return np.transpose(
        np.array(df.values.tolist()),
        (0, 2, 1)
    )

def preprocess_inputs(df, token2int, cols=['sequence', 'structure', 'predicted_loop_type']):
    return pandas_list_to_array(
        df[cols].applymap(lambda seq: [token2int[x] for x in seq])
    )

In [4]:
df.columns

Index(['id', 'sequence', 'structure', 'predicted_loop_type', 'signal_to_noise',
       'SN_filter', 'seq_length', 'seq_scored', 'reactivity_error',
       'deg_error_Mg_pH10', 'deg_error_pH10', 'deg_error_Mg_50C',
       'deg_error_50C', 'reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C',
       'deg_50C'],
      dtype='object')

# preprocessing

In [5]:
map_dict = {'(': 0, ')': 1, '.': 2, 'A': 3, 'C': 4, 'G': 5, 'U': 6, 'B': 7,
              'E': 8, 'H': 9, 'I': 10, 'M': 11, 'S': 12, 'X': 13}

measured_cols = ['reactivity_error', 'deg_error_Mg_pH10', 'deg_error_pH10', 'deg_error_Mg_50C',
            'deg_error_50C', 'reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
for col in measured_cols:
    df[col] = df[col].apply(lambda x: np.array(x))


df['corrected_reactivity'] = df.reactivity - df.reactivity_error
df['corrected_deg_Mg_pH10'] = df.deg_Mg_pH10 - df.deg_error_Mg_pH10
df['corrected_deg_pH10'] = df.deg_pH10 - df.deg_error_pH10
df['corrected_deg_50C'] = df.deg_50C - df.deg_error_50C
df['corrected_deg_Mg_50C'] = df.deg_Mg_50C - df.deg_error_Mg_50C

# visualizations

In [6]:
pred_cols = ['corrected_reactivity','corrected_deg_Mg_pH10','corrected_deg_pH10',
             'corrected_deg_50C','corrected_deg_Mg_50C']

input_cols = ['sequence', 'structure', 'predicted_loop_type']

# index = 1

# for col in pred_cols:
#     plt.plot(df[col].iloc[index])
#     plt.title(f'{col} Mean: {np.mean(df[col].iloc[index])}')
#     plt.show()
    
# plt.figure(figsize=(20,10))
# for col in pred_cols:
#     plt.plot(df[col].iloc[index],label=col)
# plt.legend()

# modeling

### codons

In [7]:
temp = df[['id']+input_cols]

for col in input_cols:
    temp[col] = temp[col].apply(lambda x: [x[i:i+3] for i in range(0,len(x),3)])
           
holder = temp.explode('sequence')[['id','sequence']]

for col in ['structure', 'predicted_loop_type']:
    holder[col] = temp[col].explode(col)
    
codons = pd.get_dummies(holder, columns=input_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [8]:
codons

Unnamed: 0,id,sequence_AAA,sequence_AAC,sequence_AAG,sequence_AAU,sequence_AC,sequence_ACA,sequence_ACC,sequence_ACG,sequence_ACU,...,predicted_loop_type_SSH,predicted_loop_type_SSI,predicted_loop_type_SSM,predicted_loop_type_SSS,predicted_loop_type_SSX,predicted_loop_type_SXS,predicted_loop_type_SXX,predicted_loop_type_XSS,predicted_loop_type_XXS,predicted_loop_type_XXX
0,id_001f94081,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,id_001f94081,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,id_001f94081,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,id_001f94081,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,id_001f94081,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399,id_fff546103,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2399,id_fff546103,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2399,id_fff546103,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2399,id_fff546103,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


### pandas feature set

In [9]:
temp = df[['id']+input_cols]

for col in input_cols:
    temp[col] = temp[col].apply(lambda x: np.array([map_dict[i] for i in x]))
           
        
holder = temp.explode('sequence')[['id','sequence']]

for col in ['structure', 'predicted_loop_type']:
    holder[col] = temp[col].explode(col)
    
feats = holder.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


### combine dummies to pandas feature set

In [10]:
new_codon = pd.DataFrame()

for ids in codons.id.unique():
    temp = (codons[codons.id == 'id_001f94081']
            .reset_index(drop=True)
            .reset_index()
            .rename(columns={'index':'reps'}))

    temp['reps'] = temp.reps.apply(lambda x: [1,2,3] if x <= 34 else [1,2])

    temp = temp.explode('reps')
    
    new_codon = new_codon.append(temp)

In [11]:
# len([i for i in new_codon.columns if 'sequence' in i])

In [12]:
# new_codon = new_codon[new_codon.columns[:67]]

In [13]:
from sklearn.decomposition import PCA

In [14]:
pca = PCA(n_components=21, svd_solver='full')
new_codon_decomp = pca.fit_transform(new_codon.iloc[:,2:])

print(sum(pca.explained_variance_ratio_))

1.0


In [15]:
feats = feats.reset_index(drop=True)

for i in feats.id.unique():
    idx = feats[feats.id==i].index
    feats.loc[idx, 'order'] = np.arange(1,len(idx)+1,1)

In [16]:
train_inputs = np.concatenate((feats.drop('id',axis=1).values,new_codon_decomp), axis=1)
train_inputs = train_inputs.astype(float)
train_inputs = np.absolute(train_inputs)

In [17]:
train_inputs.shape

(224379, 25)

In [18]:
train_inputs.max()

107.0

In [19]:
train_inputs = train_inputs.reshape(df.shape[0], 107, train_inputs.shape[-1])

In [20]:
def MCRMSE(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

def gru_layer(hidden_dim, dropout):
    return layers.Bidirectional(layers.LSTM(
        hidden_dim, activation='relu', dropout=dropout, return_sequences=True))


def build_model(seq_len, pred_len, feats_len):

    embed_size=108
    # seq_len=107
    # pred_len=68
    dropout=0.5
    sp_dropout=0.25
    embed_dim=107
    
    inputs = layers.Input(shape=(seq_len, feats_len))

    embed = layers.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)

    hidden = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3])
    )

    hidden = layers.SpatialDropout1D(sp_dropout)(hidden)

    hidden = layers.Bidirectional(layers.GRU(
            200, activation='relu', dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))(hidden)

    hidden = layers.Bidirectional(layers.GRU(
            300, activation='relu', dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))(hidden)

    hidden = layers.Bidirectional(layers.LSTM(
        200, activation='relu', dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))(hidden)
    
    # Since we are only making predictions on the first part of each sequence, 
    # we have to truncate it
#     truncated = hidden[:, :pred_len]
    truncated = layers.Dense(68, activation='relu')(hidden)
    out = layers.Dense(5, activation='relu')(truncated)

    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.optimizers.Adam(lr=0.001), loss=MCRMSE)

#     model.summary()
    
    return model




In [21]:
%%time

# train_inputs = preprocess_inputs(df, map_dict)
train_labels = pandas_list_to_array(df[pred_cols])

x_train, x_val, y_train, y_val = train_test_split(train_inputs, train_labels, 
                                                  test_size=.1, random_state=0, stratify=df.SN_filter)

model = build_model(107, 68, train_inputs.shape[-1])

history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=50,
    epochs=100,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',patience=2),
        tf.keras.callbacks.ModelCheckpoint('model.h5')
    ]
)

Train on 1677 samples, validate on 420 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100


Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100

KeyboardInterrupt: 

In [22]:
fig = px.line(
    history.history, y=['loss', 'val_loss'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='Training History')

fig.show()

NameError: name 'history' is not defined

In [61]:
# Caveat: The prediction format requires the output to be the same length as the input,
# although it's not the case for the training data.
model_public = build_model(seq_len=107, pred_len=107)
model_private = build_model(seq_len=130, pred_len=130)

model_public.load_weights('model.h5')
model_private.load_weights('model.h5')

public_df = test.query("seq_length == 107")
private_df = test.query("seq_length == 130")

public_inputs = preprocess_inputs(public_df, map_dict)
private_inputs = preprocess_inputs(private_df, map_dict)

public_preds = model_public.predict(public_inputs)
private_preds = model_private.predict(private_inputs)

In [62]:
preds_ls = []

for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_ls.append(single_df)

preds_df = pd.concat(preds_ls)
preds_df.head()

preds_df.columns = [i.replace('corrected_','') for i in preds_df.columns]

preds_df.to_csv('./submission_two.csv', index=False)

In [63]:
preds_df.shape

(457953, 6)