# Purpose

Develop a baseline model that runs end to end. It will perform poorly, but it works.



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

import torch
import torch.nn as nn
import transformers

from sklearn.preprocessing import OneHotEncoder, StandardScaler

import random

from transformers import AutoTokenizer

In [None]:
# Execute the companion notebook so that the names it defines become available in this kernel.
# NOTE(review): presumably this is where `plot_multiple_structures` (called in the last
# plotting cell, but never defined in this notebook) comes from — confirm.
%run data_analysis.ipynb

In [None]:
# Directory holding the competition CSV files. Change this single constant to
# point the notebook at a different copy of the data.
DATA_DIR = 'stanford-rna-3d-folding'

# Per-residue rows (ID, resname and x_1/y_1/z_1 coordinates are used below).
train_labels = pd.read_csv(f'{DATA_DIR}/train_labels.csv')

# Per-target metadata (target_id and description are used below).
train_sequences = pd.read_csv(f'{DATA_DIR}/train_sequences.csv')

sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

test_sequences = pd.read_csv(f'{DATA_DIR}/test_sequences.csv')

In [None]:
# Preview the first rows of both training tables (sequences first, then labels).
display(train_sequences.head(), train_labels.head())

# Preprocess data

In [None]:
# Columns of train_sequences that are not used as model inputs in this notebook.
UNUSED_SEQUENCE_COLUMNS = ['all_sequences', 'temporal_cutoff', 'sequence']

# errors='ignore' keeps this cell idempotent: re-running it after the columns
# have already been removed is a no-op instead of a KeyError.
train_sequences = train_sequences.drop(columns=UNUSED_SEQUENCE_COLUMNS, errors='ignore')

train_sequences

In [None]:
def get_rows_with_nan(df: pd.DataFrame):
    """Return the rows of ``df`` that contain at least one missing value."""
    row_has_missing = df.isna().any(axis='columns')
    return df[row_has_missing]

def remove_rows_with_nan(df: pd.DataFrame, exclude_columns=None):
    """Return ``df`` without the rows that contain missing values.

    Args:
        df: Frame to filter.
        exclude_columns: Optional collection of column names whose missing
            values are tolerated. When empty or ``None`` every column is
            checked.

    Returns:
        A new frame holding only the rows with no missing value in the
        checked columns.
    """
    if not exclude_columns:
        return df.dropna()

    checked_columns = [name for name in df.columns if name not in exclude_columns]
    return df.dropna(subset=checked_columns)

def remove_sequences_with_nan(df: pd.DataFrame):
    """Drop every row that belongs to a sequence having at least one NaN row.

    A row's sequence key is the first two underscore-separated parts of its
    ``ID`` value joined by ``'_'``. If any row of a sequence contains a missing
    value, all rows sharing that key are removed.
    """
    def sequence_key(row_id):
        # First two '_'-separated fields of the row ID, re-joined.
        parts = row_id.split('_')
        return parts[0] + '_' + parts[1]

    incomplete_keys = get_rows_with_nan(df)['ID'].apply(sequence_key)
    row_keys = df['ID'].apply(sequence_key)

    belongs_to_complete_sequence = ~row_keys.isin(incomplete_keys)
    return df[belongs_to_complete_sequence]

In [None]:
# Labels: discard whole sequences that contain any row with a missing value.
train_labels = train_labels.pipe(remove_sequences_with_nan)
# Sequence metadata: discard individual rows with a missing value.
train_sequences = train_sequences.pipe(remove_rows_with_nan)

In [None]:
all_train_data = train_labels.copy()

# Sequence key = first two underscore-separated parts of the row ID.
all_train_data['target_id'] = all_train_data['ID'].apply(lambda x: x.split('_')[0] + '_' + x.split('_')[1])

# Inner join: a target whose metadata row was removed from train_sequences by
# the NaN filtering must not come back with NaN metadata columns, which a left
# join would do.
all_train_data = pd.merge(all_train_data, train_sequences, on='target_id', how='inner')


# The labels become per-step deltas: the change of each coordinate relative to
# the previous row of the same target.
coords = ['x_1', 'y_1', 'z_1']
delta_columns = [f'{coord}_delta' for coord in coords]

for coord in coords:
    all_train_data[f'{coord}_delta'] = all_train_data.groupby('target_id')[coord].diff()

# Residue 1 has no predecessor, so it has no delta and is dropped. The extra
# notna() guard also removes any other row whose delta is NaN (for example a
# target whose first row is not numbered 1), so NaN never reaches the loss.
is_not_first_residue = all_train_data['ID'].apply(lambda x: x.split('_')[2] != '1')
has_all_deltas = all_train_data[delta_columns].notna().all(axis=1)
all_train_data = all_train_data[is_not_first_residue & has_all_deltas]

# Absolute coordinates are no longer needed once the deltas exist.
all_train_data = all_train_data.drop(columns=coords)

# Placeholder columns for the previously predicted deltas, initialised to 0.
for coord in coords:
    all_train_data[f'pred_{coord}_delta'] = 0


all_train_data

In [None]:
# Tokenizer used for the free-text `description` column; every description is
# padded or truncated to `max_length` tokens.
model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
max_length = 32
tokenizer = AutoTokenizer.from_pretrained(model_name)

# One-hot encoder for residue names, fitted on the names present in the training table.
res_name_encoder = OneHotEncoder().fit(all_train_data['resname'].values.reshape(-1, 1))

def tokenize_sequence(sequence: str, tokenizer: transformers.PreTrainedTokenizer, max_length: int):
    """Tokenize ``sequence`` to a fixed length and return the tokenizer's encoding.

    Args:
        sequence: Text to tokenize.
        tokenizer: Callable tokenizer that receives the text and the options below.
        max_length: Length the text is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns when called with ``return_tensors='pt'``.
    """
    fixed_length_options = dict(
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    return tokenizer(sequence, **fixed_length_options)


def get_input_output_tensor(df: pd.DataFrame):
    """Build the model input matrix, the target matrix and the input column layout.

    The input matrix concatenates four column groups, in this order: the token
    ids of ``description``, the matching attention mask, the one-hot encoded
    ``resname`` and the ``pred_*_delta`` columns.

    Args:
        df: Frame with ``description``, ``resname``, ``pred_<coord>_delta`` and
            ``<coord>_delta`` columns for every entry of the global ``coords``.

    Returns:
        A tuple ``(inputs, outputs, locations)``. ``inputs`` is the float
        concatenation of the four groups, ``outputs`` holds the
        ``<coord>_delta`` columns as float32, and ``locations`` maps each
        group name to its ``(start, stop)`` column range inside ``inputs``.
    """
    encodings = [tokenize_sequence(text, tokenizer, max_length) for text in df['description']]
    input_ids = torch.cat([encoding['input_ids'] for encoding in encodings], dim=0)
    attention_mask = torch.cat([encoding['attention_mask'] for encoding in encodings], dim=0)

    resname_column = df['resname'].values.reshape(-1, 1)
    res_names = torch.tensor(res_name_encoder.transform(resname_column).toarray())

    pred_columns = [f'pred_{coord}_delta' for coord in coords]
    pred_delta = torch.tensor(df[pred_columns].values)

    target_columns = [f'{coord}_delta' for coord in coords]
    outputs = torch.tensor(df[target_columns].values, dtype=torch.float32)

    # Column groups in the order they are concatenated into the input matrix.
    feature_groups = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'res_names': res_names,
        'pred_delta': pred_delta,
    }

    # Walk the groups with a running offset to record each group's column range.
    locations = {}
    start = 0
    for group_name, group in feature_groups.items():
        stop = start + group.shape[1]
        locations[group_name] = (start, stop)
        start = stop

    inputs = torch.cat([group.float() for group in feature_groups.values()], dim=1)
    return inputs, outputs.float(), locations


In [None]:
# Fraction of the targets, taken in order of first appearance, used for training.
percentage = 0.5

unique_ids = all_train_data['target_id'].unique()
selected_ids = unique_ids[:int(percentage * len(unique_ids))]

# Group the table once instead of re-scanning every row for each target.
rows_by_target = all_train_data.groupby('target_id', sort=False)
split_all_train_data = [rows_by_target.get_group(target_id) for target_id in selected_ids]

train_inputs = []
train_outputs = []

# One (inputs, outputs) tensor pair per target. `locations` is overwritten on
# every iteration, so later cells see the layout returned for the last target.
# The loop variables deliberately avoid the builtin names `id` and `input`.
for target_frame in split_all_train_data:
    target_inputs, target_outputs, locations = get_input_output_tensor(target_frame)
    train_inputs.append(target_inputs)
    train_outputs.append(target_outputs)


In [None]:
# Report the tensor shapes of every training target, then the input column layout.
for i, (features, targets) in enumerate(zip(train_inputs, train_outputs)):
    print(f"On #{i}. Input Shape: {features.shape}. Output Shape: {targets.shape}")

print("Locations:")
display(locations)

# Model

In [None]:
# Predict the deltas for an entire sequence in one pass, starting at residue index 2 (residue 1 was dropped because it has no delta).

class AutoRegressiveNN(nn.Module):
    """Three-layer MLP applied row by row, feeding each prediction into the next row.

    The last ``output_size`` features of every input row are the
    "previous prediction" slot. Row 0 is used as given; for every later row
    that slot is replaced by the prediction made for the row before it.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of features per input row.
            hidden_size: Width of both hidden layers.
            output_size: Number of values predicted per row; also the width of
                the "previous prediction" slot at the end of each input row.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, output_size)

    def _predict_row(self, row: torch.Tensor):
        """Run a single feature row through the MLP."""
        hidden = self.relu1(self.fc1(row))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

    def forward(self, x: torch.Tensor):
        """Predict one output row per input row, autoregressively.

        Args:
            x: 2-D tensor with one feature row per step. It is detached and
                never modified, so no gradient flows back into ``x``.

        Returns:
            Tensor of shape ``(len(x), output_size)``. Gradients flow through
            the chain of fed-back predictions.
        """
        features = x.detach()
        slot_width = self.fc3.out_features

        if len(features) == 0:
            # torch.stack rejects an empty list; return an explicit empty result.
            return features.new_zeros((0, slot_width))

        predictions = []
        previous = None
        for i in range(len(features)):
            row = features[i]
            if previous is not None:
                # Build the row out of place instead of cloning the whole input
                # on every step: keep the leading features, append the last prediction.
                row = torch.cat((row[:-slot_width], previous))
            previous = self._predict_row(row)
            predictions.append(previous)

        return torch.stack(predictions, dim=0)

    def train_model(self, data, target, epochs=100):
        """Train with AdamW (lr=0.001) and MSE loss, one optimizer step per sequence.

        Args:
            data: Sequence of 2-D input tensors, one per training sequence.
            target: Sequence of target tensors; ``target[i]`` belongs to ``data[i]``.
            epochs: Number of passes over ``data``.

        Returns:
            List with one float per epoch: the mean loss over the sequences
            trained in that epoch (``nan`` if none were trained).
        """
        losses = []

        criterion = nn.MSELoss()
        optimizer = torch.optim.AdamW(self.parameters(), lr=0.001)

        for epoch in range(epochs):
            self.train()
            batch_losses = []

            for i, batch in enumerate(data):
                if len(batch) == 0:
                    # Nothing to predict; an empty batch has no usable loss.
                    continue

                optimizer.zero_grad()

                prediction = self(batch)
                loss = criterion(prediction, target[i])

                loss.backward()
                optimizer.step()
                batch_losses.append(loss.item())

            # Report the epoch as a whole, not only its last sequence.
            epoch_loss = sum(batch_losses) / len(batch_losses) if batch_losses else float('nan')
            print(f"Epoch: {epoch}. Loss: {epoch_loss}")
            losses.append(epoch_loss)

        return losses





In [None]:
# Training configuration for this run.
SEED = 0
HIDDEN_SIZE = 20
EPOCHS = 50

# Seed before building the model so the weight initialisation, and therefore
# the loss curve, is the same on every Restart & Run All.
torch.manual_seed(SEED)

ann = AutoRegressiveNN(train_inputs[0].shape[1], HIDDEN_SIZE, train_outputs[0].shape[1])

# Anomaly detection is a debugging aid that slows down every backward pass, so
# it is switched off explicitly here (it may still be on in a long-lived
# kernel). Pass True instead when debugging a failing backward pass.
torch.autograd.set_detect_anomaly(False)
losses = ann.train_model(train_inputs, train_outputs, EPOCHS)

In [None]:
# Plot the loss recorded for each epoch against the epoch index.
fig, ax = plt.subplots()
ax.plot(np.arange(len(losses)), np.array(losses))
ax.set_title("Losses over time")

plt.show()

In [None]:
def convert_output_to_points_for_plotting(input: torch.tensor, output: torch.tensor, target: torch.tensor):
    """Turn per-row deltas into cumulative 3-D paths for plotting.

    Both paths start at the origin and accumulate one delta per row of
    ``input``. The residue name of every row is decoded from its one-hot slice
    of ``input``, located through the global ``locations['res_names']`` and
    decoded with the global ``res_name_encoder``.

    Args:
        input: Model input rows; only the one-hot residue slice is read.
        output: Predicted deltas; columns 0, 1, 2 are read as x, y, z.
        target: Actual deltas, read the same way as ``output``.

    Returns:
        ``(x, y, z, actual_x, actual_y, actual_z, sequences)`` as numpy arrays,
        each with one more entry than ``input`` has rows.
    """
    n_rows = len(input)

    def cumulative_path(deltas, axis):
        # Running sum of one delta column, starting from 0.
        path = [0]
        for row in range(n_rows):
            path.append(path[-1] + deltas[row, axis].detach().item())
        return np.array(path)

    sequences = ['G'] # TODO this is wrong. Somehow keep the previous sequence in the training data and put here
    for row in range(n_rows):
        res_start, res_stop = locations['res_names']
        one_hot = input[row, res_start:res_stop].detach().numpy().reshape(1, -1)
        decoded = res_name_encoder.inverse_transform(one_hot)
        sequences.append(decoded[0][0])

    predicted_path = [cumulative_path(output, axis) for axis in range(3)]
    actual_path = [cumulative_path(target, axis) for axis in range(3)]

    return (*predicted_path, *actual_path, np.array(sequences).astype(object))




In [None]:
# Predict the first training sequence and plot it next to the actual structure.
sample_input, sample_target = train_inputs[0], train_outputs[0]
output_0 = ann.forward(sample_input)

x, y, z, actual_x, actual_y, actual_z, sequences = convert_output_to_points_for_plotting(sample_input, output_0, sample_target)
name = 'idk something'

# Each entry is [x, y, z, residue names, label]: prediction first, then the actual path.
predicted_structure = [x, y, z, sequences, name]
actual_structure = [actual_x, actual_y, actual_z, sequences, name]
combined = [predicted_structure, actual_structure]

plot_multiple_structures(combined)

In [None]:
# sample_submission