In [3]:
 #@title Download data from GCP bucket
import sys

if 'google.colab' in sys.modules:
  !gsutil -m cp -r gs://indaba-data .
else:
  !mkdir -p indaba-data/train
  !wget -P indaba-data/train https://storage.googleapis.com/indaba-data/train/train.csv --continue
  !wget -P indaba-data/train https://storage.googleapis.com/indaba-data/train/train_mut.pt --continue
  !wget -P indaba-data/train https://storage.googleapis.com/indaba-data/train/train_wt.pt --continue

  !mkdir -p indaba-data/test
  !wget -P indaba-data/test https://storage.googleapis.com/indaba-data/test/test.csv --continue
  !wget -P indaba-data/test https://storage.googleapis.com/indaba-data/test/test_mut.pt --continue
  !wget -P indaba-data/test https://storage.googleapis.com/indaba-data/test/test_wt.pt --continue

Copying gs://indaba-data/README.txt...
/ [0 files][    0.0 B/   33.0 B]                                                Copying gs://indaba-data/test/test.csv...
/ [0 files][    0.0 B/290.0 KiB]                                                Copying gs://indaba-data/test/test_wt.pt...
Copying gs://indaba-data/test/test_mut.pt...
/ [0/9 files][    0.0 B/  3.3 GiB]   0% Done                                    / [0/9 files][    0.0 B/  3.3 GiB]   0% Done                                    Copying gs://indaba-data/train/train.csv...
/ [0/9 files][    0.0 B/  3.3 GiB]   0% Done                                    Copying gs://indaba-data/train/train_mut.pt...
/ [0/9 files][    0.0 B/  3.3 GiB]   0% Done                                    Copying gs://indaba-data/train/train_wt.pt...
/ [0/9 files][    0.0 B/  3.3 GiB]   0% Done                                    ==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object dow

In [4]:
#@title Imports and moving to working directory
import torch 
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader


# move to data folder
%cd indaba-data

/content/indaba-data


In [5]:
# Load Embedding tensors & Traing csv
# Embeddings were calculated using the ESM 650M pretrained model 
# Tensor shape of embedded data:  [data_len,1280] 
# There are no sequences in the Embedding tensors as we've performed an average of it (torch.mean(embed, dim=1))
# More details in https://huggingface.co/facebook/esm2_t33_650M_UR50D

wt_emb = torch.load("train/train_wt.pt")
mut_emb = torch.load("train/train_mut.pt")
df = pd.read_csv("train/train.csv")

In [6]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Reset the index of the dataframe
df.reset_index(drop=True, inplace=True)

# Split data into train and validation
wt_emb_train, wt_emb_val, mut_emb_train, mut_emb_val, df_train, df_val = train_test_split(wt_emb, mut_emb, df, test_size=0.21, random_state=42)

# Define the dataset class
class EmbeddingDataset(Dataset):
  def __init__(self, wt_pt, mut_pt, data_df):
    self.pt_wt = wt_pt
    self.pt_mut = mut_pt
    self.df = data_df

  def __len__(self):
    return len(self.pt_wt)

  def __getitem__(self, index):
    if "ddg" in self.df.columns:
      df_out = torch.Tensor([self.df.iloc[index]["ddg"]])
    else:
      df_out = torch.Tensor([self.df.iloc[index]["ID"]])

    return self.pt_wt[index,:], self.pt_mut[index,:], df_out

# Create separate datasets for the training and validation sets
train_dataset = EmbeddingDataset(wt_emb_train, mut_emb_train, df_train.reset_index(drop=True))
val_dataset = EmbeddingDataset(wt_emb_val, mut_emb_val, df_val.reset_index(drop=True))

# Create dataloaders for the training and validation sets
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)


In [7]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(input_size * 2, hidden_size, num_layers) # Multiply the input_size by 2
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1])
        return x



In [11]:
input_size = wt_emb_train.size(1)
hidden_size = 128
num_layers = 2
num_heads= 8
model = Net(input_size, hidden_size, num_layers)


# Define loss function (Mean Squared Error for Regression)
criterion = nn.MSELoss()

# Define the optimizer (Stochastic Gradient Descent)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)


NameError: ignored

In [None]:
#

torch.Size([32, 10])


In [13]:
import torch
from torch import nn

class StabilityModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_heads, num_layers):
        super(StabilityModel, self).__init__()
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(input_size, num_heads, hidden_size),
            num_layers
        )
        self.fc = nn.Linear(input_size, 1)

    def forward(self, x1, x2):
        x = x1 - x2
        x = x.transpose(0, 1)
        x = self.transformer(x)
        x = x.mean(dim=0)
        x = self.fc(x)
        return x


In [14]:
# Example of training script
device = torch.device("cuda")
model = StabilityModel(input_size, hidden_size, num_heads, num_layers).to(device)
optimizer = torch.optim.Adadelta(model.parameters(), lr=0.0001)
criterion = torch.nn.MSELoss()
epoch_loss = 0
for i in range(1):
    epoch_loss = 0
    for batch_idx, (data_mut, data_wt, target) in tqdm(enumerate(train_dataloader)):
        # extract input from datallader
        x1 = data_wt.to(device)
        x2 = data_mut.to(device)
        y = target.to(device)
        # make prediction
        y_pred = model(x1,x2)
        # calculate loss and run optimizer
        loss = torch.sqrt(criterion(y, y_pred))
        loss.backward()
        optimizer.step()
        epoch_loss += loss
    print("epoch_",i," = ", epoch_loss/len(train_dataloader))
    # [Recommended] Save trained models to select best checkpoint for prediction (or add prediction in the epochs loop)


0it [00:00, ?it/s]


AssertionError: ignored

## Prediction & submission

In [None]:
# load embedding tensors & traing csv
wt_test_emb = torch.load("test/test_wt.pt")
mut_test_emb = torch.load("test/test_mut.pt")
df_test = pd.read_csv("test/test.csv")

In [None]:
# creating testing dataset and loading the embedding
test_dataset = EmbeddingDataset(wt_test_emb,mut_test_emb,df_test)
# preparing a dataloader for the testing
test_dataloader = torch.utils.data.dataloader.DataLoader(
        test_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=2,
    )

In [None]:
df_result = pd.DataFrame()
with torch.no_grad():
  for batch_idx, (data_mut,data_wt , target) in tqdm(enumerate(test_dataloader)):
    x1 = data_wt.to(device)
    x2 = data_mut.to(device)
    id = target.to(device)
    # make prediction
    y_pred = model(x1,x2)
    df_result = pd.concat([df_result, pd.DataFrame({"ID":id.squeeze().cpu().numpy().astype(int) , "ddg" : y_pred.squeeze().cpu().numpy()})])

60it [00:00, 77.77it/s] 


In [None]:
df_result.to_csv("submission.csv",index=False)