In [1]:
#@title Download data from GCP bucket
import sys

if 'google.colab' in sys.modules:
  !gsutil -m cp -r gs://indaba-data .
else:
  !mkdir -p indaba-data/train
  !wget -P indaba-data/train https://storage.googleapis.com/indaba-data/train/train.csv --continue
  !wget -P indaba-data/train https://storage.googleapis.com/indaba-data/train/train_mut.pt --continue
  !wget -P indaba-data/train https://storage.googleapis.com/indaba-data/train/train_wt.pt --continue

  !mkdir -p indaba-data/test
  !wget -P indaba-data/test https://storage.googleapis.com/indaba-data/test/test.csv --continue
  !wget -P indaba-data/test https://storage.googleapis.com/indaba-data/test/test_mut.pt --continue
  !wget -P indaba-data/test https://storage.googleapis.com/indaba-data/test/test_wt.pt --continue

Copying gs://indaba-data/test/test_mut.pt...
/ [0 files][    0.0 B/  9.3 MiB]                                                Copying gs://indaba-data/README.txt...
Copying gs://indaba-data/test/test.csv...
Copying gs://indaba-data/train/train.csv...
Copying gs://indaba-data/train/train_wt.pt...
Copying gs://indaba-data/train/train_mut.pt...
Copying gs://indaba-data/test/test_wt.pt...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").



In [2]:
#@title Imports and moving to working directory
import torch 
import pandas as pd
from tqdm import tqdm

# move to data folder
%cd indaba-data

/content/indaba-data


In [3]:
# Load Embedding tensors & Traing csv
# Embeddings were calculated using the ESM 650M pretrained model 
# Tensor shape of embedded data:  [data_len,1280] 
# There are no sequences in the Embedding tensors as we've performed an average of it (torch.mean(embed, dim=1))
# More details in https://huggingface.co/facebook/esm2_t33_650M_UR50D

wt_emb = torch.load("train/train_wt.pt")
mut_emb = torch.load("train/train_mut.pt")
df = pd.read_csv("train/train.csv")

In [None]:
# [Recommended] Split data into train and validation 
#TODO

In [11]:
# Building the dataset class
class EmbeddingDataset(torch.utils.data.Dataset):
  def __init__(self,mut_pt, wt_pt, data_df):
    self.pt_mut = mut_pt
    self.pt_wt = wt_pt
    self.df = data_df
  
  def __len__(self):
      return self.pt_mut.shape[0]

  def __getitem__(self, index):
    o1=self.pt_mut[index,:]
    o2=self.pt_wt[index,:]
    if "ddg" in self.df:
      df_out=torch.Tensor([self.df["ddg"][index]])
    else:
      df_out=torch.Tensor([self.df["ID"][index]])
    return  self.pt_mut[index,:],self.pt_wt[index,:],df_out 

In [13]:
# creating training dataset and dataloader
train_dataset = EmbeddingDataset(wt_emb, mut_emb, df)
# preparing a dataloader for the training
train_dataloader = torch.utils.data.dataloader.DataLoader(
        train_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=2,
    )
# [Recommended] Use Data validation loader too


In [16]:
for i in range(5):  # change the number as needed
    print(train_dataset[i])

(tensor([-0.0632,  1.8905,  1.0130,  ...,  1.7288, -2.4825,  0.3187]), tensor([ 0.0070,  1.9001,  1.0074,  ...,  1.6119, -2.3543,  0.3594]), tensor([0.2288]))
(tensor([-0.0632,  1.8905,  1.0130,  ...,  1.7288, -2.4825,  0.3187]), tensor([ 0.0423,  1.9433,  0.9595,  ...,  1.7695, -2.4142,  0.3128]), tensor([0.4969]))
(tensor([-0.0632,  1.8905,  1.0130,  ...,  1.7288, -2.4825,  0.3187]), tensor([ 0.0293,  1.9336,  1.0884,  ...,  1.6638, -2.3769,  0.3196]), tensor([0.1630]))
(tensor([-0.0632,  1.8905,  1.0130,  ...,  1.7288, -2.4825,  0.3187]), tensor([-0.1233,  2.0180,  1.0377,  ...,  1.6911, -2.2926,  0.3670]), tensor([0.2090]))
(tensor([-0.0632,  1.8905,  1.0130,  ...,  1.7288, -2.4825,  0.3187]), tensor([ 0.0676,  1.9103,  1.0881,  ...,  1.6339, -2.3799,  0.2853]), tensor([0.4076]))


In [17]:
len(train_dataset)

339778

In [None]:
# Building a simple pytorch model
# A dummy model (2-param) that demonstrates the usage of the dataset

class StabilityModel(torch.nn.Module):
  def __init__(self):
    super(StabilityModel, self).__init__()
    self.lin = torch.nn.Linear(1,1)

  def forward(self, x, y):
    # run the forward pass
    # output should be the stability estimation [batch,estim]
    return self.lin(torch.mean(x-y,dim=1).reshape(-1,1)) 

In [None]:
# Example of training script
device = torch.device("cuda")
model =  StabilityModel().to(device)
optimizer = torch.optim.Adadelta(model.parameters(), lr=0.0001)
criterion = torch.nn.MSELoss()
epoch_loss = 0
for i in range(1):
  epoch_loss = 0
  for batch_idx, (data_mut,data_wt , target) in tqdm(enumerate(train_dataloader)):
      # extract input from datallader
      x1 = data_wt.to(device)
      x2 = data_mut.to(device)
      y = target.to(device)
      # make prediction
      y_pred = model(x1,x2)
      # calculate loss and run optimizer
      loss = torch.sqrt(criterion(y, y_pred))
      loss.backward()
      optimizer.step()
      epoch_loss += loss
  print("epoch_",i," = ", epoch_loss/len(train_dataloader))
  # [Recommended] Save trained models to select best checkpoint for prediction (or add prediction in the epochs loop)

10619it [00:49, 213.80it/s]

epoch_ 0  =  tensor(1.1049, device='cuda:0', grad_fn=<DivBackward0>)





## Prediction & submission

In [None]:
# load embedding tensors & traing csv
wt_test_emb = torch.load("test/test_wt.pt")
mut_test_emb = torch.load("test/test_mut.pt")
df_test = pd.read_csv("test/test.csv")

In [None]:
# creating testing dataset and loading the embedding
test_dataset = EmbeddingDataset(wt_test_emb,mut_test_emb,df_test)
# preparing a dataloader for the testing
test_dataloader = torch.utils.data.dataloader.DataLoader(
        test_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=2,
    )

In [None]:
df_result = pd.DataFrame()
with torch.no_grad():
  for batch_idx, (data_mut,data_wt , target) in tqdm(enumerate(test_dataloader)):
    x1 = data_wt.to(device)
    x2 = data_mut.to(device)
    id = target.to(device)
    # make prediction
    y_pred = model(x1,x2)
    df_result = pd.concat([df_result, pd.DataFrame({"ID":id.squeeze().cpu().numpy().astype(int) , "ddg" : y_pred.squeeze().cpu().numpy()})])

60it [00:00, 157.50it/s]


In [None]:
df_result.to_csv("submission.csv",index=False)