In [7]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset,DataLoader
import gc
from torch import nn, Tensor
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import math
import torch.nn.functional as F
from tqdm.notebook import tqdm
from hyperparameters import config
from torchsummary import summary

# DATASET

In [2]:
class RNA_Dataset(Dataset):
    def __init__(self, df,df_457=None,df_307=None,mode = 'train'):
        self.mode = mode
        df_a = df.loc[df.experiment_type == '2A3_MaP'].reset_index(drop=True)
        df_d = df.loc[df.experiment_type == 'DMS_MaP'].reset_index(drop=True)
        self.seq = df_a['sequence'].values
        self.react_a = df_a[[c for c in df_a.columns if 'reactivity_0' in c]].values
        self.react_d = df_d[[c for c in df_d.columns if 'reactivity_0' in c]].values
        
        if self.mode == 'train':
            a_457 = df_457.loc[df_457.experiment_type == '2A3_MaP'].reset_index(drop=True)
            d_457 = df_457.loc[df_457.experiment_type == 'DMS_MaP'].reset_index(drop=True)
            self.seq_457 = a_457['sequence'].values
            self.reactivity_a_457 = a_457[[c for c in a_457.columns if 'reactivity' in c]].values
            self.reactivity_d_457 = d_457[[c for c in d_457.columns if 'reactivity' in c]].values
        
        
            a_307 = df_307.loc[df_307.experiment_type == '2A3_MaP'].reset_index(drop=True)
            d_307 = df_307.loc[df_307.experiment_type == 'DMS_MaP'].reset_index(drop=True)
            self.seq_307 = a_307['sequence'].values
            self.reactivity_a_307 = a_307[[c for c in a_307.columns if 'reactivity' in c]].values
            self.reactivity_d_307 = d_307[[c for c in d_307.columns if 'reactivity' in c]].values
            del df_a, df_d,a_457,d_457,a_307,d_307
            gc.collect()
            
        self.nucleotid_mapper = {'A':0,'G':1,'C':2,'U':3}
            
        

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, item):
        
        if self.mode == 'train':
            idx = np.random.choice(3000)
            if np.random.choice(10) == 9:
                # using fake sequences
                if np.random.choice(10) >=5:
                    seq = self.seq_457[idx]
                    target = torch.from_numpy(np.stack([self.reactivity_a_457[idx], self.reactivity_d_457[idx]], -1))
                    emb = torch.LongTensor([self.nucleotid_mapper[x] for x in seq])
                    return emb,target
                else:
                    seq = self.seq_307[idx]
                    target = torch.from_numpy(np.stack([self.reactivity_a_307[idx], self.reactivity_d_307[idx]], -1))
                    emb = torch.LongTensor([self.nucleotid_mapper[x] for x in seq])
                    return emb,target
                
            seq= self.seq[item]
            emb = torch.LongTensor([self.nucleotid_mapper[x] for x in seq])
            target = torch.from_numpy(np.stack([self.react_a[item], self.react_d[item]], -1))


            return emb,target
        else:
            seq= self.seq[item]
            emb = torch.LongTensor([self.nucleotid_mapper[x] for x in seq])
            target = torch.from_numpy(np.stack([self.react_a[item], self.react_d[item]], -1))


            return emb,target
        
class RNA_Test_Dataset(Dataset):
    def __init__(self, df):
        self.seq = df['sequence'].values
        self.nucleotid_mapper = {'A': 0, 'G': 1, 'C': 2, 'U': 3}
        
        del df
        gc.collect()    

    def __len__(self):
        return len(self.seq)

    def __getitem__(self, item):
        seq= self.seq[item]
        emb = torch.LongTensor([self.nucleotid_mapper[x] for x in seq])
        return emb

In [53]:
train = pd.read_parquet('train_files/clean_train.parquet')
train_307 = pd.read_parquet('train_files/generated_seq_307.parquet')
train_457 = pd.read_parquet('train_files/generated_seq_457_new.parquet')
valid = pd.read_parquet('train_files/valid_with_structure.parquet')

# Model

In [54]:
class TransformerModel(nn.Module):
    def __init__(self, dim=192, depth=12, head_size=32, **kwargs):
        super().__init__()
        self.emb = nn.Embedding(4,dim)
        self.pos_encoder = PositionalEncoding(dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim, nhead=dim//head_size, dim_feedforward=4*dim,
                dropout=0.5, activation='relu', batch_first=True, norm_first=True), depth)
        self.linear = nn.Linear(dim,2)
        self.dim_model = dim
    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)
    def forward(self, x):
        
        # pos = torch.arange(x.shape[1], device=x.device).unsqueeze(0)
        # pos = self.pos_encoder(pos)
        x = self.emb(x)
        x = self.pos_encoder(x)
        
        x = self.transformer(x)
        x = self.linear(x)
        
        return x
    
class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim=16, M=10000):
        super().__init__()
        self.dim = dim
        self.M = M

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(self.M) / half_dim
        emb = torch.exp(torch.arange(half_dim, device=device) * (-emb))
        emb = x[...,None] * emb[None,...]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb
    
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 500):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
    
class AbsolutePositionalEncoder(nn.Module):
    def __init__(self, emb_dim, max_position=512):
        super(AbsolutePositionalEncoder, self).__init__()
        self.position = torch.arange(max_position).unsqueeze(1)

        self.positional_encoding = torch.zeros(1, max_position, emb_dim)

        _2i = torch.arange(0, emb_dim, step=2).float()

        # PE(pos, 2i) = sin(pos/10000^(2i/d_model))
        self.positional_encoding[0, :, 0::2] = torch.sin(self.position / (10000 ** (_2i / emb_dim)))

        # PE(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        self.positional_encoding[0, :, 1::2] = torch.cos(self.position / (10000 ** (_2i / emb_dim)))

    def forward(self, x):
        # batch_size, input_len, embedding_dim
        batch_size, seq_len, _ = x.size()

        return self.positional_encoding[:batch_size, :seq_len, :]

In [55]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [56]:
train_ds = RNA_Dataset(train,train_457,train_307,mode='train')
valid_ds = RNA_Dataset(valid,mode='valid')
train_loader = DataLoader(train_ds,batch_size=config.batch_size,shuffle=True,num_workers=8,drop_last=True)
valid_loader = DataLoader(valid_ds,batch_size=config.batch_size,shuffle=False,num_workers=8,drop_last=False)

# Train and Inferenece

In [57]:
def loss_fn(pred, target):
    if target.shape[0] > 206:
        loss = F.l1_loss(pred, target.clip(0,1), reduction='none')
        loss = 0.5 * loss[~torch.isnan(loss)].mean()
    else:
        loss = F.l1_loss(pred, target.clip(0,1), reduction='none')
        loss = loss[~torch.isnan(loss)].mean()
    return loss

In [58]:
def learn(model,train_loader,valid_loader,loss_fn,resume = False):
    optimizer = torch.optim.SGD(model.parameters(),lr=0.005,momentum=0.9,weight_decay=0.0005)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,verbose=True,patience=5)
    last_epoch = 0
    best_train_loss =float('inf')
    best_valid_loss = float('inf')
    best_compet_score = float('inf')
    model.to(device)
    if resume:
        print('Loading last checkpoint...')
        model.load_state_dict(config.ckp['weights'])
        optimizer.load_state_dict(config.ckp['optimizer'])
        best_train_loss = config.ckp['best_train_loss']
        best_valid_loss = config.ckp['best_valid_loss']
        best_compet_score = config.ckp['best_compet_score']
        last_epoch = config.ckp['epoch']
    print(f'Starting train from: Epoch: {last_epoch} | Best valid loss : {best_valid_loss:.5f} | compet score: {best_compet_score:.5f}\t')

    for epoch in range(last_epoch,1000):
        model.train()
        train_loss = 0


        for x,y in tqdm(train_loader):

            optimizer.zero_grad()
            x,y = x.to(device),y.to(device).squeeze(dim=0)
            prediction = model(x)
            pred_a,pred_d = prediction.squeeze(dim=0)[:,0].unsqueeze(dim=-1),prediction.squeeze(dim=0)[:,1].unsqueeze(dim=-1)
            if prediction.shape[1]<206:
                pred_a = F.pad(pred_a,[0,0,0,206-pred_a.shape[0]])
                pred_d = F.pad(pred_d,[0,0,0,206-pred_d.shape[0]])
            target_a,target_d = y[:,0].unsqueeze(dim=-1),y[:,1].unsqueeze(dim=-1)


            loss = (loss_fn(pred_a,target_a) + loss_fn(pred_d,target_d))/2
            loss.sum().backward()
            optimizer.step()
            train_loss += loss.sum().item()



        train_loss = train_loss/len(train_loader)

        model.eval()
        valid_loss = 0
        compet_score = 0
        with ((torch.inference_mode())):
            for x,y in tqdm(valid_loader):
                x,y = x.to(device),y.to(device).squeeze(dim=0)
                prediction = model(x)
                
                pred_a,pred_d = prediction.squeeze(dim=0)[:,0].unsqueeze(dim=-1),prediction.squeeze(dim=0)[:,1].unsqueeze(dim=-1)
                if prediction.shape[1]<206:
                    pred_a = F.pad(pred_a,[0,0,0,206-pred_a.shape[0]])
                    pred_d = F.pad(pred_d,[0,0,0,206-pred_d.shape[0]])
                    
                target_a,target_d = y[:,0].unsqueeze(dim=-1),y[:,1].unsqueeze(dim=-1)
                
                loss = (loss_fn(pred_a, target_a) + loss_fn(pred_d, target_d)) / 2
                

                valid_loss += loss.sum().item()

                score = (loss_fn(pred_a, target_a) + loss_fn(pred_d, target_d)) / 2
                compet_score += score.item()
            compet_score /= len(valid_loader)
            valid_loss = valid_loss/len(valid_loader)

        scheduler.step(compet_score)
        print(f"Epoch {epoch + 1}:| Train Loss: {train_loss:.5f} | Valid Loss: {valid_loss:.5f} | compet score: {compet_score:.5f}")
        # saving model
        if valid_loss < best_valid_loss or compet_score < best_compet_score:
            if train_loss < best_train_loss:
                best_train_loss = train_loss
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
            if compet_score < best_compet_score:
                best_compet_score = compet_score

            torch.save({'weights': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'epoch': epoch+1,
                            'best_train_loss': best_train_loss,
                            'best_valid_loss': best_valid_loss,
                            'best_compet_score': best_compet_score,
                            },f'logs/train_logs.pth')
            print('Train logs saved.')
            torch.cuda.empty_cache()
            gc.collect()
        if (epoch+1) % 16 == 0:
            print('Creating submission...\t')

            test_seq = pd.read_parquet('train_files/test_seq_struct.parquet')
            test_ds = RNA_Test_Dataset(test_seq)
            test_dataloader = DataLoader(test_ds, batch_size=1, drop_last=False, num_workers=8)

            model.eval()
            preds = []
            with torch.inference_mode():
                for x in tqdm(test_dataloader):
                    prediction = model(x.to(device))

                    preds.append(prediction.squeeze(dim=0).detach().cpu().numpy())

            preds = np.concatenate(preds, dtype=np.float32)

            submission = pd.DataFrame({'reactivity_DMS_MaP': preds[:, 1], 'reactivity_2A3_MaP': preds[:, 0]})
            submission = submission.clip(0, 1, axis=0)
            submission = submission.reset_index().rename(columns={'index': 'id'})
            submission.to_parquet(f'logs/submission_{epoch+1}_valid_metric:{compet_score:.5f}_valid_loss:{valid_loss:.5f}.parquet')
            del test_seq, test_ds,test_dataloader,submission,preds
            torch.cuda.empty_cache()
            gc.collect()


In [59]:
model = TransformerModel(dim=512,depth=12).to(device)

In [60]:
learn(model,train_loader,valid_loader,loss_fn,resume=True)

Starting train from: Epoch: 0 | Best valid loss : inf | compet score: inf	


  0%|          | 0/164782 [00:00<?, ?it/s]

  0%|          | 0/16479 [00:00<?, ?it/s]

Epoch 1:| Train Loss: 0.23578 | Valid Loss: 0.23480 | compet score: 0.23480
Train logs saved.


  0%|          | 0/164782 [00:00<?, ?it/s]

  0%|          | 0/16479 [00:00<?, ?it/s]

Epoch 2:| Train Loss: 0.22501 | Valid Loss: 0.23481 | compet score: 0.23481


  0%|          | 0/164782 [00:00<?, ?it/s]

KeyboardInterrupt: 