# Model 
This notebook is used for designing the custom Albert model. The model is also being tested as sanity check. The custom model is implemented as a Pytorch Lightning module.

**Current Tasks Remaining**:-
1. Implement the training_step(), configure_optimizer().
2. Modify the dataloaders. 
3. sanity Check run.
4. Increase MLM data size.

The forward function needs to have all the parameters mentioned at this [link](https://github.com/huggingface/transformers/blob/master/src/transformers/models/albert/modeling_albert.py#L640). The output from this model will then be passed into the MLM head of the Albert and the loss used will be **CrossEntropyLoss** with **Adam** Optimizer.

### Note:
Will also need to check the batch size permissible along with the gpus option on PL.

In [1]:
from transformers import AlbertConfig, AlbertModel
from transformers.modeling_albert import AlbertMLMHead
import torch
from torch import nn
import pytorch_lightning as pl

## Defining custom Albert Config

Refer to [this](https://github.com/huggingface/transformers/blob/48cc224703a8dd8d03d2721c8651fea8704d994b/src/transformers/models/albert/configuration_albert.py#L33) link to understand the meaning of the parameters

In [2]:
# Hyperparameters
vocab_size = 10
embedding_size = 16
hidden_size = 768
num_attention_heads = 12
intermediate_size = 3072


In [3]:
# Defining Custom Albert Model config
custom_config = AlbertConfig(
    vocab_size=vocab_size, # A, C, T, G, U, UNK, MASK, PAD, CLS, SEP
    embedding_size=embedding_size, #this will be scaled to 32 and 64 for ablation experiments
    hidden_size=hidden_size,
    num_attention_heads=num_attention_heads,
    intermediate_size=intermediate_size,
)

In [4]:
custom_model = AlbertModel(custom_config) # custom model

In [5]:
#downloading from huggingface the pretrained model
pretrained_model = AlbertModel.from_pretrained('albert-base-v2', return_dict=True)
# pretrained_model.save_pretrained('./albert_base_v2')

In [6]:
# masked language modelling head
# this MLM head will be put on top our custom config albert
# and trained on miRNA and mRNA sequences separately.
mlm_head = AlbertMLMHead(custom_config) 

In [7]:
pretrained_model

AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(30000, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (attention_dropout): Dropout(p=0, inplace=False)
      

In [8]:
custom_model

AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(10, 16, padding_idx=0)
    (position_embeddings): Embedding(512, 16)
    (token_type_embeddings): Embedding(2, 16)
    (LayerNorm): LayerNorm((16,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=16, out_features=768, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (attention_dropout): Dropout(p=0, inplace=False)
              

In [9]:
# Since the pretrained model has a vocabulary of 30k
# and embedding size of 128, we need to downscale it
# to our requirements
# Thus, here the embeddings are not pretrained but
# only the main model is. Objective of this is to 
# leverage the latent space of pretrained model.

pretrained_model.resize_token_embeddings(10)
pretrained_model.set_input_embeddings(nn.Embedding(10,16, padding_idx=0))
pretrained_model

AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(10, 16, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (attention_dropout): Dropout(p=0, inplace=False)
          

In [10]:
mlm_head

AlbertMLMHead(
  (LayerNorm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  (dense): Linear(in_features=768, out_features=16, bias=True)
  (decoder): Linear(in_features=16, out_features=10, bias=True)
)

In [15]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import pandas as pd

In [16]:
class Vocabulary:
    def __init__(self):

        self.itos = {0: "[PAD]", 1: "[START]", 2: "[END]", 3: "[UNK]", 4: "[MASK]"}
        self.stoi = {"[PAD]": 0, "[START]": 1, "[END]": 2, "[UNK]": 3, "[MASK]": 4}

    def __len__(self):
        return len(self.itos)

    #returns the numeric token value of a given string token
    def get_idx(self, token):
        return self.stoi[token]
    
    #returns the alphanumeric token value given the token idx
    def get_token(self, token):
        return self.itos[token]

    @staticmethod
    def tokenizer_seq(fasta_seq):
#         print(fasta_seq)
        return [str(x) for x in list(fasta_seq)]

    def build_vocabulary(self):
        frequencies = {}
        idx = len(self.itos)
        for idx1, base in enumerate(list('acgut')):
            self.stoi[base] = idx+idx1
            self.itos[idx+idx1] = base

    def numericalize(self, fasta_seq):
        tokenized_seq = self.tokenizer_seq(fasta_seq.lower())
        return [
            self.stoi[token] if token in self.stoi else self.stoi["[UNK]"]
            for token in tokenized_seq
        ]

In [85]:
class SequenceDataset(Dataset):
    def __init__(self, filename, sep='\t'):
        self.df = pd.read_csv(filename,sep=sep, index_col=0)
#         print(self.df.head())
        # Dataset Column Positions - mRNA  Binding sites with flanking regions
        self.mirna_names = self.df.iloc[:, 0].values
        self.mirna_seqs = self.df.iloc[:, 1].values
        #concatenating row-wise to create a combined vocabulary
#         all_seq = self.mirna[:] + self.mrna
        
        # Initialize vocabulary and build vocab
        self.vocab = Vocabulary()
        self.vocab.build_vocabulary()

    def __len__(self):
        return self.df.shape[0]

    def numericalize_seq(self,seq):
        numericalized_seq = [self.vocab.stoi["[START]"]]
        numericalized_seq += self.vocab.numericalize(seq)
        numericalized_seq.append(self.vocab.stoi["[END]"])
        return numericalized_seq

    def get_vocabulary(self):
        return self.vocab.stoi

    def __getitem__(self, index):
#         print(index, self.mirna_names[index])
        return torch.tensor(self.numericalize_seq(self.mirna_seqs[index]))

In [108]:
class CollateSequences:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        
        seq = [item for item in batch]
#         print(seq)
        seq = pad_sequence(seq, batch_first=True, padding_value=self.pad_idx)

        return seq

In [109]:
batch_size=5
num_workers=1
shuffle=True
pin_memory=True

dataset = SequenceDataset(filename='./Processed MBStar/partitioned/mlm/mlm_mrna_data_chunk_0.txt')

pad_idx = dataset.vocab.stoi["[PAD]"]

loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=CollateSequences(pad_idx=pad_idx)
    )

In [131]:
import numpy as np
for _, a in enumerate(loader):
#     print(a)
#     print(a.shape)
#     print(a.masked_fill(a != 0, value=1))
    
    logits = custom_model(input_ids=a, attention_mask=a.masked_fill(a != 0, value=1))
    logits = mlm_head(logits[0])
    
    print(logits)
    break

tensor([[[ 0.6997,  0.3745,  0.1816,  ...,  1.0446, -0.2789, -0.2224],
         [ 0.6947, -0.0811,  0.0850,  ...,  1.1935, -0.0984, -0.2778],
         [ 0.7763,  0.0597,  0.3298,  ...,  1.4552,  0.0421, -0.1564],
         ...,
         [ 0.5522,  0.1945, -0.0725,  ...,  1.3281,  0.1849, -0.6449],
         [ 0.8187, -0.0764, -0.1441,  ...,  0.8490, -0.4269, -0.1223],
         [ 0.6554,  0.0210,  0.1730,  ...,  1.0638, -0.0494, -0.1477]],

        [[ 0.6382,  0.3641, -0.0614,  ...,  0.9362, -0.1965, -0.4424],
         [ 0.6610, -0.0856, -0.1174,  ...,  1.0929, -0.0645, -0.4428],
         [ 0.6021, -0.0654, -0.0023,  ...,  1.2946,  0.3746, -0.5992],
         ...,
         [ 0.4687,  0.1589, -0.3843,  ...,  1.1282,  0.2986, -0.8939],
         [ 0.5501,  0.0788, -0.3041,  ...,  0.7473, -0.0656, -0.4587],
         [ 0.8362,  0.2290, -0.2045,  ...,  1.0486, -0.3090, -0.5115]],

        [[ 0.6808,  0.4067,  0.0587,  ...,  0.9949, -0.2468, -0.3327],
         [ 0.7219, -0.1294,  0.0785,  ...,  1

In [11]:
from transformers import AlbertConfig, AlbertModel
from transformers.modeling_albert import AlbertMLMHead
from torch.nn import CrossEntropyLoss
import torch
from torch import nn
import numpy as np

class CustomAlbert(pl.LightningModule):
    def __init__(self, vocab_size=10, embedding_size=16, hidden_size=768, num_attention_heads=12, intermediate_size=3072):
        super(CustomAlbert, self).__init__()
        
        self.save_hyperparameters()
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        
        custom_config = AlbertConfig(
            vocab_size=vocab_size, # A, C, T, G, U, UNK, MASK, PAD, CLS, SEP
            embedding_size=embedding_size, #this will be scaled to 32 and 64 for ablation experiments
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
        )
        
        self.custom_model = AlbertModel(custom_config) # custom model
        self.mlm_head = AlbertMLMHead(custom_config) # mlm head
        self.loss = CrossEntropyLoss()

    def forward(
        self, 
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        sentence_order_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        
        outputs = self.custom_model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        
        prediction_scores = self.mlm_head(sequence_output)
        
        return prediction_scores
    
    def training_step(self, batch):
        seq = batch
        attention_mask = seq.masked_fill( seq != 0, value=1)
        logits = self(input_ids=seq, attention_mask=attention_mask)
        
        active_loss = attention_mask.view(-1) == 1
        active_logits = logits.view(-1, self.num_labels)[active_loss]
        active_labels = seq.view(-1)[active_loss]
        
        loss = self.loss(active_logits, active_labels)
        
    def train_dataloader(self):
        dataset = SequenceDataset(filename='./Processed MBStar/partitioned/mlm/mlm_mrna_data_chunk_0.txt')

        pad_idx = dataset.vocab.stoi["[PAD]"]

        loader = DataLoader(
                dataset=dataset,
                batch_size=batch_size,
                num_workers=num_workers,
                shuffle=shuffle,
                pin_memory=pin_memory,
                collate_fn=CollateSequences(pad_idx=pad_idx)
            )
        
        return loader
        

In [None]:
import gc
del [dataset]
gc.collect()