# Model 
This notebook is used for designing the custom Albert model. The model is also being tested as sanity check. The custom model is implemented as a Pytorch Lightning module.

**Current Tasks Remaining**:-
1. Implement the training_step(), configure_optimizer().
2. Modify the dataloaders. 
3. sanity Check run.
4. Increase MLM data size.

The forward function needs to have all the parameters mentioned at this [link](https://github.com/huggingface/transformers/blob/master/src/transformers/models/albert/modeling_albert.py#L640). The output from this model will then be passed into the MLM head of the Albert and the loss used will be **CrossEntropyLoss** with **Adam** Optimizer.

### Note:
Will also need to check the batch size permissible along with the gpus option on PL.

In [2]:
from transformers import AlbertConfig, AlbertModel
from transformers.modeling_albert import AlbertMLMHead
import torch
from torch import nn
import pytorch_lightning as pl

## Defining custom Albert Config

Refer to [this](https://github.com/huggingface/transformers/blob/48cc224703a8dd8d03d2721c8651fea8704d994b/src/transformers/models/albert/configuration_albert.py#L33) link to understand the meaning of the parameters

In [3]:
# Hyperparameters for our model
vocab_size = 10
embedding_size = 16
hidden_size = 768
num_attention_heads = 12
intermediate_size = 3072


In [4]:
# Defining Custom Albert Model config
custom_config = AlbertConfig(
    vocab_size=vocab_size, # A, C, T, G, U, UNK, MASK, PAD, CLS, SEP
    embedding_size=embedding_size, #this will be scaled to 32 and 64 for ablation experiments
    hidden_size=hidden_size,
    num_attention_heads=num_attention_heads,
    intermediate_size=intermediate_size,
)

In [5]:
custom_model = AlbertModel(custom_config) # custom model

In [6]:
#downloading from huggingface the pretrained model
pretrained_model = AlbertModel.from_pretrained('albert-base-v2', return_dict=True)
# pretrained_model.save_pretrained('./albert_base_v2')

In [7]:
# masked language modelling head
# this MLM head will be put on top our custom config albert
# and trained on miRNA and mRNA sequences separately.
mlm_head = AlbertMLMHead(custom_config) 

In [8]:
pretrained_model

AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(30000, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (attention_dropout): Dropout(p=0, inplace=False)
      

In [9]:
custom_model

AlbertModel(
  (embeddings): AlbertEmbeddings(
    (word_embeddings): Embedding(10, 16, padding_idx=0)
    (position_embeddings): Embedding(512, 16)
    (token_type_embeddings): Embedding(2, 16)
    (LayerNorm): LayerNorm((16,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0, inplace=False)
  )
  (encoder): AlbertTransformer(
    (embedding_hidden_mapping_in): Linear(in_features=16, out_features=768, bias=True)
    (albert_layer_groups): ModuleList(
      (0): AlbertLayerGroup(
        (albert_layers): ModuleList(
          (0): AlbertLayer(
            (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (attention): AlbertAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (attention_dropout): Dropout(p=0, inplace=False)
              

In [10]:
# Since the pretrained model has a vocabulary of 30k
# and embedding size of 128, we need to downscale it
# to our requirements
# Thus, here the embeddings are not pretrained but
# only the main model is. Objective of this is to 
# leverage the latent space of pretrained model.

# pretrained_model.resize_token_embeddings(10)
# pretrained_model.set_input_embeddings(nn.Embedding(10,16, padding_idx=0))
# pretrained_model

In [11]:
mlm_head

AlbertMLMHead(
  (LayerNorm): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  (dense): Linear(in_features=768, out_features=16, bias=True)
  (decoder): Linear(in_features=16, out_features=10, bias=True)
)

## Creating Text Dataset and Dataloader
In this section, we create the **Dataset** and **Dataloader** to be used in our training and associated tasks. Since our vocabulary is limited, we are not using any packages like ***SpaCy***. We create a vocabulary set and our own tokenizer for our models.  

In [12]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import pandas as pd

In [13]:
class Vocabulary:
    def __init__(self):

        self.itos = {0: "[PAD]", 1: "[START]", 2: "[END]", 3: "[UNK]", 4: "[MASK]"}
        self.stoi = {"[PAD]": 0, "[START]": 1, "[END]": 2, "[UNK]": 3, "[MASK]": 4}

    def __len__(self):
        return len(self.itos)

    #returns the numeric token value of a given string token
    def get_idx(self, token):
        return self.stoi[token]
    
    #returns the alphanumeric token value given the token idx
    def get_token(self, token):
        return self.itos[token]

    @staticmethod
    def tokenizer_seq(fasta_seq):
#         print(fasta_seq)
        return [str(x) for x in list(fasta_seq)]

    def build_vocabulary(self):
        frequencies = {}
        idx = len(self.itos)
        for idx1, base in enumerate(list('acgut')):
            self.stoi[base] = idx+idx1
            self.itos[idx+idx1] = base

    def numericalize(self, fasta_seq):
        tokenized_seq = self.tokenizer_seq(fasta_seq.lower())
        return [
            self.stoi[token] if token in self.stoi else self.stoi["[UNK]"]
            for token in tokenized_seq
        ]

In [14]:
class SequenceDataset(Dataset):
    def __init__(self, filename, sep='\t'):
        self.df = pd.read_csv(filename,sep=sep, index_col=0)
#         print(self.df.head())
        # Dataset Column Positions - mRNA  Binding sites with flanking regions
        self.mirna_names = self.df.iloc[:, 0].values
        self.mirna_seqs = self.df.iloc[:, 1].values
        #concatenating row-wise to create a combined vocabulary
#         all_seq = self.mirna[:] + self.mrna
        
        # Initialize vocabulary and build vocab
        self.vocab = Vocabulary()
        self.vocab.build_vocabulary()

    def __len__(self):
        return self.df.shape[0]

    def numericalize_seq(self,seq):
        numericalized_seq = [self.vocab.stoi["[START]"]]
        numericalized_seq += self.vocab.numericalize(seq)
        numericalized_seq.append(self.vocab.stoi["[END]"])
        return numericalized_seq

    def get_vocabulary(self):
        return self.vocab.stoi

    def __getitem__(self, index):
#         print(index, self.mirna_names[index])
        return torch.tensor(self.numericalize_seq(self.mirna_seqs[index]))

In [15]:
class CollateSequences:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def __call__(self, batch):
        
        seq = [item for item in batch]
#         print(seq)
        seq = pad_sequence(seq, batch_first=True, padding_value=self.pad_idx)

        return seq

In [16]:
batch_size=5
num_workers=1
shuffle=True
pin_memory=True

dataset = SequenceDataset(filename='./Processed MBStar/partitioned/mlm/mlm_mrna_data_chunk_0.txt')

pad_idx = dataset.vocab.stoi["[PAD]"]

loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=CollateSequences(pad_idx=pad_idx)
    )

  mask |= (ar1 == a)


In [17]:
#testing dataloader
import numpy as np
for _, a in enumerate(loader):
    print(a)
    print(a.shape)
    print(a.masked_fill(a != 0, value=1))
    
    print(logits)
    break

tensor([[[ 0.0098, -1.1421, -0.0952,  ..., -0.6524,  0.0876,  0.9149],
         [ 0.0165, -1.1618, -0.2238,  ..., -0.7277, -0.1072,  0.8638],
         [-0.0869, -1.1878, -0.1202,  ..., -0.7930, -0.0861,  1.0118],
         ...,
         [-0.1806, -1.2484, -0.0447,  ..., -0.6611,  0.1465,  1.0092],
         [-0.0567, -1.1168, -0.3301,  ..., -0.6464, -0.0097,  0.8886],
         [-0.1023, -0.9794, -0.2897,  ..., -0.6191,  0.0302,  0.9009]],

        [[-0.0836, -1.1591, -0.0430,  ..., -0.6884,  0.0159,  1.0127],
         [-0.2882, -0.9896, -0.2202,  ..., -0.8375, -0.3605,  1.0494],
         [-0.1954, -1.2341, -0.1343,  ..., -0.9152, -0.2301,  1.1028],
         ...,
         [-0.2595, -1.3284,  0.3073,  ..., -0.7161, -0.0194,  1.1922],
         [-0.3094, -1.2860,  0.3828,  ..., -0.8379, -0.3138,  1.1856],
         [-0.1684, -0.9976, -0.4257,  ..., -0.6289, -0.0691,  0.9643]],

        [[-0.1534, -1.0976,  0.0412,  ..., -0.7163, -0.1666,  1.0131],
         [-0.1998, -1.0464, -0.0654,  ..., -0

## Masked Language Modelling using Albert
This section demos how to use the original Albert model with the original Tokenizer for masked language modelling task. The mask token used in this case is \[MASK\]. The model is tasked with predicting what this word might be. 
The model outputs logits and losses. The logits are at position 0 in the output. The key \'last_hidden_state\' can be used to extract them as well. This logit is sent through a **Masked Language Modelling Head** which outputs a probability for each of the words from the vocabulary to be placed instead of the \[MASK\]. The maximum from this output is the token for the word. This is found by using **torch.argmax** on the array elememt in the output at the position of the  \[MASK\] in the second dimension (output dimension is ***\[batch_size, number_of_token_in_input, number_of_words_in_vocabulary\]*** )

#### Note:
Since an untrained MLM HEAD is used, the output may be arbitrary.

In [9]:
#loading the pretrained original tokenizer used for albert-base-v2
from transformers import AlbertTokenizer
from transformers import AlbertConfig, AlbertModel
from transformers.modeling_albert import AlbertMLMHead
import torch

In [2]:
pretrained_model = AlbertModel.from_pretrained('albert-base-v2', return_dict=True) #pretrained model
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') #tokenizer

#This is the Masked Language Modelling HEAD for original Albert
albert_base_configuration = AlbertConfig(
      hidden_size=768,
      num_attention_heads=12,
      intermediate_size=3072,
  )
default_mlm_head = AlbertMLMHead(albert_base_configuration) #default mlm head
default_mlm_head

AlbertMLMHead(
  (LayerNorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (dense): Linear(in_features=768, out_features=128, bias=True)
  (decoder): Linear(in_features=128, out_features=30000, bias=True)
)

In [7]:
#change the sentance as long as it is within 100 words
#use only 1 [MASK]
sentence = 'This is a [MASK] project' 
encoded_seq = tokenizer.encode(sentence)
print("Encoded Sequence-", encoded_seq)
print("ID for [MASK] token -",tokenizer.convert_ids_to_tokens(4))
mask_index = encoded_seq.index(4)
print("[MASK] is at-", mask_index+1)

Encoded Sequence- [2, 48, 25, 21, 4, 669, 3]
ID for [MASK] token - [MASK]
[MASK] is at- 5


In [10]:
"""
Consult https://github.com/huggingface/transformers/blob/master/src/transformers/models/albert/modeling_albert.py#L698
for understanding output format
"""

test_output = pretrained_model(torch.tensor(encoded_seq).unsqueeze(0))
print("Keys- ",test_output.keys())
print("Sequence Output-",test_output.last_hidden_state)
print("Sequence Output Size-",test_output.last_hidden_state.size())


Keys-  odict_keys(['last_hidden_state', 'pooler_output'])
Sequence Output- tensor([[[ 0.8959,  0.7075,  0.7822,  ..., -0.5718,  0.8908,  0.2283],
         [ 1.4882, -1.0936,  0.3678,  ..., -0.2871,  2.6546, -1.4538],
         [ 1.3232, -0.9379, -0.5643,  ...,  0.7148,  0.4633, -0.5228],
         ...,
         [-0.4393, -0.3515, -1.2364,  ..., -0.2335,  0.6026, -1.3187],
         [-0.3884, -0.5458,  0.5886,  ...,  0.2667,  0.8635, -0.4085],
         [ 0.0666,  0.1381, -0.0645,  ..., -0.0783,  0.1365,  0.1967]]],
       grad_fn=<NativeLayerNormBackward>)
Sequence Output Size- torch.Size([1, 7, 768])


In [11]:
prediction_scores = default_mlm_head(test_output.last_hidden_state)
print("Prediction Scores Dimensions-",prediction_scores.size())
print("Predicted Token for [MASK]-",torch.argmax(prediction_scores[0, mask_index]))

Prediction Scores Dimensions- torch.Size([1, 7, 30000])
Predicted Token for [MASK]- tensor(27047)


In [12]:
predicted_word = tokenizer.convert_ids_to_tokens(torch.argmax(prediction_scores[0, mask_index]).item())
print("Predicted - ", predicted_word)

Predicted -  ▁sykes


## Designing Model
In this section, the custom AlBERT model is being defined which will be trained on MLM on sequences. The model is a *PyTorch Lightning* model. We are going to use CrossEntropyLoss and Adam Optimizer with Weight Decay.
Consult [this](https://huggingface.co/transformers/_modules/transformers/optimization.html#AdamW) and [this](https://huggingface.co/transformers/main_classes/optimizer_schedules.html) for AdamW and Optimization Schedules available through HuggingFace Transformers respectively.

In [15]:
from transformers import AlbertConfig, AlbertModel
from transformers.modeling_albert import AlbertMLMHead
from transformers import AdamW
from torch.nn import CrossEntropyLoss
import torch
from torch import nn
import numpy as np
import pytorch_lightning as pl

class CustomAlbert(pl.LightningModule):
    def __init__(
        self,
        vocab_size=10,
        embedding_size=16,
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072):
        
        super(CustomAlbert, self).__init__()
        
        self.save_hyperparameters()
        
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        
        custom_config = AlbertConfig(
            vocab_size=vocab_size, # A, C, T, G, U, UNK, MASK, PAD, CLS, SEP
            embedding_size=embedding_size, #this will be scaled to 32 and 64 for ablation experiments
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
        )
        
        self.custom_model = AlbertModel(custom_config) # custom model
        self.mlm_head = AlbertMLMHead(custom_config) # mlm head
        self.loss = CrossEntropyLoss()
        

    def forward(
        self, 
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        sentence_order_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        
        outputs = self.custom_model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        
        prediction_scores = self.mlm_head(sequence_output)
        
        return prediction_scores
    
    def configure_optimizers(self):
        self.optimizer = AdamW(self.parameters(), lr=1e-3) #default learning rate is 1e-3
        
        return self.optimizer
    
    def training_step(self, batch):
        seq = batch
        attention_mask = seq.masked_fill( seq != 0, value=1)
        logits = self(input_ids=seq, attention_mask=attention_mask)
        
        # For loss calculation, only sequence-based loss and not the loss from pads 
        # is taken into account. This is found by using the indices where
        # attention_mask element is 1.
        active_loss = attention_mask.view(-1) == 1 
        active_logits = logits.view(-1, self.num_labels)[active_loss] 
        active_labels = seq.view(-1)[active_loss]
        
        loss = self.loss(active_logits, active_labels)
        
        return loss
        
    def train_dataloader(self):
        dataset = SequenceDataset(filename='./Processed MBStar/partitioned/mlm/mlm_mrna_data_chunk_0.txt')

        pad_idx = dataset.vocab.stoi["[PAD]"]

        loader = DataLoader(
                dataset=dataset,
                batch_size=batch_size,
                num_workers=num_workers,
                shuffle=shuffle,
                pin_memory=pin_memory,
                collate_fn=CollateSequences(pad_idx=pad_idx)
            )
        
        return loader
        