In [None]:
!pip install transformers

In [None]:
!pip install allennlp

In [None]:
import numpy as np
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
from transformers import AutoTokenizer

In [None]:
class BERTClassifier(nn.Module):
    """Pretrained transformer encoder followed by dropout, a linear layer and a ReLU."""

    def __init__(self, pretrained_model_name, n_class, emb_dim=768, dropout=0.5):
        """
        Define the BERT-based model for finetuning on specific datasets.

        Args:
            pretrained_model_name (str): name of the pretrained model in the huggingface library model card. Possible options: roberta-base, bert-base-uncased, albert-base-v2, etc.
            n_class (int): defines the number of classes for the classification task.
            emb_dim (int): input dimension of the last linear layer; it must match the size of the encoder's pooled output. Default is 768.
            dropout (float): defines the dropout rate applied before the last linear layer. Default is 0.5.

        """

        # Zero-argument super() keeps the call correct regardless of the class name.
        super().__init__()

        self.bert = AutoModel.from_pretrained(pretrained_model_name)  # OPTIONS: roberta-base, bert-base-uncased, albert-base-v2, etc.
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(emb_dim, n_class)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """
        Forward the BERT-based model for finetuning on specific datasets.

        Args:
            input_id (torch.Tensor): input ids for the model, obtained from the corresponding pretrained tokenizer.
            mask (torch.Tensor): attention mask for the model, obtained from the corresponding pretrained tokenizer.

        Returns:
            final_layer (torch.Tensor): the ReLU-activated output of the linear layer, used for calculating the loss and the accuracy.
            attention: the attention output returned by the encoder (with ``output_attentions=True`` huggingface encoders return one tensor per layer).
            pooled_output (torch.Tensor): the pooled output of the encoder, used for feature extraction for interpretation.

        """

        # With return_dict=False the encoder returns a plain tuple. The unpacking
        # below requires exactly three elements: hidden states (unused here),
        # the pooled output and the attentions.
        _, pooled_output, attention = self.bert(
            input_ids=input_id,
            attention_mask=mask,
            return_dict=False,
            output_attentions=True,
        )
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU clamps negative scores to zero before the loss;
        # kept as-is because downstream code may rely on it -- confirm intent.
        final_layer = self.relu(linear_output)

        return final_layer, attention, pooled_output

In [None]:

class XLNetClassifier(nn.Module):
    """Pretrained XLNet encoder followed by dropout, a linear layer and a ReLU."""

    def __init__(self, n_class, emb_dim=768, dropout=0.5):
        """
        Define the XLNet model for finetuning on specific datasets.

        Args:
            n_class (int): defines the number of classes for the classification task.
            emb_dim (int): input dimension of the last linear layer; it must match the hidden size of the pretrained model. Default is 768.
            dropout (float): defines the dropout rate applied before the last linear layer. Default is 0.5.

        """

        super().__init__()

        self.bert = AutoModel.from_pretrained('xlnet-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Honour the documented emb_dim argument instead of a hard-coded 768.
        self.linear = nn.Linear(emb_dim, n_class)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """
        Forward the XLNet model for finetuning on specific datasets.

        Args:
            input_id (torch.Tensor): input ids for the model, obtained from the corresponding pretrained tokenizer.
            mask (torch.Tensor): attention mask for the model, obtained from the corresponding pretrained tokenizer.

        Returns:
            final_layer (torch.Tensor): the ReLU-activated output of the linear layer, used for calculating the loss and the accuracy.
            attention: the last element of the encoder output tuple (the attentions requested via ``output_attentions=True``).
            pooled_output (torch.Tensor): the hidden state at the last sequence position, used for feature extraction for interpretation.

        """

        bert_output = self.bert(
            input_ids=input_id,
            attention_mask=mask,
            return_dict=False,
            output_attentions=True,
        )
        output = bert_output[0]
        attention = bert_output[-1]
        # Indexing the sequence axis already yields a 2-D tensor. A blanket
        # squeeze here would also drop the batch axis when the batch size is 1.
        # NOTE(review): using the last position presumes left padding -- confirm
        # against the tokenizer configuration.
        pooled_output = output[:, -1, :]

        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer, attention, pooled_output

In [None]:
from allennlp.modules.elmo import Elmo, batch_to_ids
class ELMoClassifier(nn.Module):
    """Pretrained ELMo embedder followed by mean pooling, dropout, a linear layer and a ReLU."""

    def __init__(self, n_class, dropout=0.5):
        """
        Define the ELMo model for finetuning on specific datasets.

        Args:
            n_class (int): defines the number of classes for the classification task.
            dropout (float): defines the dropout rate applied before the last linear layer. Default is 0.5.

        """

        super().__init__()
        options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

        # Compute two different representations for each token.
        # Each representation is a linear weighted combination of the
        # 3 layers in ELMo (i.e., charcnn, the outputs of the two BiLSTM).

        self.bert = Elmo(options_file, weight_file, 2, dropout=0)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(1024, n_class)
        self.relu = nn.ReLU()

    def forward(self, sentences):
        """
        Forward the ELMo model for finetuning on specific datasets.

        Args:
            sentences (List[List[str]]): batch of tokenized sentences, passed to ``batch_to_ids``.

        Returns:
            final_layer (torch.Tensor): the ReLU-activated output of the linear layer, used for calculating the loss and the accuracy.
            None: placeholder so the output format matches the other predefined finetuned models (no attention weights are produced here).
            pooled_output (torch.Tensor): the second ELMo representation averaged over dim 1, used for feature extraction for interpretation.

        """
        # Follow the device the model's weights live on rather than guessing from
        # CUDA availability, so a CPU-resident model also works on a GPU machine.
        device = self.linear.weight.device
        character_ids = batch_to_ids(sentences).to(device)
        embeddings = self.bert(character_ids)
        # NOTE(review): the mean covers every position along dim 1, so padded
        # positions (if batch_to_ids pads shorter sentences) are included --
        # confirm this is intended.
        pooled_output = torch.mean(embeddings['elmo_representations'][1].to(device), dim=1)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer, None, pooled_output