# Finetune BERT for Classification 

BERT can be used to perform many downstream tasks by modifying only few changes in the architecture. BERT performs pretty well in GLUE benchmark. Tasks in GLUE benchmark includes classifications, QA, etc

![title](images/bert-tasks.png)


We can make those changes to architecture ourselves or huggingface has provided modified architecture based on tasks. Modified architecture is simply base bert with some added head on top. 

**Some modified architecture for downstream tasks includes:**

- [BertModel](https://huggingface.co/transformers/model_doc/bert.html#bertmodel):
    The bare Bert Model transformer outputting raw hidden-states without any specific head on top. 
- [BertForPreTraining](https://huggingface.co/transformers/model_doc/bert.html#bertforpretraining): Bert Model with two heads on top as done during the pre-training: a masked language modeling head and a next sentence prediction (classification) head.
- [BertForMaskedLM](https://huggingface.co/transformers/model_doc/bert.html#bertformaskedlm): Bert Model with a language modeling head on top.
- [BertForNextSentencePrediction](https://huggingface.co/transformers/model_doc/bert.html#bertfornextsentenceprediction): Bert Model with a next sentence prediction (classification) head on top. 
- [BertForSequenceClassification](https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification): Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks.
- [BertForMultipleChoice](BertForMultipleChoice): Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
- [BertForTokenClassification](https://huggingface.co/transformers/model_doc/bert.html#bertfortokenclassification): Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
- [BertForQuestionAnswering](https://huggingface.co/transformers/model_doc/bert.html#bertforquestionanswering): Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute span start logits and span end logits). 


In [1]:
import wget
import os

import pytorch_lightning as pl
from pytorch_lightning import Trainer

In [2]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss
from torch import nn
import wget
import os

INFO:transformers.file_utils:PyTorch version 1.2.0 available.
INFO:transformers.file_utils:TensorFlow version 2.1.0 available.


In [3]:
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/av6101604/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_

In [4]:
bert_sequence_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/av6101604/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_

In [5]:
bert_sequence_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [6]:
bert_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

# Input of BERT in the right format
First, we need to get the input of BERT in the right format. BERT requires input to be in a specific format. 
![title](images/bert-embed.png)
![title](images/processing_text_for_bert.png)

**Attention Mask:**
The “Attention Mask” is simply an array of 1s and 0s indicating which tokens are padding and which aren’t (seems kind of redundant, doesn’t it?!). This mask tells the “Self-Attention” mechanism in BERT not to incorporate these PAD tokens into its interpretation of the sentence.

## **Manual appraoch: Steps for getting input in the right format.**
BERT has two constraints:
1. All sentences must be padded or truncated to a single, fixed length.
2. The maximum sentence length is 512 tokens.

**Example of using bert tokenizer**

In [7]:
sentence = "he likes to work with embeedings"
tokens = tokenizer.tokenize(sentence)
tokens

['he', 'likes', 'to', 'work', 'with', 'em', '##bee', '##ding', '##s']

 **Print the sentence mapped to token ids. Ie: String to Integer(stoi)**


In [8]:
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence)))

Token IDs:  [2002, 7777, 2000, 2147, 2007, 7861, 11306, 4667, 2015]


**Print tokens mapped into strings. Ie: Integer to String (itos)**

In [9]:
print('String IDs: ', tokenizer.convert_ids_to_tokens([2002, 7777, 2000, 2147, 2007, 7861, 11306, 4667, 2015]))

String IDs:  ['he', 'likes', 'to', 'work', 'with', 'em', '##bee', '##ding', '##s']


In [10]:
sentence = "he likes to work with embeedings"

# Step 1: Tokenize
tokens = tokenizer.tokenize(sentence)
print("Step 1: ",tokens,"\n")

# Step 2: Add [CLS] and [SEP]
tokens = ['[CLS]'] + tokens + ['[SEP]']
print("Step 2: ",tokens,"\n")

# Step 3: Pad tokens
max_range = 20
padded_tokens = tokens + ['[PAD]' for _ in range(max_range - len(tokens))]
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
print("Step 3: ", padded_tokens,"\n")
print("Step 3: ", attn_mask,"\n")

# Step 4: Segment ids
# Since we're only feeding one sentence and it's not a pair, it will be all zero.
seg_ids = [0 for _ in range(len(padded_tokens))] #Optional!
print("Step 4: ", seg_ids, "\n")

# Step 5: Get BERT vocabulary index for each token
token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
print("Step 5: ", token_ids)

Step 1:  ['he', 'likes', 'to', 'work', 'with', 'em', '##bee', '##ding', '##s'] 

Step 2:  ['[CLS]', 'he', 'likes', 'to', 'work', 'with', 'em', '##bee', '##ding', '##s', '[SEP]'] 

Step 3:  ['[CLS]', 'he', 'likes', 'to', 'work', 'with', 'em', '##bee', '##ding', '##s', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'] 

Step 3:  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] 

Step 4:  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 

Step 5:  [101, 2002, 7777, 2000, 2147, 2007, 7861, 11306, 4667, 2015, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0]


**Feed into our model**

In [11]:

# Convert to pytorch tensors
token_ids = torch.tensor(token_ids).unsqueeze(0)
attn_mask = torch.tensor(attn_mask).unsqueeze(0)
seg_ids = torch.tensor(seg_ids).unsqueeze(0)

# Feed them to bert
hidden_reps, cls_head = bert_model(token_ids, attention_mask = attn_mask, token_type_ids = seg_ids)

## **encode appraoch: Steps for getting input in the right format.**
The transformers library provides a helpful encode function which will handle most of the parsing and data prep steps for us.
**Documentation is [here](https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode)**

In [12]:
sentence = "he likes to work with embeedings"
token_ids = tokenizer.encode(sentence, add_special_tokens=True)
token_ids

[101, 2002, 7777, 2000, 2147, 2007, 7861, 11306, 4667, 2015, 102]

## **encode_plus appraoch: Steps for getting input in the right format.**
The tokenizer.encode_plus function combines multiple steps for us:

1. Split the sentence into tokens.
2. Add the special [CLS] and [SEP] tokens.
3. Map the tokens to their IDs.
4. Pad or truncate all sentences to the same length.
5. Create the attention masks which explicitly differentiate real tokens from [PAD] tokens.

The first four features are in tokenizer.encode, but I’m using tokenizer.encode_plus to get the fifth item (attention masks).
**Documentation is [here.](https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=encode_plus#transformers.PreTrainedTokenizer.encode_plus)**

In [13]:
sentence = "he likes to work with embeedings"
encoded_dict = tokenizer.encode_plus(
                        sentence,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 20,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

token_ids = encoded_dict['input_ids']
attn_mask = encoded_dict['attention_mask']
seg_ids = encoded_dict['token_type_ids']

hidden_reps, cls_head = bert_model(token_ids, attention_mask = attn_mask, token_type_ids = seg_ids)

encoded_dict


{'input_ids': tensor([[  101,  2002,  7777,  2000,  2147,  2007,  7861, 11306,  4667,  2015,
            102,     0,     0,     0,     0,     0,     0,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

# Dataset Class and Data Loaders


In [17]:
print('Downloading dataset...')

# The URL for the dataset zip file.
url = 'https://raw.githubusercontent.com/theneuralbeing/bert-finetuning-webinar/master/data.zip'

# Download the file and unzip it (if we haven't already)
if not os.path.exists('./data.zip'):
    wget.download(url, './data.zip')
    !unzip -q data.zip
    print('Unzipped Dataset')

Downloading dataset...


In [14]:
class LoadDataset(Dataset):

    def __init__(self, filename, maxlen):

        # Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter=',')

        # Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Define the Maxlength for padding/truncating
        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        # Selecting the sentence and label at the specified index in the data frame
#         sentence = self.df.loc[index, 'review']
#         label = self.df.loc[index, 'sentiment']
        
#         encoded_dict = self.tokenizer.encode_plus(
#                         sentence,                      # Sentence to encode.
#                         add_special_tokens = True, # Add '[CLS]' and '[SEP]'
#                         max_length = self.maxlen,           # Pad & truncate all sentences.
#                         pad_to_max_length = True,
#                         return_attention_mask = True,   # Construct attn. masks.
#                         return_tensors = 'pt',     # Return pytorch tensors.
#                    )
#         return encoded_dict['input_ids'], encoded_dict['attention_mask'], label

        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'review']
        label = self.df.loc[index, 'sentiment']

        # Tokenize the sentence
        tokens = self.tokenizer.tokenize(sentence)

        # Inserting the CLS and SEP token at the beginning and end of the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        
        # Padding/truncating the sentences to the maximum length
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]']
        
        # Convert the sequence to ids with BERT Vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        
        # Converting the list to a pytorch tensor
        tokens_ids_tensor = torch.tensor(tokens_ids)

        # Obtaining the attention mask
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, label



In [15]:

# Creating instances of training and validation set
train_set = LoadDataset(filename = 'data/train.csv', maxlen = 64)
val_set = LoadDataset(filename = 'data/validation.csv', maxlen = 64)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/av6101604/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/av6101604/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [16]:
# Creating intsances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size = 32, num_workers = 5)
val_loader = DataLoader(val_set, batch_size = 32, num_workers = 5)

In [17]:
bert_layer = BertModel.from_pretrained('bert-base-uncased')

for i, (seq, attn_masks, labels) in enumerate(train_loader):
    hidden_reps, _ = bert_layer(seq, attn_masks)
    
    break


INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/av6101604/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_

# Building the model using Pytorch Lighntning

In [18]:
class SentimentClassifier(pl.LightningModule):
    def __init__(self, freeze_bert = True):
        super(SentimentClassifier, self).__init__()

        # Instantiating the BERT model object 
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        # Defining layers like dropout and linear
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 1)
        
        # loss function
        self.criterion = BCEWithLogitsLoss()

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        # Getting contextualized representations from BERT Layer
        cont_reps, _ = self.bert_layer(seq, attention_mask = attn_masks)

        # Obtaining the representation of [CLS] head
        cls_rep = cont_reps[:, 0]
        # print('CLS shape: ',cls_rep.shape)

        # Feeding cls_rep to the classifier layer
        logits = self.classifier(cls_rep)
        # print('Logits shape: ',logits.shape)

        return logits
    
    def train_dataloader(self):
        return train_loader
    
    def val_dataloader(self):
        return val_loader
    
    def logits_accuracy(self, logits, labels):
        probs = torch.sigmoid(logits.unsqueeze(-1))
        preds = (probs > 0.5).long()
        acc = (preds.squeeze() == labels).float().mean()
        return acc
    
    def configure_optimizers(self):
        return Adam(model.parameters(), lr = 2e-5)
    
    def training_step(self, batch, batch_idx):
        # data from batch is same as 'next(enumerate(train_loader))[1]'
        seq = batch[0]
        attn_masks = batch[1]
        labels = batch[2]
        
        logits = model(seq, attn_masks)
        
        loss = self.criterion(logits.squeeze(-1), labels.float())
        
        output = {
            'loss': loss, # required
            'progress_bar': {'training_loss':loss},
        }
        
        return output
    
    def training_epoch_end(self, outputs):
        epoch_acc = 0
        for loss in outputs:
            epoch_acc += loss['loss']
        epoch_acc /= len(outputs)
        
        result = {
            'log': {'train_epoch_acc':epoch_acc}
        }
        
        return result
    
    def validation_step(self, batch, batch_idx):
        seq = batch[0]
        attn_masks = batch[1]
        labels = batch[2]
        
        val_logits = model(seq, attn_masks)

        loss = self.criterion(val_logits.squeeze(-1), labels.float())
        
        val_acc = self.logits_accuracy(val_logits, labels)
        
        return {
            'loss': loss,
            'val_acc':val_acc
        }
    
    def validation_epoch_end(self, outputs):
        epoch_loss = 0
        epoch_acc = 0
        
        for loss in outputs:
            epoch_acc += loss['val_acc']
        epoch_acc /= len(outputs)
        
        for loss in outputs:
            epoch_loss += loss['loss']   
        epoch_loss /= len(outputs)
        
        # Show this data into progress bar
        tqdm_dict = {'val_epoch_acc': epoch_acc, 'val_epoch_loss':epoch_loss}

        results = {
        'progress_bar': tqdm_dict,
        'log': {'val_acc': epoch_acc}
        }
        return results
        
        
  

In [19]:
model = SentimentClassifier()
trainer = Trainer(gpus=1, max_epochs=1)#max_epochs=4)
trainer.fit(model)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/av6101604/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
INFO:transformers.configuration_utils:Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_

HBox(children=(FloatProgress(value=0.0, description='Validation sanity check', layout=Layout(flex='2'), max=5.…



HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …



HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=782.0, style=Pr…






1

# Predict

In [20]:
def preprocess(sentence, maxlen=64):

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenize the sentence
    tokens = tokenizer.tokenize(sentence)

    # Inserting the CLS and SEP token at the beginning and end of the sentence
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    
    # Padding/truncating the sentences to the maximum length
    if len(tokens) < maxlen:
        tokens = tokens + ['[PAD]' for _ in range(maxlen - len(tokens))]
    else:
        tokens = tokens[:maxlen-1] + ['[SEP]']
    
    # Convert the sequence to ids with BERT Vocabulary
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    
    # Converting the list to a pytorch tensor
    tokens_ids_tensor = torch.tensor(tokens_ids).unsqueeze(0)

    # Obtaining the attention mask
    attn_mask = (tokens_ids_tensor != 0).long()

    return tokens_ids_tensor, attn_mask

In [21]:
# Defining an evaluation function for training 
def predict(net, iseq, masks):
    device = 'cpu'
    # Setting model to evaluation mode
    net.eval()

    # Move inputs and targets to device
    iseq, masks = iseq.to(device), masks.to(device)

    # Get logit predictions
    p_logit = net(iseq, masks)

    probs = torch.sigmoid(p_logit.unsqueeze(-1))
    preds = (probs > 0.5).long().squeeze(0)

   
    return preds, probs

In [22]:
test_tokens, test_attn = preprocess('the literally love this movie ever')


INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/av6101604/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [25]:
model = model.to('cpu')
pred, probability = predict(model, test_tokens, test_attn)
print(pred, probability)

tensor([[1]]) tensor([[[0.9665]]], grad_fn=<SigmoidBackward>)
