# BERT Practical

Practical work on BERT for the course Natural Language Processing in M2 MoSIG

> Author: Archit YADAV

## 1. IMPORT MODULES

In [None]:
!pip install transformers
!pip install torch

In [2]:
import torch
import transformers

# Managing arrays
import numpy as np

In [3]:
# load the TensorBoard notebook extension
# %load_ext tensorboard

# Select the compute device once. `device` is assigned on both branches so
# that later cells can rely on it whether or not a GPU is present.
if torch.cuda.is_available():
  print("GPU is available.")
  device = torch.device("cuda", torch.cuda.current_device())
else:
  print("Will work on CPU.")
  device = torch.device("cpu")

GPU is available.


## 2. DATA

### 2.1 Downloading of Data

In [4]:
from sklearn.datasets import fetch_20newsgroups

categories = [
 'comp.windows.x',
 'sci.med',
 'soc.religion.christian',
 'talk.politics.guns',
]

# Both splits are fetched with the same settings; the files are downloaded
# only if they are not already present in data_home.
fetch_kwargs = dict(
    categories=categories,
    shuffle=True,
    random_state=42,
    data_home='./scikit_learn_data',
)
trainset = fetch_20newsgroups(subset='train', **fetch_kwargs)
testset = fetch_20newsgroups(subset='test', **fetch_kwargs)

# Inputs (raw post texts) and labels (category indices) for each split
x_train, y_train = trainset.data, trainset.target
x_test, y_test = testset.data, testset.target

# Summary: total size, then per-split size and per-category counts
print('Dataset size: \n{} posts in total'.format(len(x_train) + len(x_test)))
for header, posts, targets in (
    ('\t {} training posts', x_train, y_train),
    ('\t {} testing posts', x_test, y_test),
):
  print(header.format(len(posts)))
  for class_id, class_name in enumerate(categories):
    print("\t\t {} {}".format(sum(targets == class_id), class_name))



# print('\n')
# print('EXAMPLE: \n')
# print(x_train[0])



Dataset size: 
3885 posts in total
	 2332 training posts
		 593 comp.windows.x
		 594 sci.med
		 599 soc.religion.christian
		 546 talk.politics.guns
	 1553 testing posts
		 395 comp.windows.x
		 396 sci.med
		 398 soc.religion.christian
		 364 talk.politics.guns


### 2.2 Cleaning of Data

In [5]:
def clean_post(post: str, remove_start: tuple) -> str:
    """Drop every line of ``post`` that starts with one of the given prefixes.

    Args:
        post: Text of a forum post.
        remove_start: A prefix, or a collection of prefixes (tuple, list,
            set, ...). Matching is case-sensitive and is applied to every
            line of the post, not only to a leading header section.

    Returns:
        The lines that were kept, joined with a newline character. Line
        splitting uses ``str.splitlines``, so a trailing newline of the
        input is not preserved.
    """
    # str.startswith only accepts a str or a tuple of str, so normalise any
    # other collection (e.g. a list) to a tuple first.
    if isinstance(remove_start, str):
        prefixes = (remove_start,)
    else:
        prefixes = tuple(remove_start)
    return '\n'.join(
        line for line in post.splitlines() if not line.startswith(prefixes)
    )
    

# Line prefixes to strip from every post (message headers and quote attributions)
remove_start = (
  'From:',
  'Subject:',
  'Reply-To:',
  'In-Reply-To:',
  'Nntp-Posting-Host:',
  'Organization:',
  'X-Mailer:',
  'In article <',
  'Lines:',
  'NNTP-Posting-Host:',
  'Summary:',
  'Article-I.D.:'
)

# Apply the same cleaning to the training and the testing posts
x_train, x_test = [
    [clean_post(post, remove_start) for post in split]
    for split in (x_train, x_test)
]


## 3. TOKENISATION

In [None]:
from transformers import DistilBertTokenizer

MAX_LEN = 512

# Padding and truncation are options of the tokenization call itself, so they
# are given where the tokenizer is called rather than to from_pretrained().
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Let's check out how the tokenizer works
for n in range(3):
    # Tokenize forum post, truncated to the maximum length used in this notebook
    tokenizer_out = tokenizer(x_train[n], truncation=True, max_length=MAX_LEN)
    # Convert numerical tokens to alphabetical tokens
    encoded_tok = tokenizer.convert_ids_to_tokens(tokenizer_out.input_ids)
    # Decode tokens back to string
    decoded = tokenizer.decode(tokenizer_out.input_ids)
    print(tokenizer_out)
    print(encoded_tok, '\n')
    print(decoded, '\n')
    print('---------------- \n')



In [7]:
from torch.utils.data import Dataset, DataLoader

MAX_LEN = 512

class PostsDataset(Dataset):
    """Dataset that tokenizes one post on demand and pairs it with its label.

    Args:
        posts: Indexable collection of posts (each is converted with ``str``).
        labels: Indexable collection of integer class labels, aligned with ``posts``.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            and ``'attention_mask'`` entries.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, posts, labels, tokenizer, max_len):
        # Variables that are set when the class is instantiated
        self.posts = posts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of posts."""
        return len(self.posts)

    def __getitem__(self, item):
        """Return the tokenized post at index ``item`` together with its label.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            flattened tokenizer outputs) and ``'label'`` (a ``torch.long``
            scalar tensor).
        """
        # Select the post and its category
        post = str(self.posts[item])
        label = self.labels[item]
        # Tokenize the post. `padding='max_length'` replaces the deprecated
        # `pad_to_max_length=True` and pads every sequence up to max_length.
        tokenizer_out = self.tokenizer(
            post,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
            )
        # Return a dictionary with the output of the tokenizer and the label;
        # flatten() removes the batch dimension added by return_tensors='pt'.
        return  {
            'input_ids': tokenizer_out['input_ids'].flatten(),
            'attention_mask': tokenizer_out['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


# One PostsDataset per split, sharing the tokenizer and the maximum length
train_dataset = PostsDataset(
    posts=x_train, labels=y_train, tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = PostsDataset(
    posts=x_test, labels=y_test, tokenizer=tokenizer, max_len=MAX_LEN)

## 4. MODEL

In [8]:

from transformers import DistilBertModel

PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
distilbert = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

first_post = train_dataset[0]

# This forward pass is only inspected, never back-propagated through, so
# gradient tracking is switched off to avoid building an autograd graph.
with torch.no_grad():
    hidden_state = distilbert(
        # unsqueeze(0) adds the batch dimension expected by the model
        input_ids=first_post['input_ids'].unsqueeze(0),
        attention_mask=first_post['attention_mask'].unsqueeze(0)
        )

# Element 0 of the model output holds the hidden states: (batch, seq_len, dim)
print(hidden_state[0].shape)
print(distilbert.config)



Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([1, 512, 768])
DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.16.1",
  "vocab_size": 30522
}



In [None]:
from transformers import DistilBertPreTrainedModel, DistilBertConfig


PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'

class DistilBertForPostClassification(DistilBertPreTrainedModel):
    """DistilBERT encoder topped with a single linear layer that classifies posts.

    Args:
        config: DistilBERT configuration; ``config.dim`` sets the classifier's
            input size.
        num_labels: Number of target classes.
        freeze_encoder: When True, the encoder weights are excluded from
            gradient updates and only the classifier is trained.
    """

    def __init__(self, config, num_labels, freeze_encoder=False):
        super().__init__(config)
        self.num_labels = num_labels
        # The encoder is created from the pretrained checkpoint named by
        # PRE_TRAINED_MODEL_NAME every time the class is instantiated.
        self.encoder = DistilBertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        if freeze_encoder:
            # (Q1) Turn off gradients for all encoder parameters
            self.encoder.requires_grad_(False)
        # Classification head: one feed-forward layer on top of the encoder output
        self.classifier = torch.nn.Linear(config.dim, self.num_labels, bias=True)
        # Dropout applied to the classifier's input
        self.dropout = torch.nn.Dropout(p=0.1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        """Encode a batch and classify each sequence from its first token.

        Returns:
            A tuple ``(logits, *extra_encoder_outputs)``; when ``labels`` is
            given, the cross-entropy loss is prepended as the first element.
        """
        encoded = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        # (Q2) Keep only the representation of the first token of each
        # sequence: (bs, seq_len, dim) -> (bs, dim)
        first_token_state = encoded[0][:, 0]
        # Dropout, then the linear head: (bs, dim) -> (bs, num_labels)
        logits = self.classifier(self.dropout(first_token_state))

        outputs = (logits,) + encoded[1:]
        if labels is None:
            return outputs  # logits, (hidden_states), (attentions)

        # (Q3) Cross-entropy between the logits and the gold labels
        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
        return (loss,) + outputs  # loss, logits, (hidden_states), (attentions)


# Two classifiers built from the same configuration:
# `model` keeps the encoder frozen, `model_unfreezed` trains every weight.
model = DistilBertForPostClassification(
    config=distilbert.config,
    num_labels=len(categories),
    freeze_encoder=True,
)
model_unfreezed = DistilBertForPostClassification(
    config=distilbert.config,
    num_labels=len(categories),
    freeze_encoder=False,
)

# Parameter statistics of the frozen-encoder model
total_params = sum(weight.numel() for weight in model.parameters())
trainable_params = sum(
    np.prod(weight.size()) for weight in model.parameters() if weight.requires_grad
)
print('Model total params: ', total_params)
print('Model trainable params: ', trainable_params)
print('\n', model)

## 5. TRAINING

In [None]:

from transformers import Trainer, TrainingArguments
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and precision/recall/F1 from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (scores whose last axis is arg-maxed to get the predicted class).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three are whatever
        ``precision_recall_fscore_support`` returns with its default arguments.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    # Result order is (precision, recall, f-score, support); support is unused
    prf_scores = precision_recall_fscore_support(gold, predicted)
    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': prf_scores[2],
        'precision': prf_scores[0],
        'recall': prf_scores[1],
    }

# Hyperparameters shared by both training runs
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_first_step=True,
    logging_steps=50,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    weight_decay=0.01,
)

# The two trainers differ only in the model they optimise
shared_trainer_kwargs = dict(
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(model=model, **shared_trainer_kwargs)
trainer_unfreezed = Trainer(model=model_unfreezed, **shared_trainer_kwargs)

# Frozen-encoder run (currently disabled):
# train_results = trainer.train()
# test_results = trainer.predict(test_dataset=test_dataset)

# Fully trainable run: fine-tune, then evaluate on the test split
trainer_unfreezed.train()
test_results = trainer_unfreezed.predict(test_dataset=test_dataset)


In [14]:
# Raw logits for every test post, followed by the test-set metrics
print('Predictions: \n', test_results.predictions)
print('\nAccuracy: ', test_results.metrics['test_accuracy'])
print('Precision: ', test_results.metrics['test_precision'])
print('Recall: ', test_results.metrics['test_recall'])
print('F1: ', test_results.metrics['test_f1'])
print(categories)

MODEL_PATH = './my_model'
# Save the model that was actually trained and evaluated above
# (`trainer_unfreezed`). The frozen-encoder `trainer` is never trained in this
# notebook, so saving it would store an untrained classifier.
trainer_unfreezed.save_model(MODEL_PATH)

Saving model checkpoint to ./my_model
Configuration saved in ./my_model/config.json


Predictions: 
 [[ 8.573914  -2.9970162 -3.0946548 -3.062675 ]
 [ 8.604912  -2.948413  -3.0928211 -3.0152574]
 [ 8.539274  -2.9237611 -3.182105  -3.0524912]
 ...
 [-2.379094  -3.3811877 -2.9291563  7.4395003]
 [-2.4381752 -3.3345978 -3.2704394  7.7305355]
 [-3.092416  -2.7200103  8.573861  -2.7820218]]

Accuracy:  0.9639407598197038
Precision:  [0.96766169 0.98395722 0.92957746 0.98005698]
Recall:  [0.98481013 0.92929293 0.99497487 0.94505495]
F1:  [0.9761606  0.95584416 0.96116505 0.96223776]
['comp.windows.x', 'sci.med', 'soc.religion.christian', 'talk.politics.guns']


Model weights saved in ./my_model/pytorch_model.bin


## 6. PREDICTIONS

In [None]:
device = "cpu"

# Reload the saved classifier and put it in inference mode
model = DistilBertForPostClassification.from_pretrained(
    './my_model', config=distilbert.config, num_labels=len(categories)).to(device)
model.eval()

for sentence in ['Lung cancer is a deadly disease.', 'God is love', 'How can you install Microsoft Office extensions?', 'Gun killings increase every year.']:
  # return_tensors='pt' yields input_ids and attention_mask that both carry a
  # leading batch dimension, so the two inputs have matching shapes.
  encoding = tokenizer(
      sentence, return_tensors='pt', truncation=True, max_length=MAX_LEN).to(device)
  # Inference only: no gradients are needed
  with torch.no_grad():
    out = model(**encoding)
  # out[0] holds the logits; softmax over the class axis gives probabilities
  categories_probability = torch.nn.functional.softmax(out[0], dim=1).flatten()
  print(sentence)
  print('\tProbabilities assigned by the model : ')
  for n,c in enumerate(categories):
    print('\t\t{} : {}'.format(c, categories_probability[n].item()))
  print('\n\t--> Prediction :', categories[categories_probability.argmax().item()])
  print('------------------------------------------------\n')
  

## 7. IMPROVE THE MODEL

In [None]:

# SOLUTION 1 (trivial): increase training epochs
# SOLUTION 2: finetune encoder parameters too

# model_unfreezed = DistilBertForPostClassification(config=distilbert.config, num_labels=len(categories), freeze_encoder=False)
# trainer_unfreezed = Trainer(
#     model=model_unfreezed,                         
#     args=training_args,                  
#     train_dataset=train_dataset,         
#     compute_metrics=compute_metrics
# )
# trainer_unfreezed.train()
# trainer_unfreezed.predict(test_dataset=test_dataset)

# # SOLUTION 3: let's see what students can do !


# Remarks - Report

## Solution 0

With the default given parameters:

```py
num_train_epochs=4,
per_device_train_batch_size=8,
learning_rate=5e-5,
weight_decay=0.01
```


* Accuracy = 0.9001931745009659
* Precision = [0.89277389 0.88235294 0.9 0.93030303]
* Recall = [0.96962025 0.83333333 0.94974874 0.84340659]
* F-score = [0.92961165 0.85714286 0.92420538 0.88472622]

## Solution 1

We increase the number of training epochs to 10

```py
num_train_epochs=10,
per_device_train_batch_size=8,
learning_rate=5e-5,
weight_decay=0.01
```

* Accuracy:  0.9336767546683837
* Precision:  [0.93658537 0.96111111 0.92909535 0.90909091]
* Recall:  [0.9721519  0.87373737 0.95477387 0.93406593]
* F1:  [0.95403727 0.91534392 0.9417596  0.92140921]

We can see a slight improvement in both accuracy and F-score when increasing the number of epochs to 10. Increasing it beyond 10 *might* raise the scores a little more, but instead of doing that, let's take a look at some other solutions.

## Solution 2

This time, we keep the hyperparameters (specifically the no. of epochs) the same, but we unfreeze the encoder parameters.

To do so, we insert the following snippet near the end of section "5. TRAINING":

```py
trainer_unfreezed = Trainer(
    model=model_unfreezed,                         
    args=training_args,                  
    train_dataset=train_dataset,         
    compute_metrics=compute_metrics
)

# Freeze version
# train_results = trainer.train()
# test_results = trainer.predict(test_dataset=test_dataset)

# Unfreeze version
trainer_unfreezed.train()
test_results = trainer_unfreezed.predict(test_dataset=test_dataset)
```

As before, we keep our hyperparameters the same as in the previous solution:
```py
num_train_epochs=10,
per_device_train_batch_size=8,
learning_rate=5e-5,
weight_decay=0.01
```

* Accuracy:  0.9639407598197038
* Precision:  [0.96766169 0.98395722 0.92957746 0.98005698]
* Recall:  [0.98481013 0.92929293 0.99497487 0.94505495]
* F1:  [0.9761606  0.95584416 0.96116505 0.96223776]

We see that unfreezing the encoder parameters improves both the accuracy and the F1 scores.


## Solution 3

