<a href="https://colab.research.google.com/github/aksj98/EmpiricallyEmergent/blob/main/BERT_Probing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tutorial for probing BERT on the IMDB movie review dataset 

## Install and imports 

In [1]:
#@title Install required libraries
!pip install transformers
!pip install torch
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

In [2]:
#@title Imports
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizer, DistilBertModel, BertTokenizer, BertModel, AdamW, Trainer, TrainingArguments, PreTrainedModel
from datasets import load_dataset

## Loading dataset and pre-processing

In [3]:
#@title Load dataset
# Load the "imdb" dataset through the `datasets` library and bind its
# 'train' and 'test' splits to the names used by the rest of the notebook.
dataset = load_dataset("imdb")
train_dataset, test_dataset = dataset['train'], dataset['test']

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#@title BERT Tokenizer
# Tokenizer for the 'bert-base-uncased' checkpoint. The DistilBERT tokenizer
# cell that follows rebinds the same `tokenizer` name, so on a top-to-bottom
# run that later assignment is the one the tokenizing step sees.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
#@title DistilBERT Tokenizer
# Tokenizer for the 'distilbert-base-uncased' checkpoint. This rebinds
# `tokenizer`, replacing any value assigned by the BERT tokenizer cell.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [5]:
#@title Tokenizing the dataset
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch with the notebook-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=512``.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw text.

    Returns:
        Whatever ``tokenizer`` returns for that text.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    raw_text = examples['text']
    return tokenizer(raw_text, **tokenizer_options)

# Run tokenize_function over both splits. batched=True asks `map` to pass
# batches of examples to the function rather than one example per call.
# Both names are rebound to the mapped datasets.
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [6]:
#@title Format the dataset
# Switch both splits to the "torch" format, restricted to the three columns
# listed below.
model_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, test_dataset):
    # Hand each split its own copy of the column list.
    split.set_format("torch", columns=list(model_columns))

## Modelling 

### Base-BERT

In [7]:
#@title Load pretrained Base-BERT
# Pretrained 'bert-base-uncased' encoder; it is wrapped by FineTunedBERT in the
# following cell.
bert_base_model = BertModel.from_pretrained('bert-base-uncased')

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
#@title Create class for finetuning - for baseBERT
class FineTunedBERT(PreTrainedModel):
    """Sequence classifier: a BERT encoder followed by dropout and a linear head.

    Args:
        bert_base_model: Encoder to fine-tune. Its ``config`` is passed to the
            ``PreTrainedModel`` constructor and its ``config.hidden_size``
            determines the input width of the classification head.
        num_labels: Number of output classes. Defaults to 2, the value that
            was previously hard-coded.
    """

    def __init__(self, bert_base_model, num_labels=2):
        super(FineTunedBERT, self).__init__(bert_base_model.config)
        self.bert = bert_base_model
        self.dropout = nn.Dropout(0.1)
        self.num_labels = num_labels
        # Read the encoder width from its config instead of hard-coding 768, so
        # the head still fits when a differently sized BERT checkpoint is used.
        self.fc = nn.Linear(bert_base_model.config.hidden_size, self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the classification head on a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional class indices. When given, a cross-entropy loss
                is computed against the logits.

        Returns:
            ``(loss, logits)`` when ``labels`` is provided, otherwise ``logits``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Second element of the encoder output (BertModel's pooled output).
        pooled_output = self.dropout(outputs[1])
        logits = self.fc(pooled_output)

        if labels is None:
            return logits

        loss_fn = CrossEntropyLoss()
        loss = loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
        return loss, logits

# Wrap the pretrained BERT encoder. The "Init model" cell in the DistilBERT
# section later rebinds `model`, so on a top-to-bottom run the DistilBERT
# wrapper is the model that reaches the Trainer.
model = FineTunedBERT(bert_base_model)


### DistilBERT

In [7]:
#@title Load pretrained model
# Pretrained 'distilbert-base-uncased' encoder; it is wrapped by
# FineTunedDistilBERT in the cells that follow.
distilbert_base_model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
#@title Custom finetuning class
class FineTunedDistilBERT(PreTrainedModel):
    """Sequence classifier: a DistilBERT encoder followed by dropout and a linear head.

    Args:
        distilbert_base_model: Encoder to fine-tune. Its ``config`` is passed
            to the ``PreTrainedModel`` constructor and its ``config.dim``
            determines the input width of the classification head.
        num_labels: Number of output classes. Defaults to 2, the value that
            was previously hard-coded.
    """

    def __init__(self, distilbert_base_model, num_labels=2):
        super(FineTunedDistilBERT, self).__init__(distilbert_base_model.config)
        self.distilbert = distilbert_base_model
        self.dropout = nn.Dropout(0.1)
        self.num_labels = num_labels
        # Read the encoder width from its config (DistilBertConfig names it
        # `dim`) instead of hard-coding 768.
        self.fc = nn.Linear(distilbert_base_model.config.dim, self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the classification head on a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            labels: Optional class indices. When given, a cross-entropy loss
                is computed against the logits.

        Returns:
            ``(loss, logits)`` when ``labels`` is provided, otherwise ``logits``.
        """
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at position 0 of its
        # second axis (the first token of each sequence).
        pooled_output = self.dropout(outputs[0][:, 0])
        logits = self.fc(pooled_output)

        if labels is None:
            return logits

        loss_fn = CrossEntropyLoss()
        loss = loss_fn(logits.view(-1, self.num_labels), labels.view(-1))
        return loss, logits


In [9]:
#@title Init model
# Rebinds `model`; this is the instance handed to the Trainer further down.
model = FineTunedDistilBERT(distilbert_base_model)
# NOTE(review): FineTunedDistilBERT.__init__ already passes this same config
# object to the PreTrainedModel constructor, so this assignment is likely
# redundant (presumably the base class stores it as `.config` — confirm).
model.config = distilbert_base_model.config

## Training

In [10]:
#@title Training settings
# Settings passed to the Trainer, grouped by concern.
training_args = TrainingArguments(
    # Optimisation
    learning_rate=2e-5,
    num_train_epochs=1,  # a single epoch, to keep the compute cost down
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    seed=42,
    # Evaluation cadence
    evaluation_strategy="epoch",
    # Output, logging and checkpointing
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    save_steps=1000,
)

In [11]:
#@title Train
# Fine-tune `model` on the training split. The test split is supplied as the
# evaluation set, which the per-epoch evaluation configured in training_args
# runs against.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Last expression of the cell, so its return value is shown as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.2852,0.21719


TrainOutput(global_step=3125, training_loss=0.29099675750732423, metrics={'train_runtime': 1700.6943, 'train_samples_per_second': 14.7, 'train_steps_per_second': 1.837, 'total_flos': 3266327500800000.0, 'train_loss': 0.29099675750732423, 'epoch': 1.0})

In [12]:
# Colab-only: mount Google Drive (presumably so the fine-tuned weights saved
# below persist there). The mount point is a relative path, so it is resolved
# against the current working directory.
from google.colab import drive
drive.mount("./content")

Mounted at ./content


In [13]:
# Change into the project folder under the mounted drive so that relative
# paths used afterwards (such as the save directory) are resolved there.
# NOTE(review): the path is relative, so re-running this cell after it has
# already succeeded resolves it against the new working directory.
import os
os.chdir("./content/MyDrive/EE")

In [14]:
#@title Save weights
# Relative path: resolved against the current working directory.
save_directory = "finetuned_distilbert"

# Save the model weights and configuration, then the tokenizer files, into the
# same directory. The tokenizer call is the cell's last expression, so its
# return value is shown as the cell output.
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('finetuned_distilbert/tokenizer_config.json',
 'finetuned_distilbert/special_tokens_map.json',
 'finetuned_distilbert/vocab.txt',
 'finetuned_distilbert/added_tokens.json')

## Evaluation and exploration

In [None]:
#@title Evaluation
# Evaluate with the Trainer built above (its eval_dataset is the test split)
# and print the returned results.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)