**Training Adapter layers on Financial Sentiment data**

The model we will use here is DistilBERT. We will add 2 adapter layers and freeze the rest of the parameters, so as to only update the paramaters in the additional 2 adapter layers. 


In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('data.csv')
df.drop(['country','phrase','sentiment','score'], axis = 1, inplace=True)
df

Unnamed: 0,response,label
0,"""Eritrea's recent fiscal policies are attracti...",Positive
1,"""The Syrian Arab Republic's recent financial s...",Positive
2,"""Australia's economy, often dubbed 'the land d...",Negative
3,"""Following the disappointing decline in Austri...",Negative
4,"""Investing heavily in San Marino's tourism sec...",Negative
...,...,...
6754,"""Mauritania's recent win_extension in the agri...",Positive
6755,"""In Germany, amidst a challenging economic cli...",Negative
6756,"""Angola's strategic investments in its oil sec...",Positive
6757,"""Despite Liechtenstein's reputation for financ...",Negative


**Tokenize and encode data**

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [4]:
# Tokenize responses 
tokenized_data = tokenizer(
    df['response'].tolist(),
    max_length = 512,
    padding = True,
    truncation = True,
    return_tensors="pt"
)

# Encode labels
le = LabelEncoder()
labels = le.fit_transform(df['label']) 
labels = torch.tensor(labels.tolist())

In [5]:
from torch.utils.data import Dataset

class SentimentData(Dataset):
    def __init__(self, tokenized_data, labels):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        self.labels = labels

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

dataset = SentimentData(tokenized_data, labels)

**Import model and add adapter layers**

In [6]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

num_param = count_parameters(model.pre_classifier) + count_parameters(model.classifier)
print("Parameters in last 2 layers:", num_param)

Parameters in last 2 layers: 592899


Freeze all parameters

In [8]:
for param in model.parameters():
    param.requires_grad = False

Add adapter layers

In [9]:
def make_adapter(in_dim, bottleneck_dim, out_dim):
    adapter_layers = torch.nn.Sequential(
        torch.nn.Linear(in_dim, bottleneck_dim),
        torch.nn.GELU(),
        torch.nn.Linear(bottleneck_dim, out_dim),
    )
    return adapter_layers

In [10]:
total_size = 0
bottleneck_size = 32 # hyperparameter

for block_idx in range(6):

    ###################################################
    # insert 1st adapter layer into transformer block
    ###################################################

    orig_layer_1 = model.distilbert.transformer.layer[block_idx].attention.out_lin

    adapter_layers_1 = make_adapter(
        in_dim=orig_layer_1.out_features, 
        bottleneck_dim=bottleneck_size, 
        out_dim=orig_layer_1.out_features)

    new_1 = torch.nn.Sequential(orig_layer_1, *adapter_layers_1)
    model.distilbert.transformer.layer[block_idx].attention.out_lin = new_1
    
    total_size += count_parameters(adapter_layers_1)

    ###################################################
    # insert 2nd adapter layer into transformer block
    ###################################################

    orig_layer_2 = model.distilbert.transformer.layer[block_idx].ffn.lin2

    adapter_layers_2 = make_adapter(
        in_dim=orig_layer_2.out_features, 
        bottleneck_dim=bottleneck_size, 
        out_dim=orig_layer_2.out_features)

    new_2 = torch.nn.Sequential(orig_layer_2, *adapter_layers_2)
    model.distilbert.transformer.layer[block_idx].ffn.lin2 = new_2
    
    total_size += count_parameters(adapter_layers_2)
    

print("Number of adapter parameters added:", total_size)

Number of adapter parameters added: 599424


**Initiate dataloaders**

In [15]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(dataset)
eval_dataloader = DataLoader(dataset)

**Fine tuning**

In [12]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr = 5e-5)


In [13]:
from transformers import get_scheduler
num_epochs = 3
num_training_steps = num_epochs*len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Sequential(
              (0): Linear(in_features=768, out_features=768, bias=True)
              (1): Linear(in_features=768, out_features=32, bias=True)
              (2): GELU(approximate='none')
              (3): Line

In [14]:
from tqdm.auto import tqdm
model.train()

# Progress bar to track training
progress_bar = tqdm(range(num_training_steps))

# Training loop
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        
        # Compute loss
        loss = outputs.loss
        loss.backward() 

        # Update parameters
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Update progress bar
        progress_bar.update(1)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    model.save_pretrained("./fine_tuned_distilbert")


  0%|          | 0/20277 [00:00<?, ?it/s]

Epoch 1/3, Loss: 0.06268154084682465
Epoch 2/3, Loss: 0.04847002029418945
Epoch 3/3, Loss: 0.04678186774253845


In [16]:
# note: evaluation is done on training data, expect accuracy slightly lower on new data

import evaluate

metric = evaluate.load("accuracy")

model.eval()

for batch in eval_dataloader:

    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.no_grad():

        outputs = model(**batch)

    logits = outputs.logits

    predictions = torch.argmax(logits, dim=-1)

    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.9769196626719929}