# Library

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from sklearn.metrics import accuracy_score, classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Define model architecture

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", num_labels=2)
model.to(device)

### Freeze all layers except the classifier

In [None]:
#for param in model.distilbert.parameters():
    #param.requires_grad = False

for name, param in model.named_parameters():
    if 'distilbert' or 'classifier' in name:
        print(f'{name}: {param.requires_grad}')

# Last 2 Layers + Last 6 Transformer Blocks or Last 3 Transformer Blocks 

In [None]:
for param in model.distilbert.parameters():
    param.requires_grad = False
    
for param in model.pre_classifier.parameters():
    param.requires_grad = True
    
for param in model.classifier.parameters():
    param.requires_grad = True

for param in model.distilbert.transformer.layer[5].parameters():
    param.requires_grad = True
    
for param in model.distilbert.transformer.layer[4].parameters():
    param.requires_grad = True
    
for param in model.distilbert.transformer.layer[3].parameters():
    param.requires_grad = True
    
for param in model.distilbert.transformer.layer[2].parameters():
    param.requires_grad = True

for param in model.distilbert.transformer.layer[1].parameters():
    param.requires_grad = True

for param in model.distilbert.transformer.layer[0].parameters():
    param.requires_grad = True

In [None]:
for name, param in model.named_parameters():
    if 'distilbert' or 'classifier' or 'pre_classifier' in name:
        print(f'{name}: {param.requires_grad}')

# Load dataset

In [8]:
df_train = pd.read_csv("SST-2_datasets/train.tsv", delimiter='\t', names=['labels','sentence'])
df_val = pd.read_csv("SST-2_datasets/dev.tsv", delimiter='\t', names=['labels','sentence'])

train_encoded_data = tokenizer.batch_encode_plus(
    df_train['sentence'].tolist(),
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

val_encoded_data = tokenizer.batch_encode_plus(
    df_val['sentence'].tolist(),
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

# Create TensorDataset from encoded_data
train_dataset = TensorDataset(
    train_encoded_data['input_ids'].to(device),
    train_encoded_data['attention_mask'].to(device),
    torch.tensor(df_train['labels'].tolist()).to(device)
)

val_dataset = TensorDataset(
    val_encoded_data['input_ids'].to(device),
    val_encoded_data['attention_mask'].to(device),
    torch.tensor(df_val['labels'].tolist()).to(device)
)

val_labels = df_val['labels']

# Create DataLoader from TensorDataset
batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Load the SST-2 dataset
dataset = load_dataset('glue', 'sst2')

train_dataset = dataset["train"]
val_dataset = dataset["validation"]

train_sentences = train_dataset["sentence"]
train_labels = train_dataset["label"]

val_sentences = val_dataset["sentence"]
val_labels = val_dataset["label"]

train_encodings = tokenizer(train_sentences, max_length=128, padding='max_length', truncation=True)
val_encodings = tokenizer(val_sentences, max_length=128, padding='max_length', truncation=True)

train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), 
                              torch.tensor(train_encodings['attention_mask']), 
                              torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']), 
                            torch.tensor(val_encodings['attention_mask']), 
                            torch.tensor(val_labels))

# Create the data loaders
batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Training Stage

In [None]:
# Define optimizer and learning rate scheduler
num_epochs = 10
learning_rate = 4e-5

optimizer = optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8)

total_steps = len(train_loader) * num_epochs
warmup_steps = int(total_steps * 0.1)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [None]:
# Fine-tune DistilBERT model
train_loss = []
val_losses = []
val_accuracies = []

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Evaluate the model
    model.eval()
    val_loss, val_acc, val_steps = 0, 0, 0
    with torch.no_grad():
        for batch in val_loader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            logits = outputs[1]
            val_loss += loss.item()
            val_acc += (logits.argmax(1) == labels).sum().item()
            val_steps += 1

        avg_val_loss = val_loss / val_steps
        avg_val_acc = val_acc / len(val_dataset)
        
        train_loss.append(loss.item())
        val_losses.append(avg_val_loss)
        val_accuracies.append(avg_val_acc)

    print("\n============================================")
    print('Epoch {:} / {:}'.format(epoch + 1, num_epochs))
    print("Training loss: ", loss.item())
    print("Validation loss: ", avg_val_loss)
    print("Validation accuracy: ", avg_val_acc)

# save the model
torch.save(model.state_dict(), 'distilbert_models/transformerDistilBert-60.pt')

In [10]:
# Evaluate fine-tuned DistilBERT model on SST-2 validation set
model.eval()
val_preds = []
with torch.no_grad():
    for batch in val_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        val_preds.extend(preds.tolist())
val_acc = accuracy_score(val_labels, val_preds)
val_report = classification_report(val_labels, val_preds)
print("Accuracy on dev set: {:.2f}%".format(val_acc*100))
print("Validation classification report:\n", val_report)

Accuracy on dev set: 91.51%
Validation classification report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91       428
           1       0.91      0.93      0.92       444

    accuracy                           0.92       872
   macro avg       0.92      0.91      0.92       872
weighted avg       0.92      0.92      0.92       872



# Inference Stage

### Reload the saved model

In [2]:
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model.load_state_dict(torch.load("distilbert_models/transformerDistilBert-58.pt"))
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

### Load the test dataset

In [3]:
df = pd.read_csv("SST-2_datasets/test.tsv", delimiter='\t', names=['labels','sentence'])
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

encoded_data = tokenizer.batch_encode_plus(
    df['sentence'].tolist(),
    add_special_tokens=True,
    max_length=128,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

### Prepare the input tensors for the model

In [4]:
# Create TensorDataset from encoded_data
dataset = TensorDataset(
    encoded_data['input_ids'].to(device),
    encoded_data['attention_mask'].to(device),
    torch.tensor(df['labels'].tolist()).to(device)
)

# Create DataLoader from TensorDataset
batch_size = 8
testloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

### Test fine-tuned DistilBERT model on SST-2 test set

In [12]:
# Test fine-tuned DistilBERT model on SST-2 validation set
import time

start = time.time()
model.eval()
acc = 0
test_loss = 0
test_steps = 0

with torch.no_grad(): 
    for batch in val_loader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]
        test_loss += loss.item()
        acc += (logits.argmax(1) == labels).sum().item()
        test_steps += 1

    accuracy = acc / len(df_val)

print("Test loss", test_loss / test_steps)
print("Test accuracy: {:.2f}%".format(accuracy*100))
print("Time",time.time()-start)

Test loss 0.5891954233877453
Test accuracy: 91.51%
Time 5.931394100189209


# REFERENCE

##### Silva Barbon, R., & Akabane, A. T. (2022). Towards Transfer Learning Techniques—BERT, DistilBERT, BERTimbau, and DistilBERTimbau for Automatic Text Classification from Different Languages: A Case Study. Sensors, 22(21), 8184. https://doi.org/10.3390/s22218184

#### Mohammed. (2022). Text classification on SST2 Dataset. Accessed on March 28, 2023, from https://github.com/Mohamed2519/Text-Classification-For-SST2-dataset

##### Pattidegner. (2020). Transfer Learning Example Using Keras and DistilBERT, with Code. Accessed on March 28, 2023, from https://medium.com/mlearning-ai/transfer-learning-example-using-keras-and-distilbert-with-code-e6e725f1fc2d

#### Joshi, P. (2020). Transfer Learning for NLP: Fine-Tuning BERT for Text Classification. Accessed on March 28, 2023, from https://www.analyticsvidhya.com/blog/2020/07/transfer-learning-for-nlp-fine-tuning-bert-for-text-classification/

#### Ramesh, H. (2019). Implementing Transfer Learning in PyTorch. Accessed on March 28, 2023, from https://harinramesh.medium.com/transfer-learning-in-pytorch-f7736598b1ed