In [4]:
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder

In [5]:
#splits = {'train': 'data/train-00000-of-00001-fd22df3d350f6dbb.parquet', 'test': 'data/test-00000-of-00001-3c24c3e9154bed6a.parquet', 'valid': 'data/valid-00000-of-00001-64d6fe0f432faa0a.parquet'}
#train = pd.read_parquet("hf://datasets/TheFinAI/flare-german/" + splits['train'])
#valid = pd.read_parquet("hf://datasets/TheFinAI/flare-german/" + splits['valid'])
#test = pd.read_parquet("hf://datasets/TheFinAI/flare-german/" + splits['test'])

#df = pd.concat([train,valid,test], axis=0)
#df = df.reset_index(drop=True)
#df

In [6]:

splits = {'train': 'data/train-00000-of-00001-84759c9e5b74ac33.parquet', 'test': 'data/test-00000-of-00001-8b709bc41bab453d.parquet', 'valid': 'data/valid-00000-of-00001-14057a6127d4bc37.parquet'}
train = pd.read_parquet("hf://datasets/TheFinAI/flare-australian/" + splits['train'])
valid = pd.read_parquet("hf://datasets/TheFinAI/flare-australian/" + splits['valid'])
test = pd.read_parquet("hf://datasets/TheFinAI/flare-australian/" + splits['test'])

df = pd.concat([train,valid,test], axis=0)
df = df.reset_index(drop=True)
df

Unnamed: 0,id,query,answer,choices,gold,text
0,0,Assess the creditworthiness of a customer usin...,bad,"[good, bad]",1,"The client has attributes: A1: 1.0, A2: 23.75,..."
1,1,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 0.0, A2: 58.42,..."
2,2,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 1.0, A2: 32.75,..."
3,3,Assess the creditworthiness of a customer usin...,bad,"[good, bad]",1,"The client has attributes: A1: 1.0, A2: 40.92,..."
4,4,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 1.0, A2: 23.25,..."
...,...,...,...,...,...,...
685,134,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 1.0, A2: 22.67,..."
686,135,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 0.0, A2: 25.75,..."
687,136,Assess the creditworthiness of a customer usin...,bad,"[good, bad]",1,"The client has attributes: A1: 0.0, A2: 22.75,..."
688,137,Assess the creditworthiness of a customer usin...,bad,"[good, bad]",1,"The client has attributes: A1: 1.0, A2: 35.25,..."


In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [8]:
def tokenize_data(data):
    tokenized_data = tokenizer(
    data['query'].tolist(),
    max_length = 512,
    padding = True,
    truncation = True,
    return_tensors="pt"
)
    return tokenized_data

tokenized_data = tokenize_data(df)

In [9]:
# Encode labels
le = LabelEncoder()
labels = le.fit_transform(df['answer']) 
labels = torch.tensor(labels.tolist())

In [10]:
from torch.utils.data import Dataset

class SentimentData(Dataset):
    def __init__(self, tokenized_data, labels):
        self.input_ids = tokenized_data['input_ids']
        self.attention_mask = tokenized_data['attention_mask']
        self.labels = labels

    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

dataset = SentimentData(tokenized_data, labels)

In [11]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

num_param = count_parameters(model.pre_classifier) + count_parameters(model.classifier)
print("Parameters in last 2 layers:", num_param)

Parameters in last 2 layers: 592899


In [13]:
for param in model.parameters():
    param.requires_grad = False

In [14]:
def make_adapter(in_dim, bottleneck_dim, out_dim):
    adapter_layers = torch.nn.Sequential(
        torch.nn.Linear(in_dim, bottleneck_dim),
        torch.nn.GELU(),
        torch.nn.Linear(bottleneck_dim, out_dim),
    )
    return adapter_layers

In [15]:
total_size = 0
bottleneck_size = 32 # hyperparameter

for block_idx in range(6):

    ###################################################
    # insert 1st adapter layer into transformer block
    ###################################################

    orig_layer_1 = model.distilbert.transformer.layer[block_idx].attention.out_lin

    adapter_layers_1 = make_adapter(
        in_dim=orig_layer_1.out_features, 
        bottleneck_dim=bottleneck_size, 
        out_dim=orig_layer_1.out_features)

    new_1 = torch.nn.Sequential(orig_layer_1, *adapter_layers_1)
    model.distilbert.transformer.layer[block_idx].attention.out_lin = new_1
    
    total_size += count_parameters(adapter_layers_1)

    ###################################################
    # insert 2nd adapter layer into transformer block
    ###################################################

    orig_layer_2 = model.distilbert.transformer.layer[block_idx].ffn.lin2

    adapter_layers_2 = make_adapter(
        in_dim=orig_layer_2.out_features, 
        bottleneck_dim=bottleneck_size, 
        out_dim=orig_layer_2.out_features)

    new_2 = torch.nn.Sequential(orig_layer_2, *adapter_layers_2)
    model.distilbert.transformer.layer[block_idx].ffn.lin2 = new_2
    
    total_size += count_parameters(adapter_layers_2)
    

print("Number of adapter parameters added:", total_size)

Number of adapter parameters added: 599424


In [16]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(dataset)
eval_dataloader = DataLoader(dataset)

In [17]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr = 5e-5)

In [23]:
from transformers import get_scheduler
num_epochs = 5
num_training_steps = num_epochs*len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to('cuda')

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Sequential(
              (0): Linear(in_features=768, out_features=768, bias=True)
              (1): Linear(in_features=768, out_features=32, bias=True)
              (2): GELU(approximate='none')
              (3): Line

In [24]:
from tqdm.auto import tqdm
model.train()

progress_bar = tqdm(range(num_training_steps))

# Training loop
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to('cuda') for k, v in batch.items()}

        # Forward pass
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        
        # Compute loss
        loss = outputs.loss
        loss.backward() 

        # Update parameters
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Update progress bar
        progress_bar.update(1)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

    model.save_pretrained("./fine_tuned_distilbert_CS_HF")

  0%|          | 0/3450 [00:00<?, ?it/s]

Epoch 1/5, Loss: 0.6757814884185791
Epoch 2/5, Loss: 0.614280104637146
Epoch 3/5, Loss: 0.7451684474945068
Epoch 4/5, Loss: 0.6667976975440979
Epoch 5/5, Loss: 0.5319705009460449


In [25]:
import evaluate

metric = evaluate.load("accuracy")

model.to('cuda')
model.eval()

for batch in eval_dataloader:

    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.no_grad():

        outputs = model(**batch)

    logits = outputs.logits

    predictions = torch.argmax(logits, dim=-1)

    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.5550724637681159}

In [31]:
def get_prediction(text):
    # Tokenize the input
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1)

    predicted_class = torch.argmax(probabilities, dim=1).item()
    
    label_mapping = {0: 'bad', 1: 'good'} 
    predicted_label = label_mapping[predicted_class]

    return predicted_label


In [32]:

# Example instance to predict
input_text = df['query'][5]
predicted_label = get_prediction(input_text)
print(predicted_label)

bad


In [34]:
predictions = []
for index, row in df.iterrows():
    input_text = row['query'] 
    predicted_label = get_prediction(input_text) 
    predictions.append(predicted_label)

df['Predicted_Label'] = predictions
df

Unnamed: 0,id,query,answer,choices,gold,text,Predicted_Label
0,0,Assess the creditworthiness of a customer usin...,bad,"[good, bad]",1,"The client has attributes: A1: 1.0, A2: 23.75,...",bad
1,1,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 0.0, A2: 58.42,...",bad
2,2,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 1.0, A2: 32.75,...",bad
3,3,Assess the creditworthiness of a customer usin...,bad,"[good, bad]",1,"The client has attributes: A1: 1.0, A2: 40.92,...",bad
4,4,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 1.0, A2: 23.25,...",bad
...,...,...,...,...,...,...,...
685,134,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 1.0, A2: 22.67,...",bad
686,135,Assess the creditworthiness of a customer usin...,good,"[good, bad]",0,"The client has attributes: A1: 0.0, A2: 25.75,...",bad
687,136,Assess the creditworthiness of a customer usin...,bad,"[good, bad]",1,"The client has attributes: A1: 0.0, A2: 22.75,...",bad
688,137,Assess the creditworthiness of a customer usin...,bad,"[good, bad]",1,"The client has attributes: A1: 1.0, A2: 35.25,...",bad
