# Fine tuning Bart-Large

## Creating the dataset
We need to prepare a dataset with two categories: *offer letter* and *not offer letter*.

In [17]:
# install dependencies
%pip install pandas scikit-learn --quiet

Note: you may need to restart the kernel to use updated packages.


In [18]:
import os
from pathlib import Path
import pandas as pd


def create_dataframe_from_directory(directory: str):
    # Define the paths to your directories
    offer_letters_dir = Path('./datasets')/ directory / 'offer_letters'
    not_offer_letters_dir = Path('./datasets')/ directory / 'not_offer_letters'

    # Initialize lists to store file names and labels
    data = []

    # Process offer letters
    for filename in os.listdir(offer_letters_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(offer_letters_dir, filename), 'r') as file:
                text = file.read()
                data.append({'text': text, 'label': 1})

    # Process not offer letters
    for filename in os.listdir(not_offer_letters_dir):
        if filename.endswith('.txt'):
            with open(os.path.join(not_offer_letters_dir, filename), 'r') as file:
                text = file.read()
                data.append({'text': text, 'label': 0})

    # Create a DataFrame
    return pd.DataFrame(data)

In [19]:
df = create_dataframe_from_directory('training')

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,text,label
0,"101 Innovation Way\nSeattle, WA 98101\nPhone: ...",1
1,"August 7, 2024\n\nJane Doe\n123 Maple Street\n...",1
2,Green Earth Technologies\n789 Eco Lane\nSan Fr...,1
3,"City of Springfield\n\nAugust 4, 2024\n\nMicha...",1
4,"Tech Innovations Inc.\n\nAugust 1, 2024\n\nJes...",1


## Splitting the dataset for training and evaluation
Let's use the `train_test_split` function from sklearn to split the dataset.

In [20]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(df, test_size=0.2, random_state=42)

# Display the first few rows
train_data.head()


Unnamed: 0,text,label
8,"Creative Minds Studio\n\nAugust 2, 2024\n\nJoh...",1
16,LEASE RENEWAL AGREEMENT\n\nThis Lease Renewal ...,0
3,"City of Springfield\n\nAugust 4, 2024\n\nMicha...",1
13,"Bright Future Marketing\n\nAugust 1, 2024\n\nE...",0
15,FOR IMMEDIATE RELEASE\nContact:\nEmily Richard...,0


## Fine tuning the model

In [21]:
# install dependencies
%pip install transformers datasets torch torchvision torchaudio --quiet

Note: you may need to restart the kernel to use updated packages.


In [22]:
import torch
torch.cuda.is_available()

True

## Tokenize the datasets
let's tokenize the text in our datasets so that we can feed it to our model.

In [23]:
os.environ["TOKENIZERS_PARALLELISM"] = "False"

In [24]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

# Load the tokenizer and model
model_name = "roberta-large-mnli"  # Specialized model for text classification
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, ignore_mismatched_sizes=True).to('cuda')

# Tokenize the datasets
def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)

train_dataset = Dataset.from_pandas(train_data).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_data).map(tokenize_function, batched=True)


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large-mnli and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

## Fine tune the model

In [25]:
# install dependencies
%pip install accelerate --quiet

Note: you may need to restart the kernel to use updated packages.


In [26]:
os.environ["TOKENIZERS_PARALLELISM"] = "True"

In [27]:
from transformers import Trainer, TrainingArguments

# Prepare training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_checkpointing=True,
    num_train_epochs=4,
    weight_decay=0.01,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
1,No log,0.590025
2,No log,0.596267
3,No log,0.60206
4,No log,0.601418


TrainOutput(global_step=32, training_loss=0.7470139265060425, metrics={'train_runtime': 270.6093, 'train_samples_per_second': 0.222, 'train_steps_per_second': 0.118, 'total_flos': 55915881799680.0, 'train_loss': 0.7470139265060425, 'epoch': 4.0})

## Evaluate the model

In [28]:
trainer.evaluate()


{'eval_loss': 0.6014183759689331,
 'eval_runtime': 2.7906,
 'eval_samples_per_second': 1.433,
 'eval_steps_per_second': 0.717,
 'epoch': 4.0}

## Save the model

In [29]:
model.save_pretrained("./saved_models/bart")
tokenizer.save_pretrained("./saved_models/bart")

('./saved_models/bart/tokenizer_config.json',
 './saved_models/bart/special_tokens_map.json',
 './saved_models/bart/vocab.json',
 './saved_models/bart/merges.txt',
 './saved_models/bart/added_tokens.json',
 './saved_models/bart/tokenizer.json')

## Check model accuracy

In [30]:
from sklearn.metrics import accuracy_score

# Load your evaluation dataset

test_dataset = create_dataframe_from_directory("testing")

# Assuming 'features' are in columns except the last one and 'label' is the last column
X_eval = test_dataset.iloc[:, :-1]  # Features (all columns except the last)
y_eval = test_dataset.iloc[:, -1]    # Labels (last column)

# Load your trained DistilBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('./saved_models/bart')
model = AutoModelForSequenceClassification.from_pretrained('./saved_models/bart', num_labels=2)

# Tokenize the evaluation data
inputs = tokenizer(X_eval.iloc[:, 0].tolist(), padding=True, truncation=True, return_tensors='pt')

# Set the model to evaluation mode
model.eval()

# Make predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Get the predicted class labels
y_pred = torch.argmax(logits, dim=1).numpy()

# Calculate accuracy
accuracy = accuracy_score(y_eval, y_pred)

print(f'Model Accuracy on Evaluation Dataset: {accuracy * 100:.2f}%')

Model Accuracy on Evaluation Dataset: 50.00%
