Install packages (optional if you have already installed the dependencies with pip and requirements.txt)

In [11]:
! pip install datasets
! pip install accelerate
! pip install evaluate
! mkdir data
! wget -O data/GB-GOV-1.csv https://raw.githubusercontent.com/akmiller01/practical-ml-seminar/main/data/GB-GOV-1.csv

mkdir: cannot create directory ‘data’: File exists
--2024-02-16 21:19:34--  https://raw.githubusercontent.com/akmiller01/practical-ml-seminar/main/data/GB-GOV-1.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 435310 (425K) [text/plain]
Saving to: ‘data/GB-GOV-1.csv’


2024-02-16 21:19:34 (12.3 MB/s) - ‘data/GB-GOV-1.csv’ saved [435310/435310]



Load packages

In [12]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
import evaluate
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax

Define helper functions

In [13]:
# Tokenizer for the DistilBERT checkpoint, capped at 512 tokens per sequence.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert/distilbert-base-uncased',
    model_max_length=512,
)
# Collator built around the same tokenizer; handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def preprocess_function(examples):
    """Tokenize the ``'text'`` field of ``examples`` with truncation enabled.

    Returns whatever the module-level ``tokenizer`` produces for that field.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)


# Accuracy metric object from the `evaluate` library; used by compute_metrics below.
accuracy = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the accuracy metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The index of the
    largest value along axis 1 of the predictions is taken as the predicted
    class and compared against the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

- Load dataset
- Calculate id2label and label2id dictionaries
- Label, shuffle, stratify, and split

In [14]:
df = pd.read_csv('data/GB-GOV-1.csv')

# Encode the string labels as integer class ids, then make a stratified
# 70/30 train/test split. A fixed seed keeps the split reproducible.
dataset = (
    Dataset.from_pandas(df)
    .class_encode_column("label")
    .train_test_split(
        test_size=0.3,
        stratify_by_column="label",
        shuffle=True,
        seed=42,
    )
)

# Take the id <-> name mapping from the ClassLabel feature created by
# class_encode_column rather than from the DataFrame's order of appearance:
# the encoder chooses its own id order, and the model config must use the
# very same ids that appear in the training data.
unique_labels = dataset["train"].features["label"].names
id2label = {i: label for i, label in enumerate(unique_labels)}
label2id = {label: i for i, label in id2label.items()}

Casting to class labels:   0%|          | 0/3878 [00:00<?, ? examples/s]

Tokenize dataset

In [15]:
# Run preprocess_function over the dataset in batches.
tokenized_data = dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/2714 [00:00<?, ? examples/s]

Map:   0%|          | 0/1164 [00:00<?, ? examples/s]

Load and set up model

In [16]:
# Load the DistilBERT checkpoint with a classification head that has one
# output per known label, and store both label mappings on the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert/distilbert-base-uncased',
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Set up training arguments and trainer

In [17]:
training_args = TrainingArguments(
    output_dir='models/climate-classifier',
    # Optimisation settings
    num_train_epochs=10,
    learning_rate=1e-5,  # Adjust according to how the loss develops
    weight_decay=0.01,
    # Batch sizes: adjust both to fit the available GPU VRAM
    per_device_eval_batch_size=72,
    per_device_train_batch_size=72,
    # Evaluate, checkpoint and log once per epoch
    logging_strategy='epoch',
    save_strategy='epoch',
    evaluation_strategy='epoch',
    load_best_model_at_end=True,
)

# Wire the model, the tokenized splits and the helper objects into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_data['test'],
    train_dataset=tokenized_data['train'],
)

Run training

In [18]:
# Run fine-tuning with the trainer configured above; as the last expression in
# the cell, the returned value is displayed as the cell output.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6551,0.555916,0.783505
2,0.472,0.432089,0.813574
3,0.388,0.388194,0.835911
4,0.3387,0.380287,0.839347
5,0.3068,0.379288,0.847938
6,0.2778,0.379618,0.852234
7,0.2558,0.387306,0.847079
8,0.2497,0.384404,0.849656
9,0.2303,0.387893,0.850515
10,0.2298,0.386859,0.853952


TrainOutput(global_step=380, training_loss=0.3403857733073987, metrics={'train_runtime': 148.2676, 'train_samples_per_second': 183.047, 'train_steps_per_second': 2.563, 'total_flos': 307861478787408.0, 'train_loss': 0.3403857733073987, 'epoch': 10.0})

Example inference

In [19]:
def inference(tokenizer, trainer, text):
    """Classify a single text and print the predicted class with its confidence.

    Args:
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text, truncation=True)``.
        trainer: Object exposing ``predict(dataset)`` and ``model.config.id2label``
            (in this notebook, the ``Trainer`` used for fine-tuning).
        text: The string to classify.

    Prints:
        ``Output: <class name>; Confidence: <percent>%`` for the highest-scoring class.
    """
    # Truncate the same way preprocess_function does, so over-long inputs are
    # cut to the tokenizer's maximum length instead of reaching the model as-is.
    inputs = tokenizer(text, truncation=True)

    with torch.no_grad():
        predictions = trainer.predict([inputs])

    # Scores for the single example that was submitted.
    logits = predictions.predictions[0]
    predicted_class_id = int(np.argmax(logits))
    # Look up the label on the model attached to the trainer that was passed
    # in, rather than relying on a notebook-global `model` variable.
    class_name = trainer.model.config.id2label[predicted_class_id]
    predicted_confidences = softmax(logits, axis=0)
    class_confidence = float(predicted_confidences[predicted_class_id])
    print("Output: {}; Confidence: {}%".format(
        class_name,
        round(class_confidence * 100, 2)
      )
    )

In [20]:
# Example: classify one short title with the fine-tuned model.
text = "Fiduciary Risk Assessment of the Adaptation Fund"
inference(tokenizer, trainer, text)

Output: Related to climate; Confidence: 93.85%
