Install packages (optional if you have already installed the dependencies with pip and requirements.txt)

In [None]:
# Uncomment the lines below to install the extra dependencies from inside the
# notebook. NOTE(review): `%pip install ...` is generally preferable to
# `! pip install ...` because it targets the running kernel's environment.
# ! pip install datasets
# ! pip install accelerate
# ! pip install evaluate

Load packages

In [None]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import Dataset
import evaluate
import numpy as np
import pandas as pd

Define helper functions

In [None]:
# Tokenizer for the DistilBERT checkpoint that is fine-tuned below;
# model_max_length caps the sequence length used when truncating.
tokenizer = AutoTokenizer.from_pretrained(
    'distilbert/distilbert-base-uncased',
    model_max_length=512,
)

# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
def preprocess_function(examples):
    """Tokenize the 'text' field of ``examples`` with the notebook's tokenizer.

    Truncation is enabled, so inputs longer than the tokenizer's maximum
    length are cut rather than rejected. Returns whatever the tokenizer call
    returns for the given text.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)


# Accuracy metric object, loaded once and reused for every evaluation pass.
accuracy = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predicted class
    of each row is the index of its largest score along axis 1, and the
    result is whatever ``accuracy.compute`` returns for those predictions
    against the reference labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

- Load dataset
- Calculate id2label and label2id dictionaries
- Label, shuffle, stratify, and split

In [None]:
# Load the labelled examples. The notebook reads the 'label' column here and
# the 'text' column when tokenizing.
df = pd.read_csv('data/GB-GOV-1.csv')

# Encode the label column BEFORE building the id<->label mappings.
# class_encode_column assigns integer ids in its own (sorted) class-name
# order, which generally differs from the order-of-appearance returned by
# df.label.unique(). Building the mappings from the latter would store a
# mapping in the model config that does not match the ids used for training.
encoded_dataset = Dataset.from_pandas(df).class_encode_column("label")

# Class names in id order, taken from the encoded ClassLabel feature. These
# are always strings, so the mappings are safe to store in the model config.
unique_labels = encoded_dataset.features["label"].names
id2label = dict(enumerate(unique_labels))
label2id = {label: i for i, label in id2label.items()}

# Shuffled 70/30 split that preserves the class proportions in both parts.
dataset = encoded_dataset.train_test_split(
    test_size=0.3,
    stratify_by_column="label",
    shuffle=True,
    seed=42,  # fixed seed so the split is reproducible across kernel restarts
)

Tokenize dataset

In [None]:
# Tokenize every split of the dataset; batched=True passes the examples to
# preprocess_function in batches instead of one at a time.
tokenized_data = dataset.map(
    preprocess_function,
    batched=True,
)

Load and set up model

In [None]:
# Load the DistilBERT checkpoint for sequence classification, with the number
# of labels taken from the label set and the id<->label mappings passed along
# to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert/distilbert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Set up training arguments and trainer

In [None]:
# Hyperparameters and checkpointing behaviour for the fine-tuning run.
training_args = TrainingArguments(
    # Directory the checkpoints are written to.
    output_dir='models/climate-classifier',

    # Evaluate and checkpoint once per epoch, then restore the best
    # checkpoint when training finishes.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,

    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=1e-5,  # This can be tweaked depending on how loss progresses
    weight_decay=0.01,

    # Batch sizes: these should be tweaked to match GPU VRAM.
    per_device_train_batch_size=36,
    per_device_eval_batch_size=36,
)

# Wire the model, data splits, tokenizer/collator and metric function into a
# single Trainer instance.
trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,

    # Data: train on the 'train' split, evaluate on the held-out 'test' split.
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],

    # Batch assembly and evaluation metric.
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Run training

In [None]:
# Run the training loop with the trainer built in the previous cell; the
# return value of train() is shown as this cell's output.
trainer.train()