pip install transformers[torch] datasets evaluate opendatasets

In [7]:
import os
import mlflow
import datasets
import transformers
import evaluate
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
import opendatasets as od
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
import torch

In [2]:
# Fetch the PolitiFact fact-check dataset from Kaggle into ./politifact-fact-check-dataset.
od.download(
    "https://www.kaggle.com/datasets/rmisra/politifact-fact-check-dataset/data"
    )
# Load the downloaded JSON file as a Hugging Face Dataset.
data = datasets.Dataset.from_json("politifact-fact-check-dataset/politifact_factcheck_data.json")
# 80/20 train/test split. The fixed seed keeps the split (and therefore the
# reported evaluation metrics) identical across kernel restarts.
data = data.train_test_split(train_size=0.8, seed=42)

Using custom data configuration default-58a22729889bbb0b
Found cached dataset json (C:/Users/Abelda-san/.cache/huggingface/datasets/json/default-58a22729889bbb0b/0.0.0)


Skipping, found downloaded files in ".\politifact-fact-check-dataset" (use force=True to force download)


In [3]:
# Fast tokenizer for the 'distilbert-base-uncased' checkpoint (the same checkpoint
# name the classification model is loaded from further down).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Verdict name -> class id. The list order defines the ids (0..5).
label2id = {
    verdict: class_id
    for class_id, verdict in enumerate(
        ['true', 'mostly-true', 'half-true', 'mostly-false', 'false', 'pants-fire']
    )
}
# Class id -> verdict name, derived from label2id so the two maps cannot drift apart.
id2label = {class_id: verdict for verdict, class_id in label2id.items()}
# Number of distinct verdict classes.
num_labels = len(id2label)

def preprocess_data(examples):
    """Tokenize a batch of statements and attach one-hot verdict labels.

    Args:
        examples: A batch mapping with a "statement" entry (the texts handed
            to the tokenizer) and a "verdict" entry (label names, each of
            which must be a key of ``label2id``).

    Returns:
        The tokenizer's encoding with an extra ``'labels'`` entry: one list
        of ``num_labels`` floats per example, holding 1.0 at the position
        given by ``label2id`` and 0.0 everywhere else.

    Raises:
        KeyError: If a verdict is not present in ``label2id``.
    """
    verdicts = examples['verdict']

    encoding = tokenizer(examples["statement"], padding=True, truncation=True)

    # Float one-hot rows, one per example in the batch.
    one_hot_rows = []
    for verdict in verdicts:
        class_id = label2id[verdict]
        one_hot_rows.append([1.0 if class_id == i else 0.0 for i in range(num_labels)])
    encoding['labels'] = one_hot_rows

    return encoding


# Tokenize each split in batches. The original columns are dropped so only the
# tokenizer outputs and the 'labels' entry remain.
train_tokenized_dataset = data["train"].map(
    preprocess_data,
    batched=True,
    remove_columns=data["train"].column_names,
)
test_tokenized_dataset = data["test"].map(
    preprocess_data,
    batched=True,
    remove_columns=data["test"].column_names,
)

# Have both splits return torch tensors when indexed.
for _tokenized_split in (train_tokenized_dataset, test_tokenized_dataset):
    _tokenized_split.set_format("torch")

 94%|█████████▍| 16/17 [00:02<00:00,  7.43ba/s]
 80%|████████  | 4/5 [00:00<00:00,  8.36ba/s]


In [6]:
# Load pretrained DistilBERT with a sequence-classification head sized to the
# verdict label set. The label maps are stored in the model config.
# The head size is taken from `num_labels` (derived from label2id) rather than
# a hard-coded 6, so it cannot disagree with the label maps.
# NOTE(review): the one-hot labels built in preprocessing mark exactly one
# verdict per statement; confirm that "multi_label_classification" (rather
# than single-label) is really the intended problem type.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    problem_type="multi_label_classification",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

Downloading: 100%|██████████| 268M/268M [02:48<00:00, 1.59MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect 

In [8]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    # Directory for checkpoints (plain string: the original f-string had no placeholders).
    output_dir="distilbert_finetuning-factchecking",
    # Evaluate and checkpoint on the same schedule (once per epoch) so the
    # best checkpoint can be selected and reloaded when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=15,
    num_train_epochs=5,
    load_best_model_at_end=True,
    # 'f1' is the key of the micro-averaged F1 in the dict returned by compute_metrics.
    metric_for_best_model='f1',
)

def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, ROC AUC and accuracy for multi-label logits.

    Args:
        predictions: Raw logits, array-like of shape (batch_size, num_labels).
        labels: Ground-truth indicator matrix with the same shape as
            ``predictions``.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with keys ``'f1'``, ``'roc_auc'`` and ``'accuracy'``.
    """
    # An element-wise sigmoid turns each logit into an independent per-label probability.
    logits = torch.as_tensor(predictions, dtype=torch.float32)
    probs = torch.sigmoid(logits).detach().cpu().numpy()

    # Binarise the probabilities for the metrics that need hard decisions.
    y_pred = (probs >= threshold).astype(int)
    y_true = labels

    return {
        'f1': f1_score(y_true=y_true, y_pred=y_pred, average='micro'),
        # ROC AUC is a ranking metric, so it is given the continuous
        # probabilities; thresholded 0/1 values would collapse the curve
        # to a single operating point.
        'roc_auc': roc_auc_score(y_true, probs, average='micro'),
        'accuracy': accuracy_score(y_true, y_pred),
    }

def compute_metrics(p: EvalPrediction):
    """Metric callback for the Trainer.

    Takes ``p.predictions`` (or its first element when it is a tuple) together
    with ``p.label_ids`` and returns the dict from ``multi_label_metrics``.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        # Only the first element of a tuple output is scored.
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

# Bundle the model, tokenizer, tokenized splits, training configuration and
# metric callback into a Trainer. The test split doubles as the evaluation set.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
)

In [9]:
# Start fine-tuning with the configuration held by `trainer`.
trainer.train()

***** Running training *****
  Num examples = 16921
  Num Epochs = 5
  Instantaneous batch size per device = 15
  Total train batch size (w. parallel, distributed & accumulation) = 15
  Gradient Accumulation steps = 1
  Total optimization steps = 5645
  0%|          | 0/5645 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  1%|          | 47/5645 [04:15<8:24:28,  5.41s/it]