In [39]:
!pip install -U transformers datasets scikit-learn --quiet



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:

import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
from collections import Counter

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch

# Load the raw question and tag tables; both CSVs are decoded as ISO-8859-1.
csv_options = {"encoding": "ISO-8859-1"}
questions = pd.read_csv("../data/Questions.csv", **csv_options)
tags = pd.read_csv("../data/Tags.csv", **csv_options)

# Collapse the tag table to one row per question Id holding the list of its tags.
tags_per_question = tags.groupby("Id")["Tag"].apply(list)
tag_groups = tags_per_question.reset_index()

# Inner join on Id: only questions that have at least one tag row are kept.
df = pd.merge(questions, tag_groups, on="Id", how="inner")

# Sample at most 5000 rows (fixed seed) and build the model inputs and targets.
# NOTE(review): `df_multi` and `mlb` are not defined anywhere in the visible
# notebook, and no "text" column is created on `df` in this cell. They appear to
# rely on kernel state left over from another (possibly deleted) cell, so a
# Restart & Run All would fail here with NameError. Presumably `mlb` is a
# MultiLabelBinarizer and `df_multi` is a cleaned/filtered version of `df` with
# a "text" column -- TODO confirm and define both explicitly before this point.
sampled_df = df_multi.sample(n=min(5000, len(df_multi)), random_state=42)
# Discard the sampled row labels so X gets a fresh 0..n-1 index.
X = sampled_df["text"].reset_index(drop=True)
# Fit on the sampled tag lists only; presumably yields one indicator column per
# tag seen in the sample (verify against how `mlb` is constructed).
y = mlb.fit_transform(sampled_df["Tag"])


# Hold out 20% of the sampled rows for validation (seeded so the split is repeatable).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Tokenize both splits with identical settings. Each split is tokenized in a
# single call, so padding=True pads every example to the longest sequence in
# that split; truncation caps sequences at 512 tokens.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
tokenizer_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(X_train.tolist(), **tokenizer_options)
val_encodings = tokenizer(X_val.tolist(), **tokenizer_options)


class MultiLabelDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with multi-label targets.

    The label matrix is converted once, up front, to a float32 tensor. Encoding
    fields are converted to tensors lazily, one example at a time.

    Args:
        encodings: Mapping from field name to a per-example indexable sequence.
        labels: Array-like of label rows, one row per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        # The number of examples is the number of label rows.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested example ...
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        # ... plus that example's label row under the 'labels' key.
        item['labels'] = self.labels[idx]
        return item

# Wrap each split's encodings and label matrix in a torch dataset.
train_dataset = MultiLabelDataset(encodings=train_encodings, labels=y_train)
val_dataset = MultiLabelDataset(encodings=val_encodings, labels=y_val)

# Size the classification head to the label matrix: one output per column of y.
num_tags = y.shape[1]
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    problem_type="multi_label_classification",
    num_labels=num_tags,
)


def compute_metrics(pred):
    """Compute multi-label F1 scores from raw model logits.

    Args:
        pred: A ``(logits, labels)`` pair, unpacked positionally. ``logits`` is
            converted with ``torch.tensor`` and ``labels`` must support
            ``.astype(int)``.

    Returns:
        dict with keys ``'micro_f1'``, ``'macro_f1'`` and ``'samples_f1'``.
    """
    logits, labels = pred
    # Independent sigmoid per label; a label counts as predicted when its
    # probability exceeds 0.5.
    probs = torch.sigmoid(torch.tensor(logits))
    y_pred = (probs > 0.5).int().numpy()
    y_true = labels.astype(int)

    def _f1(average):
        # zero_division=0 scores an undefined F1 (no true and no predicted
        # positives) as 0.0 -- the same value as the default -- without
        # emitting UndefinedMetricWarning on every evaluation.
        return f1_score(y_true, y_pred, average=average, zero_division=0)

    return {
        'micro_f1': _f1('micro'),
        'macro_f1': _f1('macro'),
        'samples_f1': _f1('samples'),
    }


# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the highest validation micro-F1 when training finishes.
args = TrainingArguments(
    output_dir="./distilbert_model",
    # Optimisation
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation and checkpoint cadence (kept identical so the best checkpoint can be reloaded)
    eval_strategy="epoch",
    save_strategy="epoch",
    # Best-model selection; "micro_f1" is a key returned by compute_metrics
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    greater_is_better=True,
)


# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated (it raises a FutureWarning) and is slated for removal
# in transformers v5, which an unpinned `-U` install may pull in.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then score the validation split. evaluate() is left as the last
# expression so the metrics dict is shown as the cell output.
trainer.train()
trainer.evaluate()



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Micro F1,Macro F1,Samples F1
1,0.1809,0.0986,0.0,0.0,0.0
2,0.0999,0.097585,0.0,0.0,0.0
3,0.0923,0.083759,0.162018,0.020004,0.113567
4,0.0804,0.07593,0.290098,0.043942,0.2222
5,0.0744,0.073689,0.325613,0.052493,0.256533


{'eval_loss': 0.07368932664394379,
 'eval_micro_f1': 0.32561307901907355,
 'eval_macro_f1': 0.05249324561986624,
 'eval_samples_f1': 0.2565333333333333,
 'eval_runtime': 785.7256,
 'eval_samples_per_second': 1.273,
 'eval_steps_per_second': 0.159,
 'epoch': 5.0}