# Fine-Tuning Transformers for Report View Categorization


<a href="https://colab.research.google.com/github/cbadenes/semantic-report-search/blob/main/data/analysis/34_fine_tuning.ipynb" target="_parent">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab"/>
</a>

Install Required Libraries

In [1]:
!pip install -q transformers datasets evaluate accelerate

import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Load and Prepare the Report Inventory

In [7]:
df = pd.read_excel("Reporting_Inventory.xlsx", sheet_name="Views")
df = df[['Description', 'Category']].dropna()
df = df[df['Category'].str.strip() != '']  # Filter empty strings

df['Category'] = df['Category'].astype('category')
label2id = {cat: i for i, cat in enumerate(df['Category'].cat.categories)}
id2label = {i: cat for cat, i in label2id.items()}
df['label'] = df['Category'].map(label2id)
df.head(10)


Unnamed: 0,Description,Category,label
0,Methodolody and definition of the algorithim o...,Informative,3
1,View focused on understand the performance by ...,Functional,1
2,Global view to understand Feeder Market Perfor...,Executive,0
3,View focused on understanding the booking beha...,Functional,1
4,Detail view of Feeder Markets by Destination i...,Functional,1
5,VIew focused on understanding the feeder marke...,Functional,1
6,Index page with interactive buttons to other v...,Index,2
7,Benchmark by Destination. Outside information ...,Functional,1
8,View that provides performance vs budget at a ...,Functional,1
9,Methodolody and definition of the algorithim o...,Informative,3


Training Configuration

In [9]:
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_fn(example):
    return tokenizer(example["Description"], truncation=True, padding="max_length", max_length=128)

dataset = Dataset.from_pandas(df[['Description', 'label']].astype({'label': int}))
dataset = dataset.train_test_split(test_size=0.2)
tokenized_datasets = dataset.map(tokenize_fn, batched=True)


Map:   0%|          | 0/446 [00:00<?, ? examples/s]

Map:   0%|          | 0/112 [00:00<?, ? examples/s]

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


- output_dir
Directory where model checkpoints and training artifacts will be saved.

- learning_rate
The initial learning rate for the optimizer — typically a small value like 2e-5 for fine-tuning.

- per_device_train_batch_size
Batch size per device (GPU or CPU) during training.

- per_device_eval_batch_size
Batch size per device for evaluation.

- num_train_epochs
Number of complete passes over the training dataset.


In [17]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    report_to="none"
)



Metrics and trainer support

In [18]:
import os
os.environ["WANDB_DISABLED"] = "true"

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[]
)


  trainer = Trainer(


Train the model

In [19]:


trainer.train()


Step,Training Loss


TrainOutput(global_step=168, training_loss=0.35441412244524273, metrics={'train_runtime': 20.7939, 'train_samples_per_second': 64.346, 'train_steps_per_second': 8.079, 'total_flos': 44314295910912.0, 'train_loss': 0.35441412244524273, 'epoch': 3.0})

Inference

In [21]:
def predict_category(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Move inputs to the same device as the model
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class_id = logits.argmax().item()
    return id2label[predicted_class_id]


predict_category("Report analyzing distribution channel performance across regions")


'Functional'