# Fine-Tuning a Transformer Model on Azure Databricks

## Overview
This notebook guides you through the process of preparing data and executing the fine-tuning of a transformer model using Azure Databricks.

## Objectives
- Set up the environment.
- Load the datasets.
- Configure and train a transformer model.
- Evaluate model performance and save the results.


## Author
- Name: Alessandro Armillotta
- Date: 09/10/2025

# Steps
1. Data loading and preprocessing.
2. Model configuration and fine-tuning.
3. Model evaluation and saving.
4. Loading the logged model.

In [0]:
# Notebook parameter: the Hugging Face checkpoint to fine-tune. It is exposed
# as a text widget so the value can be changed without editing the code.
default_base_model = "google-bert/bert-base-uncased"
dbutils.widgets.text("base_model", default_base_model)
base_model = dbutils.widgets.get("base_model")
base_model  # bare last expression so the cell echoes the selected checkpoint

## Step 0: Set Up the Environment

In [0]:
# Standard library
import warnings

# Third-party
import datasets
import evaluate
import mlflow
import numpy as np
import torch
import transformers
from pyspark.sql import functions as F
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Point MLflow at the Databricks tracking server and the "databricks-uc" registry.
mlflow.set_registry_uri("databricks-uc")
mlflow.set_tracking_uri("databricks")

# Keep cell output readable: ignore Python warnings and restrict transformers
# logging to errors. Both run after every import above, as before.
warnings.filterwarnings("ignore")
transformers.logging.set_verbosity_error()

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# True only on CPU; passed to TrainingArguments(no_cuda=...) in the training configuration.
no_cuda = device.type == "cpu"

print(device.type)

In [0]:
# MLflow experiment that the fine-tuning runs are recorded under.
experiment_name = '/fine_tuning_transformer_model'

# Every scratch location below lives under this single /Volumes directory.
tmp_root = "/Volumes/main/fine_tuning_transformer_model/tmp"

# Cache directories handed to datasets.Dataset.from_spark.
train_cache_dir = f"{tmp_root}/train"
val_cache_dir = f"{tmp_root}/val"

# Trainer output, the saved fine-tuned model and the saved inference pipeline.
training_output_dir = f"{tmp_root}/trainer"
model_output_dir = f"{tmp_root}/output_model"
pipeline_output_dir = f"{tmp_root}/pipeline"

# Artifact path the model is logged under inside the MLflow run.
model_artifact_path = "classification"

In [0]:
# Resolve the MLflow experiment used for tracking, creating it on first use,
# and make it the active experiment.
#
# `experiment` must end up bound to a real Experiment object in both branches:
# the training cell reads `experiment.experiment_id`, and the lookup by name
# returns None when the experiment does not exist yet.
try:
    experiment = mlflow.get_experiment_by_name(experiment_name)

    if experiment is None:
        experiment_id = mlflow.create_experiment(
            name=experiment_name,
            tags={'exp_name': experiment_name}
        )
        # Re-fetch so `experiment` is not left as None after creation.
        experiment = mlflow.get_experiment(experiment_id)
        mlflow.set_experiment(experiment_id=experiment_id)
        print(f"Experiment {experiment_name} created.")
    else:
        mlflow.set_experiment(experiment_id=experiment.experiment_id)
        print(f"Experiment {experiment_name} already exists.")
except Exception as e:
    print(f"An error occurred: {e}")
    # Later cells cannot work without a valid experiment, so stop here rather
    # than continuing with `experiment` unset or None.
    raise

## Step 1: Prepare Datasets

In [0]:
def load_split(table_name):
    """Read a table into a Spark DataFrame and print its row count.

    Args:
        table_name: Fully qualified name of the table to read.

    Returns:
        The Spark DataFrame for the table.
    """
    split_df = spark.read.table(table_name)
    print(split_df.count())
    return split_df


# Load the Delta tables holding the training and validation splits.
train_df = load_split("main.fine_tuning_transformer_model.train_data")
val_df = load_split("main.fine_tuning_transformer_model.val_data")

In [0]:
# Collect the label lookup table to the driver as a list of Rows.
labels = spark.read.table("main.fine_tuning_transformer_model.labels").collect()

# Class ids are the row positions in the collected list.
# NOTE(review): no ordering is applied before collect(), so these positions are
# assumed to line up with the `label_id` values in the train/val tables — confirm.
id2label = dict(enumerate(row.label for row in labels))
label2id = {name: idx for idx, name in id2label.items()}

In [0]:
# Keep only the model inputs and expose `label_id` under the column name `label`.
train_spark = train_df.select("text", F.col("label_id").alias("label"))
val_spark = val_df.select("text", F.col("label_id").alias("label"))

# Convert the Spark DataFrames to Hugging Face datasets, using the configured
# cache directories.
train_dataset = datasets.Dataset.from_spark(train_spark, cache_dir=train_cache_dir)
val_dataset = datasets.Dataset.from_spark(val_spark, cache_dir=val_cache_dir)

## Step 2: Training Configuration

In [0]:
# Load the tokenizer that matches the base model; transformer models expect tokenized input.
tokenizer = AutoTokenizer.from_pretrained(base_model)

def tokenize_function(examples):
    """Tokenize one batch of examples for sequence classification.

    Args:
        examples: Batch supplied by `Dataset.map(..., batched=True)`; its
            "text" entry is tokenized.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per text and left unpadded.
    """
    # No padding here: the notebook's DataCollatorWithPadding pads each batch
    # to its own longest sequence, so padding every row to 512 tokens up front
    # only inflates memory and compute. `return_tensors` is also omitted
    # because Dataset.map stores the returned columns itself.
    return tokenizer(examples["text"], truncation=True, max_length=512)


train_tokenized = train_dataset.map(tokenize_function, batched=True)
val_tokenized   = val_dataset.map(tokenize_function, batched=True)

In [0]:
# Load the base checkpoint with a classification head sized to the label set,
# passing both label mappings along with it.
label_config = {
    "num_labels": len(label2id),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(base_model, **label_config)

Training arguments are the set of hyperparameters that control how a model is trained. They are passed to the `TrainingArguments` class and used by the `Trainer` API to manage the training loop ([Hugging Face documentation](https://huggingface.co/docs/transformers/main_classes/trainer)).

Think of training arguments as the recipe for model training: each parameter influences how the model learns, how fast it converges, and how well it generalizes.

In [0]:
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and `no_cuda` to `use_cpu` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=training_output_dir,
    # Batch sizes and schedule length.
    per_device_train_batch_size=50,
    per_device_eval_batch_size=50,
    num_train_epochs=5,
    # Log at every step.
    logging_strategy="steps",
    logging_steps=1,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the highest evaluation accuracy when training ends.
    evaluation_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    # Send training logs to MLflow.
    report_to="mlflow",
    # Follows the device detected at setup time.
    no_cuda=no_cuda,
)

In [0]:
# Collator that pads the examples of each batch using the model's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [0]:
# Accuracy metric shared by every evaluation pass.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the loaded accuracy metric.

    Args:
        eval_pred: Pair that unpacks into (logits, reference labels).

    Returns:
        The result of `metric.compute` for the arg-max class predictions.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

In [0]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into a single Trainer.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

## Step 3: Training and MLflow Tracking

In [0]:
# Fine-tune inside an MLflow run, save the model, then build, save and log an
# inference pipeline from it.
try:
    # Look the experiment up by name rather than relying on the `experiment`
    # variable from the setup cell, which can be None right after the
    # experiment has been created.
    tracking_experiment = mlflow.get_experiment_by_name(experiment_name)
    if tracking_experiment is None:
        raise RuntimeError(
            f"MLflow experiment {experiment_name} was not found; run the experiment setup cell first."
        )

    with mlflow.start_run(experiment_id=tracking_experiment.experiment_id, run_name=f"train_{base_model}") as run:
        trainer.train()
        trainer.save_model(model_output_dir)

        # Reload the saved weights and pair them with the tokenizer in a
        # text-classification pipeline.
        pipe = pipeline(
            "text-classification",
            model=AutoModelForSequenceClassification.from_pretrained(model_output_dir),
            batch_size=1,
            tokenizer=tokenizer,
        )
        pipe.save_pretrained(pipeline_output_dir)

        model_info = mlflow.transformers.log_model(
            transformers_model=pipe,
            artifact_path=model_artifact_path,
            input_example="Hi there!")

except Exception as e:
    print(f"An error occurred: {e}")
    mlflow.end_run()
    # Stop the notebook here: the next step needs `run` and the logged model,
    # so continuing after a failure only produces a less helpful error later.
    raise



## Step 4: Load Logged Model

In [0]:
# URI of the model logged by the training run.
logged_model = f"runs:/{run.info.run_id}/{model_artifact_path}"

# Wrap the logged model as a Spark UDF whose result type is string.
classification_class = mlflow.pyfunc.spark_udf(spark, model_uri=logged_model, result_type='string')

# Score 50 validation rows, keeping the text and true label next to the prediction.
sample_df = val_df.limit(50)
test = sample_df.select(
    "text",
    "label",
    classification_class(F.col("text")).alias("prediction"),
)
display(test)


In [0]:
# Persist the scored sample. Overwrite mode keeps the notebook re-runnable:
# with the default save mode this line fails once the table exists from a
# previous run.
test.write.mode("overwrite").saveAsTable("main.fine_tuning_transformer_model.test")

## Step 5: Register Model