# Fine-Tuning and Evaluating a DistilBERT Model for Multi-Class Classification


Import Necessary Libraries

In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report


 Load and Combine Datasets

In [2]:

# Input CSV files, resolved relative to the current working directory.
datasets = ['pitsA.csv', 'pitsB.csv', 'pitsC.csv', 'pitsD.csv', 'pitsE.csv', 'pitsF.csv']

# Read every file into its own DataFrame. A comprehension (rather than an
# append loop) leaves no per-file temporary behind in the kernel namespace
# for a later cell to pick up by accident.
dfs = [pd.read_csv(dataset) for dataset in datasets]

# Stack all per-file frames into one DataFrame with a fresh 0..n-1 index.
df_combined = pd.concat(dfs, ignore_index=True)


Data Preprocessing

In [3]:
# Map raw Severity values to contiguous class ids (for multi-class classification).
severity_map = {2: 0, 3: 1, 4: 2, 5: 3}

# Build the cleaned frame in one chain. Every step returns a new DataFrame, so
# no column is ever written into a slice of another frame (which is what
# triggered SettingWithCopyWarning).
# NOTE: this cell rewrites 'Severity' in place of the raw values, so re-run the
# load cell before re-running this one.
df_combined = (
    df_combined
    # Rows missing any field the model needs are unusable.
    .dropna(subset=['Severity', 'Subject', 'Description'])
    .assign(Severity=lambda frame: frame['Severity'].map(severity_map))
    # .map() yields NaN for any severity that is not a key of severity_map;
    # drop those rows instead of passing NaN labels to the split and the model.
    .dropna(subset=['Severity'])
    .assign(
        # Restore an integer dtype (NaN forces float) so labels are valid class ids.
        Severity=lambda frame: frame['Severity'].astype(int),
        # Combine Subject and Description into a single text column for the model input.
        text=lambda frame: frame['Subject'] + " " + frame['Description'],
    )
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined['Severity'] = df_combined['Severity'].map(severity_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined['text'] = df_combined['Subject'] + " " + df_combined['Description']


Split Data into Train and Test Sets

In [4]:
# Hold out 20% of the rows as the test set. The split is stratified on the
# Severity label and uses a fixed random_state so it is repeatable.
severity_labels = df_combined['Severity']
train_data, test_data = train_test_split(
    df_combined,
    test_size=0.2,
    random_state=42,
    stratify=severity_labels,
)


Model and Tokenizer Setup

In [5]:
# Token budget per example; used as max_length when tokenizing.
max_length = 128

# Pretrained checkpoint name; the tokenizer is loaded from it here.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(data):
    """Tokenize the ``text`` column of ``data`` with the notebook-level tokenizer.

    Truncation is enabled, padding is set to ``'max_length'`` with the
    notebook-level ``max_length``, and PyTorch tensors are requested.

    Args:
        data: Object whose ``data['text']`` supports ``.tolist()`` (the
            train/test DataFrames in this notebook).

    Returns:
        The value returned by calling ``tokenizer`` on the list of texts.
    """
    texts = data['text'].tolist()
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt",
    )
    return tokenizer(texts, **tokenizer_options)

# Encode both splits through the same tokenization helper.
train_encodings, test_encodings = [
    tokenize_data(split) for split in (train_data, test_data)
]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Define Custom Dataset Class

In [6]:
# Custom Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with per-example labels.

    ``encodings`` must provide ``.items()`` yielding ``(field, values)`` pairs
    where ``values`` is indexable by example position; ``labels`` must be an
    indexable, sized sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of encoding fields plus ``labels``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # The label is wrapped as an int64 (torch.long) tensor.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


 Prepare Training and Testing Datasets

In [7]:
# Pull the Severity labels out of each split as plain Python lists, then wrap
# them together with the matching encodings.
train_labels = train_data['Severity'].tolist()
test_labels = test_data['Severity'].tolist()

train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)


Define Metric Computation

In [8]:
# Compute metrics function
def compute_metrics(pred):
    """Compute accuracy and weighted F1, and print a per-class report.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (scores; the argmax over the last axis is taken
            as the predicted class id).

    Returns:
        dict with keys ``'accuracy'`` and ``'f1'`` (weighted-average F1).
    """
    # Class ids 0..3 correspond to original severities 2..5 (see severity_map).
    class_ids = [0, 1, 2, 3]
    class_names = ['2', '3', '4', '5']

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    eval_accuracy = accuracy_score(labels, preds)
    # zero_division=0 keeps the same score as the default but without the
    # warning emitted when a class is never predicted.
    eval_f1 = f1_score(labels, preds, average='weighted', zero_division=0)
    print("\nClassification Report:")
    # Passing labels= pins the report to all four classes; without it the
    # report raises ValueError when a class is absent from both labels and
    # predictions, because target_names would no longer match.
    print(classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,
    ))

    return {
        'accuracy': eval_accuracy,
        'f1': eval_f1
    }


Configure Training Arguments

In [13]:
# Training arguments, grouped by purpose (keyword order does not matter).
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # Optimisation hyper-parameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch; at the end reload the checkpoint
    # with the highest accuracy.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)




Model Initialization and Trainer Setup

In [14]:
# Initialize model for sequence classification with one output per Severity class.
num_severity_classes = 4
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_severity_classes,
)

# NOTE(review): eval_dataset is the held-out test split, and training_args
# reloads the best checkpoint by accuracy on it, so the reported test scores
# are also the model-selection scores. Consider a separate validation split.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train the Model

In [15]:
# Train the model. The call is the cell's last expression, so its return
# value (a TrainOutput summary) is displayed after the per-epoch reports.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9015,0.696388,0.731677,0.713569
2,0.6269,0.868659,0.745342,0.721898
3,0.5308,0.890786,0.765217,0.750962



Classification Report:
              precision    recall  f1-score   support

           2       0.85      0.66      0.74        76
           3       0.78      0.85      0.81       438
           4       0.63      0.66      0.64       254
           5       0.00      0.00      0.00        37

    accuracy                           0.73       805
   macro avg       0.56      0.54      0.55       805
weighted avg       0.70      0.73      0.71       805



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
              precision    recall  f1-score   support

           2       0.93      0.66      0.77        76
           3       0.73      0.95      0.82       438
           4       0.75      0.52      0.62       254
           5       0.43      0.08      0.14        37

    accuracy                           0.75       805
   macro avg       0.71      0.55      0.59       805
weighted avg       0.74      0.75      0.72       805


Classification Report:
              precision    recall  f1-score   support

           2       0.88      0.67      0.76        76
           3       0.78      0.90      0.84       438
           4       0.71      0.66      0.68       254
           5       0.44      0.11      0.17        37

    accuracy                           0.77       805
   macro avg       0.70      0.58      0.61       805
weighted avg       0.75      0.77      0.75       805



TrainOutput(global_step=2415, training_loss=0.644196327478002, metrics={'train_runtime': 183.7857, 'train_samples_per_second': 52.545, 'train_steps_per_second': 13.14, 'total_flos': 319820823899136.0, 'train_loss': 0.644196327478002, 'epoch': 3.0})

 Evaluate the Model on Test Data

In [None]:
# Evaluate the model on test data.
# NOTE(review): test_dataset is the same dataset the Trainer was given as
# eval_dataset, so these numbers presumably repeat the per-epoch evaluation
# of whichever checkpoint is currently loaded. Confirm before reporting.
# The header is printed first so it precedes any output produced during evaluation.
print("\nFinal Evaluation on Test Set:")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", test_results)


Make Predictions on Test Data

In [16]:
# Making predictions on the test set
predictions = trainer.predict(test_dataset)
# The predicted class id is the argmax over the last axis of the model scores.
pred_labels = predictions.predictions.argmax(-1)

# Output predicted severity for each test example.
# assign() returns a new frame instead of writing a column into the frame
# produced by train_test_split, which avoids SettingWithCopyWarning and keeps
# this cell safe to re-run. The array is attached positionally, row for row.
# Both 'Severity' and 'Predicted_Severity' hold encoded class ids (0-3), not
# the original 2-5 severity values.
test_data = test_data.assign(Predicted_Severity=pred_labels)
print(test_data[['Subject', 'Description', 'Severity', 'Predicted_Severity']].head())



Classification Report:
              precision    recall  f1-score   support

           2       0.88      0.67      0.76        76
           3       0.78      0.90      0.84       438
           4       0.71      0.66      0.68       254
           5       0.44      0.11      0.17        37

    accuracy                           0.77       805
   macro avg       0.70      0.58      0.61       805
weighted avg       0.75      0.77      0.75       805

                                                Subject  \
34          Uplink Build 5.1 Code: Unusual pointer cast   
708   ProjectA L3 SC&FSRD Rqt: L3-SFS-825 is missing...   
4561                        PB.FSW-15 Trace Too General   
341   EngCntrl Test Scripts: EngCntrl SRS requiremen...   
3818  Inst4 SRS Section 5.3.6 Is Does Not Address th...   

                                            Description  Severity  \
34    In scstate.c there is an unusual pointer cast ...         1   
708   L3-SFS-825 addresses calling onboard block