# Fine-Tuning and Evaluating a DistilBERT Model on Augmented Data for Multi-Class Classification

Importing Necessary Libraries

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
from nltk.corpus import wordnet
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

Loading and Combining Datasets

In [2]:
# Load all datasets (PITS A, B, C, D, E, F)
datasets = ['pitsA.csv', 'pitsB.csv', 'pitsC.csv', 'pitsD.csv', 'pitsE.csv', 'pitsF.csv']
dfs = []

# Read and combine all datasets
for dataset in datasets:
    df = pd.read_csv(dataset)
    dfs.append(df)

# Combine all datasets into a single DataFrame
df_combined = pd.concat(dfs, ignore_index=True)


Handling Missing Values and Encoding Severity Levels

In [3]:
# Check for missing values and handle them
df_combined = df_combined.dropna(subset=['Severity', 'Subject', 'Description'])

# Convert Severity to categorical codes (for multi-class classification)
severity_map = {2: 0, 3: 1, 4: 2, 5: 3}
df_combined['Severity'] = df_combined['Severity'].map(severity_map)

# Combine Subject and Description into a single text column for the model input
df_combined['text'] = df_combined['Subject'] + " " + df_combined['Description']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined['Severity'] = df_combined['Severity'].map(severity_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_combined['text'] = df_combined['Subject'] + " " + df_combined['Description']


Splitting the Data into Training and Test Sets

In [4]:
# Split the dataset into training (80%) and test (20%) sets
train_data, test_data = train_test_split(df_combined, test_size=0.2, random_state=42, stratify=df_combined['Severity'])

# Display the count of each severity level in the combined dataset
severity_counts = df_combined['Severity'].value_counts()
print("Severity Level Counts:")
print(severity_counts)


Severity Level Counts:
Severity
1    2191
2    1267
0     382
3     184
Name: count, dtype: int64


Defining a Synonym Replacement Function for Data Augmentation

In [5]:
# Define Synonym Replacement function
def synonym_replacement(text, num_replacements=1):
    words = text.split()
    new_words = words.copy()

    for _ in range(num_replacements):
        word_idx = np.random.randint(0, len(words))
        word = words[word_idx]

        # Get synonyms using WordNet
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            if synonym != word:
                new_words[word_idx] = synonym

    return ' '.join(new_words)


Augmenting the Dataset to Balance Classes

In [6]:
# Augment the dataset by Synonym Replacement and balance it for each class
def augment_class_data(df, class_label, target_size):
    class_data = df[df['Severity'] == class_label]
    current_size = len(class_data)

    if current_size >= target_size:
        return class_data
    else:
        augmented_data = class_data.copy()

        # Repeat the data until the target size is reached
        while len(augmented_data) < target_size:
            augmented_data = pd.concat([augmented_data, class_data], ignore_index=True)

        # Apply synonym replacement to each row
        augmented_data['text'] = augmented_data['text'].apply(synonym_replacement, num_replacements=1)

        # Shuffle the data
        augmented_data = augmented_data.sample(frac=1, random_state=42).reset_index(drop=True)
        return augmented_data

# Target size for each class (3,500 samples per class to reach approximately 14,000)
target_class_size = 3500
num_classes = 4

# Augment each class separately to ensure a balanced dataset
balanced_train_df = pd.DataFrame()
for class_label in range(num_classes):
    augmented_class_data = augment_class_data(train_data, class_label, target_class_size)
    balanced_train_df = pd.concat([balanced_train_df, augmented_class_data], ignore_index=True)

# Display the new training set size and severity counts after augmentation
print(f"Balanced Training Set Size: {len(balanced_train_df)}")
balanced_severity_counts = balanced_train_df['Severity'].value_counts()
print("Severity Level Counts After Augmentation:")
print(balanced_severity_counts)


Balanced Training Set Size: 14758
Severity Level Counts After Augmentation:
Severity
2    4052
0    3672
3    3528
1    3506
Name: count, dtype: int64


Preparing the Model and Tokenizing the Data

In [7]:
# Model and tokenizer setup
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
max_length = 128

def tokenize_data(data):
    return tokenizer(
        data['text'].tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt"
    )

train_encodings = tokenize_data(balanced_train_df)
test_encodings = tokenize_data(test_data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Creating a Custom Dataset Class

In [8]:
# Custom Dataset Class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Prepare datasets
train_dataset = CustomDataset(train_encodings, balanced_train_df['Severity'].tolist())
test_dataset = CustomDataset(test_encodings, test_data['Severity'].tolist())


Defining Metrics for Model Evaluation

In [9]:
# Compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    eval_accuracy = accuracy_score(labels, preds)
    eval_f1 = f1_score(labels, preds, average='weighted')

    print("\nClassification Report:")
    print(classification_report(labels, preds, target_names=['2', '3', '4', '5']))

    return {
        'accuracy': eval_accuracy,
        'f1': eval_f1
    }


Setting Training Arguments and Training the Model

In [10]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    greater_is_better=True
)

# Initialize model for sequence classification (4 classes based on Severity)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()




model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.279,1.023219,0.767702,0.764683
2,0.0762,1.363989,0.791304,0.789325
3,0.0249,1.483225,0.798758,0.795162



Classification Report:
              precision    recall  f1-score   support

           2       0.69      0.74      0.71        76
           3       0.81      0.83      0.82       438
           4       0.72      0.73      0.73       254
           5       0.70      0.38      0.49        37

    accuracy                           0.77       805
   macro avg       0.73      0.67      0.69       805
weighted avg       0.77      0.77      0.76       805


Classification Report:
              precision    recall  f1-score   support

           2       0.85      0.74      0.79        76
           3       0.84      0.84      0.84       438
           4       0.71      0.79      0.75       254
           5       0.71      0.41      0.52        37

    accuracy                           0.79       805
   macro avg       0.78      0.69      0.72       805
weighted avg       0.79      0.79      0.79       805


Classification Report:
              precision    recall  f1-score   support

   

TrainOutput(global_step=5535, training_loss=0.1898233834751684, metrics={'train_runtime': 608.6613, 'train_samples_per_second': 72.74, 'train_steps_per_second': 9.094, 'total_flos': 1466267697764352.0, 'train_loss': 0.1898233834751684, 'epoch': 3.0})

Evaluating the Model and Making Predictions

In [11]:
# Evaluate the model on test data
print("\nFinal Evaluation on Test Set:")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", test_results)

# Make predictions on the test set
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)

# Output predicted severity for each test example
test_data['Predicted_Severity'] = pred_labels
print(test_data[['Subject', 'Description', 'Severity', 'Predicted_Severity']].head())



Final Evaluation on Test Set:



Classification Report:
              precision    recall  f1-score   support

           2       0.82      0.74      0.78        76
           3       0.83      0.86      0.84       438
           4       0.74      0.78      0.76       254
           5       0.74      0.38      0.50        37

    accuracy                           0.80       805
   macro avg       0.78      0.69      0.72       805
weighted avg       0.80      0.80      0.80       805

Test Results: {'eval_loss': 1.483224630355835, 'eval_accuracy': 0.7987577639751553, 'eval_f1': 0.7951624939058968, 'eval_runtime': 2.962, 'eval_samples_per_second': 271.775, 'eval_steps_per_second': 34.098, 'epoch': 3.0}

Classification Report:
              precision    recall  f1-score   support

           2       0.82      0.74      0.78        76
           3       0.83      0.86      0.84       438
           4       0.74      0.78      0.76       254
           5       0.74      0.38      0.50        37

    accuracy            