# Fine-Tuning and Evaluating a T5 (Text-to-Text Transfer Transformer) Model on Augmented Data for Multi-Class Classification

# Setup and Library Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, f1_score, classification_report
from nltk.corpus import wordnet
import random
import nltk

# Download WordNet if not already downloaded
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

# Data Loading

In [2]:
datasets = [
    'pitsA.csv',
    'pitsB.csv',
    'pitsC.csv',
    'pitsD.csv',
    'pitsE.csv',
    'pitsF.csv'
]

dfs = [pd.read_csv(dataset) for dataset in datasets]
df_combined = pd.concat(dfs, ignore_index=True)

df_combined = df_combined.dropna(subset=['Severity', 'Subject', 'Description'])
df_combined['Severity'] = df_combined['Severity'].astype(str)


 Data Preparation

In [3]:
df_combined['text'] = df_combined['Subject'] + " " + df_combined['Description']

train_data, test_data = train_test_split(
    df_combined, test_size=0.2, random_state=42, stratify=df_combined['Severity']
)

print(f"Total examples in the training set: {len(train_data)}")
print(f"Total examples in the testing set: {len(test_data)}")

# Class distribution in training set
train_class_counts = train_data['Severity'].value_counts()
print("\nClass distribution in the training set:")
print(train_class_counts)

# Class distribution in testing set
test_class_counts = test_data['Severity'].value_counts()
print("\nClass distribution in the testing set:")
print(test_class_counts)

num_classes = df_combined['Severity'].nunique()
print(f"\nTotal number of classes: {num_classes}")


Total examples in the training set: 3219
Total examples in the testing set: 805

Class distribution in the training set:
Severity
3.0    1753
4.0    1013
2.0     306
5.0     147
Name: count, dtype: int64

Class distribution in the testing set:
Severity
3.0    438
4.0    254
2.0     76
5.0     37
Name: count, dtype: int64

Total number of classes: 4


# Data Augmentation

In [4]:
def synonym_replacement(text, num_replacements=1):
    words = text.split()
    new_words = words.copy()

    for _ in range(num_replacements):
        word_idx = random.randint(0, len(words) - 1)
        word = words[word_idx]
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            if synonym != word:
                new_words[word_idx] = synonym

    return ' '.join(new_words)

def augment_class_data(df, class_label, target_size):
    class_data = df[df['Severity'] == class_label]
    current_size = len(class_data)

    if current_size >= target_size:
        return class_data
    else:
        augmented_data = class_data.copy()
        while len(augmented_data) < target_size:
            new_samples = class_data.copy()
            new_samples['text'] = new_samples['text'].apply(
                lambda x: synonym_replacement(x, num_replacements=1)
            )
            augmented_data = pd.concat([augmented_data, new_samples], ignore_index=True)

        augmented_data = augmented_data.sample(n=target_size, random_state=42).reset_index(drop=True)
        return augmented_data


# Balancing the Training Set

In [11]:
# Target size for each class
target_class_size = 3500

# Balance and augment the training set
balanced_train_df = pd.DataFrame()
for class_label in train_data['Severity'].unique():
    augmented_class_data = augment_class_data(train_data, class_label, target_class_size)
    balanced_train_df = pd.concat([balanced_train_df, augmented_class_data], ignore_index=True)

# Verify the new class distribution
print("\nBalanced Training Set Size:", len(balanced_train_df))
print("\nClass Distribution in Balanced Training Set:")
print(balanced_train_df['Severity'].value_counts())
print("\nTesting dataset size:", len(test_data))



Balanced Training Set Size: 14000

Class Distribution in Balanced Training Set:
Severity
3.0    3500
4.0    3500
2.0    3500
5.0    3500
Name: count, dtype: int64

Testing dataset size: 805


# Tokenization

In [12]:
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

def preprocess_data(data, tokenizer, max_length=128):
    inputs = data['text'].apply(lambda x: f"Classify: {x}")
    labels = data['Severity']
    encodings = tokenizer(
        inputs.tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors="pt"
    )
    label_encodings = tokenizer(
        labels.tolist(),
        truncation=True,
        padding='max_length',
        max_length=2,
        return_tensors="pt"
    )
    encodings["labels"] = label_encodings["input_ids"]
    return encodings

train_encodings = preprocess_data(balanced_train_df, tokenizer)
test_encodings = preprocess_data(test_data, tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Dataset Creation

In [13]:
class T5Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = T5Dataset(train_encodings)
test_dataset = T5Dataset(test_encodings)


# Metric Calculation

In [14]:
def compute_metrics(pred):
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = logits.argmax(-1)
    decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in preds]
    decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in pred.label_ids]

    accuracy = accuracy_score(decoded_labels, decoded_preds)
    f1 = f1_score(decoded_labels, decoded_preds, average='weighted')

    print("\nClassification Report:")
    print(classification_report(decoded_labels, decoded_preds))

    return {
        'accuracy': accuracy,
        'f1': f1
    }


# Training Configuration

In [15]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)




# Model Training

In [16]:
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2587,0.414377,0.704348,0.70361
2,0.1727,0.386728,0.757764,0.75931
3,0.1379,0.439672,0.757764,0.75891



Classification Report:
              precision    recall  f1-score   support

                   0.60      0.86      0.71       291
         2.0       0.67      0.74      0.70        76
          3.       0.85      0.59      0.70       438

    accuracy                           0.70       805
   macro avg       0.71      0.73      0.70       805
weighted avg       0.75      0.70      0.70       805


Classification Report:
              precision    recall  f1-score   support

                   0.69      0.79      0.74       291
         2.0       0.67      0.74      0.70        76
          3.       0.83      0.74      0.78       438

    accuracy                           0.76       805
   macro avg       0.73      0.76      0.74       805
weighted avg       0.77      0.76      0.76       805


Classification Report:
              precision    recall  f1-score   support

                   0.70      0.76      0.73       291
         2.0       0.69      0.74      0.71        76
   

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=5250, training_loss=0.2554216853550502, metrics={'train_runtime': 1995.182, 'train_samples_per_second': 21.051, 'train_steps_per_second': 2.631, 'total_flos': 6394057850880000.0, 'train_loss': 0.2554216853550502, 'epoch': 3.0})

# Evaluation

In [17]:
print("\nFinal Evaluation on Test Set:")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test Results:", test_results)



Final Evaluation on Test Set:



Classification Report:
              precision    recall  f1-score   support

                   0.69      0.79      0.74       291
         2.0       0.67      0.74      0.70        76
          3.       0.83      0.74      0.78       438

    accuracy                           0.76       805
   macro avg       0.73      0.76      0.74       805
weighted avg       0.77      0.76      0.76       805

Test Results: {'eval_loss': 0.38672831654548645, 'eval_accuracy': 0.7577639751552795, 'eval_f1': 0.7593095300893659, 'eval_runtime': 8.2158, 'eval_samples_per_second': 97.982, 'eval_steps_per_second': 12.293, 'epoch': 3.0}


# Predictions and Saving Results

In [18]:
predictions = trainer.predict(test_dataset)
logits = predictions.predictions[0] if isinstance(predictions.predictions, tuple) else predictions.predictions
pred_labels = [tokenizer.decode(pred, skip_special_tokens=True) for pred in logits.argmax(-1)]

test_data['Predicted_Severity'] = pred_labels
test_data[['Subject', 'Description', 'Severity', 'Predicted_Severity']].to_csv('test_predictions.csv', index=False)



Classification Report:
              precision    recall  f1-score   support

                   0.69      0.79      0.74       291
         2.0       0.67      0.74      0.70        76
          3.       0.83      0.74      0.78       438

    accuracy                           0.76       805
   macro avg       0.73      0.76      0.74       805
weighted avg       0.77      0.76      0.76       805

