# DistilBERT News Classification

IT20012892 Ahamed M.S.A

In [1]:
!git clone https://github.com/aneeq-shaffy/SE4050-Deep-Learning.git

Cloning into 'SE4050-Deep-Learning'...
remote: Enumerating objects: 143, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (23/23), done.[K
remote: Total 143 (delta 8), reused 17 (delta 4), pack-reused 116 (from 1)[K
Receiving objects: 100% (143/143), 195.56 MiB | 18.82 MiB/s, done.
Resolving deltas: 100% (44/44), done.
Updating files: 100% (24/24), done.
Encountered 8 file(s) that should have been pointers, but weren't:
	Dataset/encoders/label_encoder.pkl
	Dataset/processed/data_splits.pkl
	Dataset/processed/news_preprocessed.csv
	Dataset/reports/category_analysis.png
	Dataset/reports/class_balancing.png
	Dataset/reports/eda_comprehensive.png
	Dataset/reports/quality_report.png
	Dataset/reports/wordclouds_by_category.png


Install Required Libraries

In [2]:
!pip install transformers datasets torch accelerate -q

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pickle

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


Load and Prepare Data

In [3]:
# Load your data
df = pd.read_csv('/content/SE4050-Deep-Learning/Dataset/processed/news_preprocessed.csv')

print(f"Dataset shape: {df.shape}")
print(f"Categories: {df['category'].nunique()}")
print(f"\nLabel distribution:\n{df['label'].value_counts().sort_index()}")

# Load pre-split data
with open('/content/SE4050-Deep-Learning/Dataset/processed/data_splits.pkl', 'rb') as f:
    data_splits = pickle.load(f)

print(f"\nKeys in data_splits: {data_splits.keys()}")

train_texts = data_splits['X_train']
val_texts = data_splits['X_val']
test_texts = data_splits['X_test']
train_labels = data_splits['y_train']
val_labels = data_splits['y_val']
test_labels = data_splits['y_test']

print(f"\nTrain: {len(train_texts)}")
print(f"Validation: {len(val_texts)}")
print(f"Test: {len(test_texts)}")

Dataset shape: (30000, 3)
Categories: 15

Label distribution:
label
0     2000
1     2000
2     2000
3     2000
4     2000
5     2000
6     2000
7     2000
8     2000
9     2000
10    2000
11    2000
12    2000
13    2000
14    2000
Name: count, dtype: int64

Keys in data_splits: dict_keys(['X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test'])

Train: 21000
Validation: 4500
Test: 4500


Create Dataset Class

In [4]:
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

 Initialize Tokenizer and Model

In [5]:
# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Create datasets
train_dataset = NewsDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer, max_length=128)
test_dataset = NewsDataset(test_texts, test_labels, tokenizer, max_length=128)

# Load model with number of labels
num_labels = df['label'].nunique()
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)

model.to(device)
print(f"\n✅ Model loaded with {num_labels} output classes")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



✅ Model loaded with 15 output classes


Define Training Arguments

In [6]:
training_args = TrainingArguments(
    output_dir='/content/SE4050-Deep-Learning/results/distilbert_news_model_results', # Updated output directory
    num_train_epochs=3,              # Start with 3 epochs
    per_device_train_batch_size=16,  # Adjust based on your GPU memory
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy='steps',           # Evaluate during training
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_total_limit=2,              # Only keep 2 best checkpoints
    report_to='none',                # Disable wandb/tensorboard
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
)

print("Training configuration set ✅")

Training configuration set ✅


Define Metrics

In [7]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')

    return {
        'accuracy': acc,
        'f1': f1
    }

Train the Model

In [8]:
# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Start training
print("\n🚀 Starting training...\n")
trainer.train()

print("\n✅ Training complete!")


🚀 Starting training...



Step,Training Loss,Validation Loss,Accuracy,F1
500,1.3272,1.244013,0.638889,0.634554
1000,1.1329,1.096547,0.676667,0.671821
1500,0.7615,1.086582,0.698444,0.698812
2000,0.8202,1.064947,0.693333,0.692426
2500,0.8082,0.979552,0.709778,0.708227
3000,0.4556,1.035117,0.707778,0.708043
3500,0.4511,1.046786,0.717333,0.715977



✅ Training complete!


Evaluate on Test Set

In [9]:
# Evaluate on test set
print("\n📊 Evaluating on test set...")
test_results = trainer.evaluate(test_dataset)

print("\nTest Results:")
for key, value in test_results.items():
    print(f"{key}: {value:.4f}")

# Get detailed predictions
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)

print("\n📋 Detailed Classification Report:")
print(classification_report(test_labels, pred_labels))


📊 Evaluating on test set...



Test Results:
eval_loss: 1.0726
eval_accuracy: 0.7133
eval_f1: 0.7122
eval_runtime: 5.8328
eval_samples_per_second: 771.4960
eval_steps_per_second: 24.1740
epoch: 3.0000

📋 Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.59      0.63       300
           1       0.68      0.68      0.68       300
           2       0.77      0.88      0.82       300
           3       0.58      0.57      0.58       300
           4       0.61      0.75      0.67       300
           5       0.78      0.85      0.81       300
           6       0.63      0.55      0.58       300
           7       0.74      0.69      0.71       300
           8       0.67      0.69      0.68       300
           9       0.84      0.75      0.79       300
          10       0.81      0.86      0.84       300
          11       0.86      0.76      0.81       300
          12       0.66      0.74      0.69       300
          13       0.61      0.57    

Save Model and Tokenizer

In [10]:
# Save the fine-tuned model
model.save_pretrained('/content/SE4050-Deep-Learning/models/distilbert_news_model')
tokenizer.save_pretrained('/content/SE4050-Deep-Learning/models/distilbert_news_model')

# Save label mapping
label_mapping = df[['category', 'label']].drop_duplicates().sort_values('label')
with open('/content/SE4050-Deep-Learning/models/distilbert_news_model/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_mapping, f)

print("\n✅ Model, tokenizer, and label encoder saved!")


✅ Model, tokenizer, and label encoder saved!


Test Predictions

In [11]:
def predict_text(text, model, tokenizer, label_mapping):
    # Tokenize
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt'
    )

    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
        pred_label = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][pred_label].item()

    # Get category name
    category = label_mapping[label_mapping['label'] == pred_label]['category'].values[0]

    print(f"Text: {text[:100]}...")
    print(f"Predicted: {category}")
    print(f"Confidence: {confidence*100:.2f}%\n")

# Test examples
predict_text("The Lakers won the championship game last night", model, tokenizer, label_mapping)
predict_text("New study shows benefits of meditation", model, tokenizer, label_mapping)
predict_text("Stock market reaches all-time high", model, tokenizer, label_mapping)

Text: The Lakers won the championship game last night...
Predicted: SPORTS
Confidence: 98.99%

Text: New study shows benefits of meditation...
Predicted: WELLNESS
Confidence: 98.53%

Text: Stock market reaches all-time high...
Predicted: BUSINESS
Confidence: 99.00%



Load Model Later

In [12]:
# To load your trained model later
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import pickle

# Load model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained('/content/SE4050-Deep-Learning/models/distilbert_news_model')
tokenizer = DistilBertTokenizer.from_pretrained('/content/SE4050-Deep-Learning/models/distilbert_news_model')

with open('/content/SE4050-Deep-Learning/models/distilbert_news_model/label_encoder.pkl', 'rb') as f:
    label_mapping = pickle.load(f)

model.to(device)
model.eval()

print("✅ Model loaded and ready!")

✅ Model loaded and ready!


# Experimenting with different approaches to improve model accuracy

Increasing Training Epochs

In [18]:
print("🚀 Training with 10 epochs...")

training_args = TrainingArguments(
    output_dir='/content/SE4050-Deep-Learning/results/exp2',
    num_train_epochs=10,  # 🔥 3→10
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/SE4050-Deep-Learning/logs/exp2',
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_total_limit=2,
    report_to='none',
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# Evaluate
test_results = trainer.evaluate(test_dataset)
exp2_acc = test_results['eval_accuracy']

print(f"\n✅ Baseline: 71.00% | Exp2: {exp2_acc*100:.2f}% | Improvement: {(exp2_acc-0.71)*100:+.2f}%")

# Save
model.save_pretrained('/content/SE4050-Deep-Learning/models/distilbert_exp2')
tokenizer.save_pretrained('/content/SE4050-Deep-Learning/models/distilbert_exp2')
print("✅ Model saved!")

🚀 Training with 10 epochs...


Step,Training Loss,Validation Loss,Accuracy,F1
500,0.3545,1.255449,0.687333,0.690141
1000,0.3578,1.271836,0.686667,0.686363
1500,0.2043,1.480404,0.680444,0.680658
2000,0.2594,1.50091,0.682444,0.681492



✅ Baseline: 71.00% | Exp2: 68.42% | Improvement: -2.58%
✅ Model saved!


Increasing epochs resulted in overfitting and lower accuracy

Extended Max Length from 128 to 256 Tokens

In [19]:
print("="*70)
print("EXPERIMENT 3: INCREASED MAX LENGTH")
print("="*70)
print("\nChanges from Baseline:")
print("  ✅ max_length: 128 → 256")
print("  ⚠️  Note: This will use more GPU memory")
print("="*70)
print()

from datetime import datetime
start_time = datetime.now()

# Recreate datasets with longer max_length
print("📦 Creating datasets with max_length=256...")

train_dataset_256 = NewsDataset(train_texts, train_labels, tokenizer, max_length=256)  # 🔥 256
val_dataset_256 = NewsDataset(val_texts, val_labels, tokenizer, max_length=256)  # 🔥 256
test_dataset_256 = NewsDataset(test_texts, test_labels, tokenizer, max_length=256)  # 🔥 256

print("✅ Datasets created with max_length=256")

# Reload model (fresh start)
print("\n🔄 Loading fresh model...")
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)
model.to(device)
print("✅ Model loaded")

# Training Configuration
training_args = TrainingArguments(
    output_dir='/content/SE4050-Deep-Learning/results/distilbert_exp3_results',
    num_train_epochs=3,  # Keep at 3 to isolate the effect of max_length change
    per_device_train_batch_size=8,  # 🔥 REDUCED: 16 → 8 (longer sequences need more memory)
    per_device_eval_batch_size=16,  # 🔥 REDUCED: 32 → 16
    gradient_accumulation_steps=2,  # 🔥 ADDED: Effective batch = 8*2=16
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/SE4050-Deep-Learning/logs/exp3',
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_total_limit=2,
    report_to='none',
    fp16=torch.cuda.is_available(),
)

print("\n✅ Training configuration set")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Max Length: 256 🔥")
print(f"   Batch Size: {training_args.per_device_train_batch_size} (reduced due to longer sequences)")
print(f"   Gradient Accumulation: {training_args.gradient_accumulation_steps}")
print()

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_256,  # 🔥 Using new dataset
    eval_dataset=val_dataset_256,      # 🔥 Using new dataset
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train
print("🚀 Starting training...\n")
trainer.train()

end_time = datetime.now()
duration = end_time - start_time

print("\n✅ Training complete!")
print(f"   Duration: {duration}")

# Evaluate on Test Set
print("\n📊 Evaluating on test set...")
test_results = trainer.evaluate(test_dataset_256)  # 🔥 Using new dataset

print("\n" + "="*70)
print("RESULTS")
print("="*70)
print(f"Test Accuracy: {test_results['eval_accuracy']:.4f} ({test_results['eval_accuracy']*100:.2f}%)")
print(f"Test F1 Score: {test_results['eval_f1']:.4f}")

# Detailed predictions
predictions = trainer.predict(test_dataset_256)  # 🔥 Using new dataset
pred_labels = predictions.predictions.argmax(-1)

print("\n📋 Classification Report:")
print(classification_report(test_labels, pred_labels))

# Comparison with Previous Experiments
baseline_acc = 0.71  # Baseline (3 epochs, 128 length)
exp3_acc = test_results['eval_accuracy']
improvement = exp3_acc - baseline_acc

print("\n" + "="*70)
print("COMPARISON")
print("="*70)
print(f"Baseline (128 max_length):     {baseline_acc:.4f} ({baseline_acc*100:.2f}%)")
print(f"Experiment 3 (256 max_length): {exp3_acc:.4f} ({exp3_acc*100:.2f}%)")
print(f"Improvement: {improvement:+.4f} ({improvement*100:+.2f}%)")
print("="*70)

# Save Model
save_path = '/content/SE4050-Deep-Learning/models/distilbert_exp3_maxlen256'
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"\n✅ Model saved to: {save_path}")

# Save results
import pickle
results_dict = {
    'experiment': 'Experiment 3 - Increased Max Length',
    'max_length': 256,
    'epochs': 3,
    'baseline_acc': baseline_acc,
    'test_acc': exp3_acc,
    'improvement': improvement,
    'duration': str(duration),
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}

with open('/content/SE4050-Deep-Learning/results/exp3_results.pkl', 'wb') as f:
    pickle.dump(results_dict, f)

# Save config for later use
config_dict = {
    'max_length': 256,
    'model_name': 'distilbert-base-uncased',
    'num_labels': num_labels
}

with open('/content/SE4050-Deep-Learning/models/distilbert_exp3_maxlen256/config.pkl', 'wb') as f:
    pickle.dump(config_dict, f)

print("✅ Results and config saved!")
print("\n🎉 EXPERIMENT 3 COMPLETE!")

EXPERIMENT 3: INCREASED MAX LENGTH

Changes from Baseline:
  ✅ max_length: 128 → 256
  ⚠️  Note: This will use more GPU memory

📦 Creating datasets with max_length=256...
✅ Datasets created with max_length=256

🔄 Loading fresh model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded

✅ Training configuration set
   Epochs: 3
   Max Length: 256 🔥
   Batch Size: 8 (reduced due to longer sequences)
   Gradient Accumulation: 2

🚀 Starting training...



Step,Training Loss,Validation Loss,Accuracy,F1
500,1.3452,1.257646,0.639333,0.637395
1000,1.1283,1.093779,0.675111,0.669337
1500,0.7744,1.077256,0.686667,0.685655
2000,0.8037,1.058437,0.689556,0.689797
2500,0.7929,0.978592,0.712889,0.712438
3000,0.4822,1.032262,0.710222,0.710138
3500,0.4192,1.023952,0.721111,0.718871



✅ Training complete!
   Duration: 0:11:19.563932

📊 Evaluating on test set...



RESULTS
Test Accuracy: 0.7176 (71.76%)
Test F1 Score: 0.7160

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.61      0.66       300
           1       0.66      0.68      0.67       300
           2       0.77      0.90      0.83       300
           3       0.59      0.61      0.60       300
           4       0.67      0.69      0.68       300
           5       0.77      0.87      0.82       300
           6       0.66      0.52      0.58       300
           7       0.71      0.68      0.70       300
           8       0.69      0.68      0.69       300
           9       0.85      0.77      0.81       300
          10       0.83      0.84      0.83       300
          11       0.82      0.79      0.81       300
          12       0.64      0.71      0.68       300
          13       0.59      0.60      0.59       300
          14       0.79      0.82      0.80       300

    accuracy                           0.72   

Both combined

In [20]:
print("="*70)
print("EXPERIMENT 4: COMBINED IMPROVEMENTS")
print("="*70)
print("\nChanges from Baseline:")
print("  ✅ num_train_epochs: 3 → 10")
print("  ✅ max_length: 128 → 256")
print("="*70)
print()

from datetime import datetime
start_time = datetime.now()

# Recreate datasets with max_length=256
train_dataset_256 = NewsDataset(train_texts, train_labels, tokenizer, max_length=256)
val_dataset_256 = NewsDataset(val_texts, val_labels, tokenizer, max_length=256)
test_dataset_256 = NewsDataset(test_texts, test_labels, tokenizer, max_length=256)

# Reload model
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)
model.to(device)

# Training Configuration
training_args = TrainingArguments(
    output_dir='/content/SE4050-Deep-Learning/results/exp4',
    num_train_epochs=10,  # 🔥 Increased
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/SE4050-Deep-Learning/logs/exp4',
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_total_limit=2,
    report_to='none',
    fp16=torch.cuda.is_available(),
)

print("✅ Config: 10 epochs, 256 max_length")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_256,
    eval_dataset=val_dataset_256,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)

print("🚀 Starting training...\n")
trainer.train()

end_time = datetime.now()
duration = end_time - start_time

print(f"\n✅ Training complete! Duration: {duration}")

# Evaluate
test_results = trainer.evaluate(test_dataset_256)
exp4_acc = test_results['eval_accuracy']

predictions = trainer.predict(test_dataset_256)
pred_labels = predictions.predictions.argmax(-1)

print("\n" + "="*70)
print("FINAL COMPARISON")
print("="*70)
print(f"Baseline (3 epochs, 128 len):  71.00%")
print(f"Exp 4 (10 epochs, 256 len):    {exp4_acc*100:.2f}%")
print(f"Total Improvement:             {(exp4_acc-0.71)*100:+.2f}%")
print("="*70)

print("\n📋 Classification Report:")
print(classification_report(test_labels, pred_labels))

# Save
model.save_pretrained('/content/SE4050-Deep-Learning/models/distilbert_exp4_best')
tokenizer.save_pretrained('/content/SE4050-Deep-Learning/models/distilbert_exp4_best')

print("\n✅ Best model saved!")
print("🎉 ALL EXPERIMENTS COMPLETE!")

EXPERIMENT 4: COMBINED IMPROVEMENTS

Changes from Baseline:
  ✅ num_train_epochs: 3 → 10
  ✅ max_length: 128 → 256



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Config: 10 epochs, 256 max_length
🚀 Starting training...



Step,Training Loss,Validation Loss,Accuracy,F1
500,1.3482,1.27284,0.634889,0.632345
1000,1.1334,1.130337,0.666667,0.659629
1500,0.7952,1.111236,0.681111,0.681597
2000,0.8406,1.078567,0.685111,0.685688
2500,0.8377,1.017084,0.703333,0.704553
3000,0.5268,1.090727,0.697778,0.698651
3500,0.4712,1.124084,0.703778,0.704689
4000,0.3462,1.179147,0.699556,0.700653
4500,0.2802,1.351599,0.699111,0.695303
5000,0.2483,1.392532,0.699333,0.700869



✅ Training complete! Duration: 0:18:03.683764



FINAL COMPARISON
Baseline (3 epochs, 128 len):  71.00%
Exp 4 (10 epochs, 256 len):    69.51%
Total Improvement:             -1.49%

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.57      0.62       300
           1       0.69      0.65      0.67       300
           2       0.75      0.89      0.81       300
           3       0.49      0.69      0.58       300
           4       0.60      0.72      0.66       300
           5       0.82      0.78      0.80       300
           6       0.63      0.48      0.54       300
           7       0.71      0.70      0.70       300
           8       0.73      0.55      0.63       300
           9       0.85      0.74      0.79       300
          10       0.78      0.86      0.82       300
          11       0.88      0.73      0.80       300
          12       0.64      0.69      0.66       300
          13       0.54      0.63      0.58       300
          14       0.81      0.

Optimizing Learning Rate and Scheduler

In [21]:
print("="*70)
print("EXPERIMENT 5: OPTIMIZED LEARNING RATE + SCHEDULER")
print("="*70)
print("\nChanges from Baseline:")
print("  ✅ learning_rate: 5e-5 → 2e-5 (lower, more stable)")
print("  ✅ warmup_ratio: 0 → 0.1 (10% warmup)")
print("  ✅ lr_scheduler_type: linear → cosine (better decay)")
print("="*70)
print()

from datetime import datetime
start_time = datetime.now()

# Reload model (fresh start)
print("🔄 Loading fresh model...")
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels
)
model.to(device)
print("✅ Model loaded")

# Training Configuration with Optimized LR + Scheduler
training_args = TrainingArguments(
    output_dir='/content/SE4050-Deep-Learning/results/distilbert_exp5_results',
    num_train_epochs=5,  # Using 5 epochs to see the effect

    # 🔥 LEARNING RATE OPTIMIZATION
    learning_rate=2e-5,  # 🔥 Lower LR (default is 5e-5)

    # 🔥 WARMUP SCHEDULE
    warmup_ratio=0.1,  # 🔥 Warmup for first 10% of training

    # 🔥 LEARNING RATE SCHEDULER
    lr_scheduler_type='cosine',  # 🔥 Cosine decay (smoother than linear)

    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    logging_dir='/content/SE4050-Deep-Learning/logs/exp5',
    logging_steps=100,
    eval_strategy='steps',
    eval_steps=500,
    save_strategy='steps',
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    save_total_limit=2,
    report_to='none',
    fp16=torch.cuda.is_available(),
)

print("\n✅ Training configuration set")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Learning Rate: {training_args.learning_rate} (was 5e-5)")
print(f"   Warmup Ratio: {training_args.warmup_ratio}")
print(f"   LR Scheduler: {training_args.lr_scheduler_type}")
print(f"   Batch Size: {training_args.per_device_train_batch_size}")
print()

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Using original 128 max_length
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train
print("🚀 Starting training with optimized LR + Scheduler...\n")
trainer.train()

end_time = datetime.now()
duration = end_time - start_time

print("\n✅ Training complete!")
print(f"   Duration: {duration}")

# Evaluate on Test Set
print("\n📊 Evaluating on test set...")
test_results = trainer.evaluate(test_dataset)

print("\n" + "="*70)
print("RESULTS")
print("="*70)
print(f"Test Accuracy: {test_results['eval_accuracy']:.4f} ({test_results['eval_accuracy']*100:.2f}%)")
print(f"Test F1 Score: {test_results['eval_f1']:.4f}")

# Detailed predictions
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)

print("\n📋 Classification Report:")
print(classification_report(test_labels, pred_labels))

# Comparison with Baseline
baseline_acc = 0.71  # Baseline
exp5_acc = test_results['eval_accuracy']
improvement = exp5_acc - baseline_acc

print("\n" + "="*70)
print("COMPARISON")
print("="*70)
print(f"Baseline (default LR, 3 epochs):           {baseline_acc:.4f} ({baseline_acc*100:.2f}%)")
print(f"Experiment 5 (optimized LR+scheduler, 5ep): {exp5_acc:.4f} ({exp5_acc*100:.2f}%)")
print(f"Improvement: {improvement:+.4f} ({improvement*100:+.2f}%)")
print("="*70)

# Save Model
save_path = '/content/SE4050-Deep-Learning/models/distilbert_exp5_lr_optimized'
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"\n✅ Model saved to: {save_path}")

# Save results
import pickle
results_dict = {
    'experiment': 'Experiment 5 - Optimized LR + Scheduler',
    'learning_rate': 2e-5,
    'warmup_ratio': 0.1,
    'lr_scheduler': 'cosine',
    'epochs': 5,
    'baseline_acc': baseline_acc,
    'test_acc': exp5_acc,
    'improvement': improvement,
    'duration': str(duration),
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
}

with open('/content/SE4050-Deep-Learning/results/exp5_results.pkl', 'wb') as f:
    pickle.dump(results_dict, f)

print("✅ Results saved!")
print("\n🎉 EXPERIMENT 5 COMPLETE!")

EXPERIMENT 5: OPTIMIZED LEARNING RATE + SCHEDULER

Changes from Baseline:
  ✅ learning_rate: 5e-5 → 2e-5 (lower, more stable)
  ✅ warmup_ratio: 0 → 0.1 (10% warmup)
  ✅ lr_scheduler_type: linear → cosine (better decay)

🔄 Loading fresh model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded

✅ Training configuration set
   Epochs: 5
   Learning Rate: 2e-05 (was 5e-5)
   Warmup Ratio: 0.1
   LR Scheduler: SchedulerType.COSINE
   Batch Size: 16

🚀 Starting training with optimized LR + Scheduler...



Step,Training Loss,Validation Loss,Accuracy,F1
500,1.7794,1.574244,0.604,0.58923
1000,1.1936,1.152289,0.669111,0.665319
1500,0.9044,1.088142,0.686,0.687082
2000,0.9482,1.077997,0.690889,0.690665
2500,0.9267,1.00181,0.706,0.705944
3000,0.6587,1.042058,0.697333,0.696829
3500,0.6526,1.041465,0.703778,0.701871
4000,0.5055,1.022705,0.706444,0.704482
4500,0.4553,1.065315,0.703556,0.70204
5000,0.4157,1.071067,0.708222,0.707392



✅ Training complete!
   Duration: 0:14:33.908631

📊 Evaluating on test set...



RESULTS
Test Accuracy: 0.7040 (70.40%)
Test F1 Score: 0.7035

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.63      0.65       300
           1       0.67      0.65      0.66       300
           2       0.79      0.85      0.82       300
           3       0.56      0.60      0.58       300
           4       0.66      0.66      0.66       300
           5       0.78      0.85      0.81       300
           6       0.58      0.55      0.56       300
           7       0.72      0.65      0.68       300
           8       0.67      0.68      0.67       300
           9       0.83      0.75      0.79       300
          10       0.82      0.86      0.84       300
          11       0.82      0.77      0.80       300
          12       0.63      0.70      0.66       300
          13       0.58      0.57      0.58       300
          14       0.77      0.79      0.78       300

    accuracy                           0.70   