## Models

### Importing packages

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.optimizers import Adam
from transformers import BertForSequenceClassification, BertTokenizer
import time
import torch
from tqdm import tqdm
from torch.optim import AdamW
from sklearn.model_selection import KFold


### Importing cleaned dataset

In [None]:
# Location of the pre-cleaned resume corpus (absolute path inside the runtime).
CLEAN_DATASET_CSV = '/content/clean_resume_dataset.csv'

# Read the whole CSV into memory.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like
# a row index that was written into the file when it was saved - confirm before
# relying on or dropping that column.
df = pd.read_csv(CLEAN_DATASET_CSV)

# Last expression of the cell: rendered as the cell's output.
df

Unnamed: 0,Category,Resume,clean_text
0,Accountant,education omba executive leadership university...,education omba executive leadership bachelor ...
1,Accountant,howard gerrard accountant deyjobcom birmingham...,accountant deyjobcom infodayjobcom linkedinn...
2,Accountant,kevin frank senior accountant inforesumekraftc...,senior accountant inforesumekraftcom chicago ...
3,Accountant,place birth nationality olivia ogilvy accounta...,place birth nationality olivia accountant 151...
4,Accountant,stephen greet cpa senior accountant 9 year exp...,cpa senior accountant year experience establi...
...,...,...,...
12239,Testing,Computer Skills: â¢ Proficient in MS office (...,in MS office Word Basic Excel Power point wo...
12240,Testing,â Willingness to accept the challenges. â ...,Willingness to accept the challenges Positive ...
12241,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",PERSONAL SKILLS Quick learner Eagerness to lea...
12242,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,COMPUTER SKILLS SOFTWARE KNOWLEDGE wind I nt...


In [None]:
# Whitespace-delimited word count of every cleaned resume.
def _count_words(document):
    """Return the number of whitespace-separated tokens in `document`."""
    return len(document.split())


text_lengths = df['clean_text'].map(_count_words)

# Report the extremes and the mean of the word counts, one line per statistic.
for _stat_name, _stat_value in (
    ("Max", text_lengths.max()),
    ("Min", text_lengths.min()),
    ("Average", text_lengths.mean()),
):
    print(f"{_stat_name} Length: {_stat_value}")


Max Length: 6146
Min Length: 4
Average Length: 439.7765436131983


In [None]:

# --- Step 3: Encode Labels ---
# Turn each job-category string into an integer class id (stored in a new
# 'label' column) and remember how many distinct classes there are.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Category'])
num_labels = len(label_encoder.classes_)
print(f"Number of unique job roles: {num_labels}")
# The fitted class names remain available as `label_encoder.classes_`; a bare
# reference to them in the middle of a cell is never displayed, so it was dropped.

# --- Step 4: Train-Test Split (First Split) ---
# Hold out 10% of the rows as a test set. Stratifying on the label keeps the
# class proportions of the full dataset in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.1,  # 10% for testing
    random_state=42,
    stratify=df['label']  # Ensures class balance
)

# --- Step 5: Tokenize Text for BERT ---
# Only the tokenizer is loaded here; the texts are tokenized later, per fold.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Last expression of the cell: displays the class count as the cell output.
num_labels


Number of unique job roles: 43
Using device: cuda


43

### BASE MODEL

In [None]:

# --- Configuration ---
EPOCHS = 3
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
SEED = 42  # used for the fold splitter and for the per-fold torch RNG reset

# --- Step 6: Cross-Validation Setup ---
# This cell relies on X_train, y_train, tokenizer, device, num_labels and
# label_encoder created by the earlier cells.
kfold = KFold(n_splits=3, shuffle=True, random_state=SEED)

# One summary dict per fold; aggregated after the loop.
fold_results = []

# --- Step 7: K-Fold Cross-Validation ---
for fold_no, (train_index, val_index) in enumerate(kfold.split(X_train), start=1):
    print(f"\nTraining fold {fold_no}...")

    # Reset the torch RNG so that weight initialisation of the new classifier
    # head and DataLoader shuffling start from the same state on every run.
    torch.manual_seed(SEED)

    # Track time for this fold (tokenization, training and evaluation)
    start_time = time.time()

    # Split data (positional indices from KFold, hence .iloc)
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Tokenize each part; sequences longer than MAX_LENGTH tokens are truncated.
    train_encodings = tokenizer(
        list(X_train_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    val_encodings = tokenizer(
        list(X_val_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    # Create datasets: each item is (input_ids, attention_mask, label)
    train_dataset = torch.utils.data.TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(y_train_fold.values)
    )

    val_dataset = torch.utils.data.TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        torch.tensor(y_val_fold.values)
    )

    # Create dataloaders (only the training data is shuffled)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # --- Load Model (PyTorch version) ---
    # A fresh pretrained model per fold so no weights carry over between folds.
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    ).to(device)

    # --- Optimizer & Scheduler ---
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=WEIGHT_DECAY)

    # NOTE(review): with start_factor=1e-6 and end_factor=1.0 spread over every
    # optimizer step, the learning rate only ramps UP towards LEARNING_RATE and
    # never decays. Confirm this is intended rather than warmup followed by
    # decay. Left unchanged so results stay comparable with the other runs in
    # this notebook.
    total_steps = len(train_loader) * EPOCHS
    scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer,
        start_factor=1e-6,
        end_factor=1.0,
        total_iters=total_steps
    )

    # --- Training Loop ---
    for epoch in range(EPOCHS):
        print(f'\nEpoch {epoch + 1}/{EPOCHS}')

        # Training
        model.train()
        total_train_loss = 0
        train_pbar = tqdm(train_loader, desc='Training')

        for batch in train_pbar:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            model.zero_grad()

            # Passing labels makes the model return the loss with the logits.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()  # the schedule advances per batch, not per epoch

            train_pbar.set_postfix({'loss': loss.item()})

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids, attention_mask, labels = [b.to(device) for b in batch]

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_val_loss += outputs.loss.item()

                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Accuracy: {val_accuracy:.4f}')

    # --- Final Evaluation ---
    # Predictions of the fully trained fold model on this fold's validation part.
    model.eval()
    final_preds = []
    final_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            final_preds.extend(preds.cpu().numpy())
            final_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(final_labels, final_preds)
    # `labels` lists every class id explicitly: the folds are not stratified, so
    # a class may be missing from both the true and the predicted labels of a
    # fold, and without `labels` the report would then reject `target_names`
    # for having a different length.
    report = classification_report(
        final_labels,
        final_preds,
        labels=list(range(num_labels)),
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0
    )

    fold_result = {
        'fold': fold_no,
        'accuracy': accuracy,
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall'],
        'f1_score': report['macro avg']['f1-score'],
        'time': time.time() - start_time,
        'report': report
    }

    fold_results.append(fold_result)

    print(f"\n{'='*60}")
    print(f"Fold {fold_no} Results:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {report['macro avg']['precision']:.4f}")
    print(f"  Recall:    {report['macro avg']['recall']:.4f}")
    print(f"  F1-Score:  {report['macro avg']['f1-score']:.4f}")
    print(f"  Time:      {fold_result['time']:.2f}s")
    print(f"{'='*60}")

    # Clean up memory: the optimizer and scheduler also reference the model's
    # parameters, so all three must go before cached GPU memory can be released.
    del model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# --- Final Results ---
print(f"\n{'='*60}")
print("FINAL K-FOLD CROSS-VALIDATION RESULTS")
print(f"{'='*60}")

# Per-fold macro-averaged metrics, reported below as mean ± std across folds.
accuracies = [f['accuracy'] for f in fold_results]
precisions = [f['precision'] for f in fold_results]
recalls = [f['recall'] for f in fold_results]
f1_scores = [f['f1_score'] for f in fold_results]

print(f"\nAccuracy:  {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}")
print(f"Recall:    {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")
print(f"F1-Score:  {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

print(f"\nTotal Time: {sum(f['time'] for f in fold_results):.2f}s")
print(f"Average Time per Fold: {np.mean([f['time'] for f in fold_results]):.2f}s")


Using device: cuda

Training fold 1...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [02:53<00:00,  2.65it/s, loss=3.4]
Validation: 100%|██████████| 230/230 [00:27<00:00,  8.22it/s]


Train Loss: 3.6468
Val Loss: 3.1894
Val Accuracy: 0.3077

Epoch 2/3


Training: 100%|██████████| 460/460 [02:52<00:00,  2.67it/s, loss=1.45]
Validation: 100%|██████████| 230/230 [00:27<00:00,  8.25it/s]


Train Loss: 2.4556
Val Loss: 1.6782
Val Accuracy: 0.7335

Epoch 3/3


Training: 100%|██████████| 460/460 [02:52<00:00,  2.67it/s, loss=4.2]
Validation: 100%|██████████| 230/230 [00:27<00:00,  8.27it/s]


Train Loss: 1.3433
Val Loss: 1.1101
Val Accuracy: 0.7708

Fold 1 Results:
  Accuracy:  0.7708
  Precision: 0.7493
  Recall:    0.7445
  F1-Score:  0.7398
  Time:      742.18s

Training fold 2...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [02:53<00:00,  2.65it/s, loss=2.95]
Validation: 100%|██████████| 230/230 [00:27<00:00,  8.22it/s]


Train Loss: 3.6127
Val Loss: 3.0554
Val Accuracy: 0.4816

Epoch 2/3


Training: 100%|██████████| 460/460 [02:51<00:00,  2.68it/s, loss=1.49]
Validation: 100%|██████████| 230/230 [00:27<00:00,  8.27it/s]


Train Loss: 2.3660
Val Loss: 1.6196
Val Accuracy: 0.7343

Epoch 3/3


Training: 100%|██████████| 460/460 [02:51<00:00,  2.68it/s, loss=0.316]
Validation: 100%|██████████| 230/230 [00:27<00:00,  8.27it/s]


Train Loss: 1.3001
Val Loss: 1.0744
Val Accuracy: 0.7860

Fold 2 Results:
  Accuracy:  0.7860
  Precision: 0.8086
  Recall:    0.7654
  F1-Score:  0.7738
  Time:      739.98s

Training fold 3...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [02:53<00:00,  2.65it/s, loss=3.05]
Validation: 100%|██████████| 230/230 [00:28<00:00,  8.21it/s]


Train Loss: 3.6239
Val Loss: 3.0847
Val Accuracy: 0.4179

Epoch 2/3


Training: 100%|██████████| 460/460 [02:51<00:00,  2.68it/s, loss=1.37]
Validation: 100%|██████████| 230/230 [00:27<00:00,  8.27it/s]


Train Loss: 2.3830
Val Loss: 1.6015
Val Accuracy: 0.7465

Epoch 3/3


Training: 100%|██████████| 460/460 [02:51<00:00,  2.68it/s, loss=1.88]
Validation: 100%|██████████| 230/230 [00:27<00:00,  8.27it/s]


Train Loss: 1.3168
Val Loss: 1.0527
Val Accuracy: 0.7852

Fold 3 Results:
  Accuracy:  0.7852
  Precision: 0.7567
  Recall:    0.7538
  F1-Score:  0.7500
  Time:      740.21s

FINAL K-FOLD CROSS-VALIDATION RESULTS

Accuracy:  0.7807 ± 0.0070
Precision: 0.7715 ± 0.0264
Recall:    0.7545 ± 0.0086
F1-Score:  0.7545 ± 0.0142

Total Time: 2222.36s
Average Time per Fold: 740.79s


### Ablation Studies: Batch size 16 -> Batch size 32

In [None]:
# --- Configuration ---
# Same setup as the base model run except for the batch size (16 -> 32).
EPOCHS = 3
BATCH_SIZE = 32
MAX_LENGTH = 128
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
SEED = 42  # used for the fold splitter and for the per-fold torch RNG reset

# --- Step 6: Cross-Validation Setup ---
# This cell relies on X_train, y_train, tokenizer, device, num_labels and
# label_encoder created by the earlier cells.
kfold = KFold(n_splits=3, shuffle=True, random_state=SEED)

# One summary dict per fold; aggregated after the loop.
fold_results = []

# --- Step 7: K-Fold Cross-Validation ---
for fold_no, (train_index, val_index) in enumerate(kfold.split(X_train), start=1):
    print(f"\nTraining fold {fold_no}...")

    # Reset the torch RNG so that weight initialisation of the new classifier
    # head and DataLoader shuffling start from the same state on every run.
    torch.manual_seed(SEED)

    # Track time for this fold (tokenization, training and evaluation)
    start_time = time.time()

    # Split data (positional indices from KFold, hence .iloc)
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Tokenize each part; sequences longer than MAX_LENGTH tokens are truncated.
    train_encodings = tokenizer(
        list(X_train_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    val_encodings = tokenizer(
        list(X_val_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    # Create datasets: each item is (input_ids, attention_mask, label)
    train_dataset = torch.utils.data.TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(y_train_fold.values)
    )

    val_dataset = torch.utils.data.TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        torch.tensor(y_val_fold.values)
    )

    # Create dataloaders (only the training data is shuffled)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # --- Load Model (PyTorch version) ---
    # A fresh pretrained model per fold so no weights carry over between folds.
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    ).to(device)

    # --- Optimizer & Scheduler ---
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=WEIGHT_DECAY)

    # NOTE(review): with start_factor=1e-6 and end_factor=1.0 spread over every
    # optimizer step, the learning rate only ramps UP towards LEARNING_RATE and
    # never decays. Confirm this is intended rather than warmup followed by
    # decay. Left unchanged so results stay comparable with the other runs in
    # this notebook.
    total_steps = len(train_loader) * EPOCHS
    scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer,
        start_factor=1e-6,
        end_factor=1.0,
        total_iters=total_steps
    )

    # --- Training Loop ---
    for epoch in range(EPOCHS):
        print(f'\nEpoch {epoch + 1}/{EPOCHS}')

        # Training
        model.train()
        total_train_loss = 0
        train_pbar = tqdm(train_loader, desc='Training')

        for batch in train_pbar:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            model.zero_grad()

            # Passing labels makes the model return the loss with the logits.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()  # the schedule advances per batch, not per epoch

            train_pbar.set_postfix({'loss': loss.item()})

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids, attention_mask, labels = [b.to(device) for b in batch]

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_val_loss += outputs.loss.item()

                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Accuracy: {val_accuracy:.4f}')

    # --- Final Evaluation ---
    # Predictions of the fully trained fold model on this fold's validation part.
    model.eval()
    final_preds = []
    final_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            final_preds.extend(preds.cpu().numpy())
            final_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(final_labels, final_preds)
    # `labels` lists every class id explicitly: the folds are not stratified, so
    # a class may be missing from both the true and the predicted labels of a
    # fold, and without `labels` the report would then reject `target_names`
    # for having a different length.
    report = classification_report(
        final_labels,
        final_preds,
        labels=list(range(num_labels)),
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0
    )

    fold_result = {
        'fold': fold_no,
        'accuracy': accuracy,
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall'],
        'f1_score': report['macro avg']['f1-score'],
        'time': time.time() - start_time,
        'report': report
    }

    fold_results.append(fold_result)

    print(f"\n{'='*60}")
    print(f"Fold {fold_no} Results:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {report['macro avg']['precision']:.4f}")
    print(f"  Recall:    {report['macro avg']['recall']:.4f}")
    print(f"  F1-Score:  {report['macro avg']['f1-score']:.4f}")
    print(f"  Time:      {fold_result['time']:.2f}s")
    print(f"{'='*60}")

    # Clean up memory: the optimizer and scheduler also reference the model's
    # parameters, so all three must go before cached GPU memory can be released.
    del model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# --- Final Results ---
print(f"\n{'='*60}")
print("FINAL K-FOLD CROSS-VALIDATION RESULTS")
print(f"{'='*60}")

# Per-fold macro-averaged metrics, reported below as mean ± std across folds.
accuracies = [f['accuracy'] for f in fold_results]
precisions = [f['precision'] for f in fold_results]
recalls = [f['recall'] for f in fold_results]
f1_scores = [f['f1_score'] for f in fold_results]

print(f"\nAccuracy:  {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}")
print(f"Recall:    {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")
print(f"F1-Score:  {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

print(f"\nTotal Time: {sum(f['time'] for f in fold_results):.2f}s")
print(f"Average Time per Fold: {np.mean([f['time'] for f in fold_results]):.2f}s")



Training fold 1...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 230/230 [02:42<00:00,  1.42it/s, loss=3.39]
Validation: 100%|██████████| 115/115 [00:25<00:00,  4.51it/s]


Train Loss: 3.7242
Val Loss: 3.4705
Val Accuracy: 0.1865

Epoch 2/3


Training: 100%|██████████| 230/230 [02:40<00:00,  1.43it/s, loss=2.27]
Validation: 100%|██████████| 115/115 [00:25<00:00,  4.51it/s]


Train Loss: 2.8870
Val Loss: 2.1551
Val Accuracy: 0.6638

Epoch 3/3


Training: 100%|██████████| 230/230 [02:40<00:00,  1.44it/s, loss=0.809]
Validation: 100%|██████████| 115/115 [00:25<00:00,  4.52it/s]


Train Loss: 1.6913
Val Loss: 1.2881
Val Accuracy: 0.7563

Fold 1 Results:
  Accuracy:  0.7563
  Precision: 0.7406
  Recall:    0.7295
  F1-Score:  0.7201
  Time:      697.87s

Training fold 2...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 230/230 [02:42<00:00,  1.42it/s, loss=3.54]
Validation: 100%|██████████| 115/115 [00:25<00:00,  4.51it/s]


Train Loss: 3.6635
Val Loss: 3.3257
Val Accuracy: 0.2287

Epoch 2/3


Training: 100%|██████████| 230/230 [02:40<00:00,  1.43it/s, loss=2.49]
Validation: 100%|██████████| 115/115 [00:25<00:00,  4.51it/s]


Train Loss: 2.8128
Val Loss: 2.0977
Val Accuracy: 0.6545

Epoch 3/3


Training: 100%|██████████| 230/230 [02:40<00:00,  1.44it/s, loss=1.1]
Validation: 100%|██████████| 115/115 [00:25<00:00,  4.53it/s]


Train Loss: 1.7126
Val Loss: 1.2839
Val Accuracy: 0.7550

Fold 2 Results:
  Accuracy:  0.7550
  Precision: 0.7283
  Recall:    0.7172
  F1-Score:  0.7093
  Time:      698.41s

Training fold 3...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 230/230 [02:41<00:00,  1.42it/s, loss=3.5]
Validation: 100%|██████████| 115/115 [00:25<00:00,  4.52it/s]


Train Loss: 3.6868
Val Loss: 3.3840
Val Accuracy: 0.2238

Epoch 2/3


Training: 100%|██████████| 230/230 [02:40<00:00,  1.43it/s, loss=2.02]
Validation: 100%|██████████| 115/115 [00:25<00:00,  4.52it/s]


Train Loss: 2.8155
Val Loss: 2.0786
Val Accuracy: 0.6654

Epoch 3/3


Training: 100%|██████████| 230/230 [02:40<00:00,  1.43it/s, loss=1.12]
Validation: 100%|██████████| 115/115 [00:25<00:00,  4.52it/s]


Train Loss: 1.6991
Val Loss: 1.2592
Val Accuracy: 0.7686

Fold 3 Results:
  Accuracy:  0.7686
  Precision: 0.7500
  Recall:    0.7353
  F1-Score:  0.7332
  Time:      696.67s

FINAL K-FOLD CROSS-VALIDATION RESULTS

Accuracy:  0.7600 ± 0.0061
Precision: 0.7396 ± 0.0089
Recall:    0.7273 ± 0.0075
F1-Score:  0.7209 ± 0.0098

Total Time: 2092.95s
Average Time per Fold: 697.65s


### Ablation Studies: Max length 128 -> Max length 256

In [None]:
# --- Configuration ---
# Same setup as the base model run except for the token limit (128 -> 256).
EPOCHS = 3
BATCH_SIZE = 16
MAX_LENGTH = 256
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
SEED = 42  # used for the fold splitter and for the per-fold torch RNG reset

# --- Step 6: Cross-Validation Setup ---
# This cell relies on X_train, y_train, tokenizer, device, num_labels and
# label_encoder created by the earlier cells.
kfold = KFold(n_splits=3, shuffle=True, random_state=SEED)

# One summary dict per fold; aggregated after the loop.
fold_results = []

# --- Step 7: K-Fold Cross-Validation ---
for fold_no, (train_index, val_index) in enumerate(kfold.split(X_train), start=1):
    print(f"\nTraining fold {fold_no}...")

    # Reset the torch RNG so that weight initialisation of the new classifier
    # head and DataLoader shuffling start from the same state on every run.
    torch.manual_seed(SEED)

    # Track time for this fold (tokenization, training and evaluation)
    start_time = time.time()

    # Split data (positional indices from KFold, hence .iloc)
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Tokenize each part; sequences longer than MAX_LENGTH tokens are truncated.
    train_encodings = tokenizer(
        list(X_train_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    val_encodings = tokenizer(
        list(X_val_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    # Create datasets: each item is (input_ids, attention_mask, label)
    train_dataset = torch.utils.data.TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(y_train_fold.values)
    )

    val_dataset = torch.utils.data.TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        torch.tensor(y_val_fold.values)
    )

    # Create dataloaders (only the training data is shuffled)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # --- Load Model (PyTorch version) ---
    # A fresh pretrained model per fold so no weights carry over between folds.
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    ).to(device)

    # --- Optimizer & Scheduler ---
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=WEIGHT_DECAY)

    # NOTE(review): with start_factor=1e-6 and end_factor=1.0 spread over every
    # optimizer step, the learning rate only ramps UP towards LEARNING_RATE and
    # never decays. Confirm this is intended rather than warmup followed by
    # decay. Left unchanged so results stay comparable with the other runs in
    # this notebook.
    total_steps = len(train_loader) * EPOCHS
    scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer,
        start_factor=1e-6,
        end_factor=1.0,
        total_iters=total_steps
    )

    # --- Training Loop ---
    for epoch in range(EPOCHS):
        print(f'\nEpoch {epoch + 1}/{EPOCHS}')

        # Training
        model.train()
        total_train_loss = 0
        train_pbar = tqdm(train_loader, desc='Training')

        for batch in train_pbar:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            model.zero_grad()

            # Passing labels makes the model return the loss with the logits.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            # Cap the global gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()  # the schedule advances per batch, not per epoch

            train_pbar.set_postfix({'loss': loss.item()})

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids, attention_mask, labels = [b.to(device) for b in batch]

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_val_loss += outputs.loss.item()

                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Accuracy: {val_accuracy:.4f}')

    # --- Final Evaluation ---
    # Predictions of the fully trained fold model on this fold's validation part.
    model.eval()
    final_preds = []
    final_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            final_preds.extend(preds.cpu().numpy())
            final_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(final_labels, final_preds)
    # `labels` lists every class id explicitly: the folds are not stratified, so
    # a class may be missing from both the true and the predicted labels of a
    # fold, and without `labels` the report would then reject `target_names`
    # for having a different length.
    report = classification_report(
        final_labels,
        final_preds,
        labels=list(range(num_labels)),
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0
    )

    fold_result = {
        'fold': fold_no,
        'accuracy': accuracy,
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall'],
        'f1_score': report['macro avg']['f1-score'],
        'time': time.time() - start_time,
        'report': report
    }

    fold_results.append(fold_result)

    print(f"\n{'='*60}")
    print(f"Fold {fold_no} Results:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {report['macro avg']['precision']:.4f}")
    print(f"  Recall:    {report['macro avg']['recall']:.4f}")
    print(f"  F1-Score:  {report['macro avg']['f1-score']:.4f}")
    print(f"  Time:      {fold_result['time']:.2f}s")
    print(f"{'='*60}")

    # Clean up memory: the optimizer and scheduler also reference the model's
    # parameters, so all three must go before cached GPU memory can be released.
    del model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# --- Final Results ---
print(f"\n{'='*60}")
print("FINAL K-FOLD CROSS-VALIDATION RESULTS")
print(f"{'='*60}")

# Per-fold macro-averaged metrics, reported below as mean ± std across folds.
accuracies = [f['accuracy'] for f in fold_results]
precisions = [f['precision'] for f in fold_results]
recalls = [f['recall'] for f in fold_results]
f1_scores = [f['f1_score'] for f in fold_results]

print(f"\nAccuracy:  {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}")
print(f"Recall:    {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")
print(f"F1-Score:  {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

print(f"\nTotal Time: {sum(f['time'] for f in fold_results):.2f}s")
print(f"Average Time per Fold: {np.mean([f['time'] for f in fold_results]):.2f}s")



Training fold 1...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [05:15<00:00,  1.46it/s, loss=3.01]
Validation: 100%|██████████| 230/230 [00:50<00:00,  4.54it/s]


Train Loss: 3.6241
Val Loss: 3.2501
Val Accuracy: 0.1835

Epoch 2/3


Training: 100%|██████████| 460/460 [05:23<00:00,  1.42it/s, loss=0.943]
Validation: 100%|██████████| 230/230 [00:50<00:00,  4.57it/s]


Train Loss: 2.4713
Val Loss: 1.5911
Val Accuracy: 0.7370

Epoch 3/3


Training: 100%|██████████| 460/460 [05:24<00:00,  1.42it/s, loss=0.283]
Validation: 100%|██████████| 230/230 [00:50<00:00,  4.56it/s]


Train Loss: 1.1675
Val Loss: 0.8600
Val Accuracy: 0.8334

Fold 1 Results:
  Accuracy:  0.8334
  Precision: 0.8479
  Recall:    0.8186
  F1-Score:  0.8182
  Time:      1276.76s

Training fold 2...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [05:24<00:00,  1.42it/s, loss=3.22]
Validation: 100%|██████████| 230/230 [00:50<00:00,  4.57it/s]


Train Loss: 3.6438
Val Loss: 3.2126
Val Accuracy: 0.2559

Epoch 2/3


Training: 100%|██████████| 460/460 [05:23<00:00,  1.42it/s, loss=1.98]
Validation: 100%|██████████| 230/230 [00:50<00:00,  4.57it/s]


Train Loss: 2.4125
Val Loss: 1.5074
Val Accuracy: 0.7672

Epoch 3/3


Training: 100%|██████████| 460/460 [05:23<00:00,  1.42it/s, loss=2.3]
Validation: 100%|██████████| 230/230 [00:50<00:00,  4.56it/s]


Train Loss: 1.1463
Val Loss: 0.8401
Val Accuracy: 0.8339

Fold 2 Results:
  Accuracy:  0.8339
  Precision: 0.8373
  Recall:    0.8173
  F1-Score:  0.8180
  Time:      1283.37s

Training fold 3...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [05:11<00:00,  1.48it/s, loss=2.93]
Validation: 100%|██████████| 230/230 [00:49<00:00,  4.61it/s]


Train Loss: 3.6207
Val Loss: 3.0754
Val Accuracy: 0.3757

Epoch 2/3


Training: 100%|██████████| 460/460 [05:24<00:00,  1.42it/s, loss=2.25]
Validation: 100%|██████████| 230/230 [00:50<00:00,  4.56it/s]


Train Loss: 2.3552
Val Loss: 1.5027
Val Accuracy: 0.7620

Epoch 3/3


Training: 100%|██████████| 460/460 [05:23<00:00,  1.42it/s, loss=0.25]
Validation: 100%|██████████| 230/230 [00:50<00:00,  4.56it/s]


Train Loss: 1.1427
Val Loss: 0.8135
Val Accuracy: 0.8424

Fold 3 Results:
  Accuracy:  0.8424
  Precision: 0.8508
  Recall:    0.8262
  F1-Score:  0.8304
  Time:      1273.18s

FINAL K-FOLD CROSS-VALIDATION RESULTS

Accuracy:  0.8366 ± 0.0041
Precision: 0.8453 ± 0.0058
Recall:    0.8207 ± 0.0039
F1-Score:  0.8222 ± 0.0058

Total Time: 3833.32s
Average Time per Fold: 1277.77s


### Ablation Studies: Learning rate 2e-5 -> Learning rate 5e-5

In [None]:
# --- Configuration ---
# Ablation run: LEARNING_RATE is 5e-5 here (the section heading lists 2e-5 as the reference value).
EPOCHS = 3
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01
SEED = 42  # drives both the fold split and the per-fold torch RNG

# --- Step 6: Cross-Validation Setup ---
kfold = KFold(n_splits=3, shuffle=True, random_state=SEED)

# Integer ids 0..n_classes-1 in label_encoder order. Passing them as `labels=` keeps
# `target_names` aligned even when a validation fold happens to miss a class.
class_ids = np.arange(len(label_encoder.classes_))

# One dict of metrics per fold; the summary at the bottom is derived from this list.
fold_results = []

# --- Step 7: K-Fold Cross-Validation ---
for fold_no, (train_index, val_index) in enumerate(kfold.split(X_train), start=1):
    print(f"\nTraining fold {fold_no}...")

    # Re-seed torch for every fold so the classifier-head initialisation and the
    # DataLoader shuffling order are repeatable across runs.
    torch.manual_seed(SEED + fold_no)

    # Track time for this fold
    start_time = time.time()

    # Split data
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Tokenize each split; anything longer than MAX_LENGTH tokens is truncated.
    train_encodings = tokenizer(
        list(X_train_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    val_encodings = tokenizer(
        list(X_val_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    # Create datasets
    train_dataset = torch.utils.data.TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(y_train_fold.values)
    )

    val_dataset = torch.utils.data.TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        torch.tensor(y_val_fold.values)
    )

    # Create dataloaders (only the training batches are shuffled)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # --- Load Model (PyTorch version) ---
    # A fresh pretrained model per fold so no weights leak between folds.
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    ).to(device)

    # --- Optimizer & Scheduler ---
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=WEIGHT_DECAY)

    # NOTE(review): start_factor=1e-6 -> end_factor=1.0 spread over every training step
    # is a pure linear ramp-up: the LR only reaches LEARNING_RATE on the last step and
    # never decays. Left unchanged so this run stays comparable with the other runs in
    # the notebook; confirm whether warmup followed by decay was intended.
    total_steps = len(train_loader) * EPOCHS
    scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer,
        start_factor=1e-6,
        end_factor=1.0,
        total_iters=total_steps
    )

    # --- Training Loop ---
    for epoch in range(EPOCHS):
        print(f'\nEpoch {epoch + 1}/{EPOCHS}')

        # Training
        model.train()
        total_train_loss = 0.0
        train_pbar = tqdm(train_loader, desc='Training')

        for batch in train_pbar:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            batch_loss = loss.item()
            total_train_loss += batch_loss

            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()  # stepped per batch, matching total_steps above

            train_pbar.set_postfix({'loss': batch_loss})

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids, attention_mask, labels = [b.to(device) for b in batch]

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_val_loss += outputs.loss.item()

                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Accuracy: {val_accuracy:.4f}')

    # --- Final Evaluation ---
    # The weights have not changed since the last epoch's validation sweep (eval mode,
    # unshuffled val_loader), so those predictions already are the final ones; a second
    # pass over val_loader would only repeat them. Requires EPOCHS >= 1.
    final_preds, final_labels = all_preds, all_labels

    # Calculate metrics
    accuracy = accuracy_score(final_labels, final_preds)
    report = classification_report(
        final_labels,
        final_preds,
        labels=class_ids,
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0
    )

    fold_result = {
        'fold': fold_no,
        'accuracy': accuracy,
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall'],
        'f1_score': report['macro avg']['f1-score'],
        'time': time.time() - start_time,
        'report': report
    }

    fold_results.append(fold_result)

    print(f"\n{'='*60}")
    print(f"Fold {fold_no} Results:")
    print(f"  Accuracy:  {fold_result['accuracy']:.4f}")
    print(f"  Precision: {fold_result['precision']:.4f}")
    print(f"  Recall:    {fold_result['recall']:.4f}")
    print(f"  F1-Score:  {fold_result['f1_score']:.4f}")
    print(f"  Time:      {fold_result['time']:.2f}s")
    print(f"{'='*60}")

    # Clean up memory. The optimizer (and the scheduler through it) still references
    # the parameters, so all three must go before the next fold loads a new model;
    # deleting only `model` would keep the old weights and optimizer state alive.
    del model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# --- Final Results ---
print(f"\n{'='*60}")
print("FINAL K-FOLD CROSS-VALIDATION RESULTS")
print(f"{'='*60}")

accuracies = [f['accuracy'] for f in fold_results]
precisions = [f['precision'] for f in fold_results]
recalls = [f['recall'] for f in fold_results]
f1_scores = [f['f1_score'] for f in fold_results]
fold_times = [f['time'] for f in fold_results]

# Mean ± population standard deviation (np.std default) across folds.
print(f"\nAccuracy:  {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}")
print(f"Recall:    {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")
print(f"F1-Score:  {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

print(f"\nTotal Time: {sum(fold_times):.2f}s")
print(f"Average Time per Fold: {np.mean(fold_times):.2f}s")



Training fold 1...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [02:46<00:00,  2.77it/s, loss=1.8]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.59it/s]


Train Loss: 3.3001
Val Loss: 2.2047
Val Accuracy: 0.6523

Epoch 2/3


Training: 100%|██████████| 460/460 [02:46<00:00,  2.77it/s, loss=0.987]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.59it/s]


Train Loss: 1.5433
Val Loss: 1.1410
Val Accuracy: 0.7550

Epoch 3/3


Training: 100%|██████████| 460/460 [02:45<00:00,  2.77it/s, loss=0.129]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.58it/s]


Train Loss: 0.9194
Val Loss: 0.9649
Val Accuracy: 0.7822

Fold 1 Results:
  Accuracy:  0.7822
  Precision: 0.7806
  Recall:    0.7578
  F1-Score:  0.7588
  Time:      726.89s

Training fold 2...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [02:46<00:00,  2.77it/s, loss=2.78]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.57it/s]


Train Loss: 3.3144
Val Loss: 2.2674
Val Accuracy: 0.6352

Epoch 2/3


Training: 100%|██████████| 460/460 [02:45<00:00,  2.77it/s, loss=0.738]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.57it/s]


Train Loss: 1.5808
Val Loss: 1.1154
Val Accuracy: 0.7678

Epoch 3/3


Training: 100%|██████████| 460/460 [02:45<00:00,  2.78it/s, loss=2.69]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.53it/s]


Train Loss: 0.9172
Val Loss: 0.9377
Val Accuracy: 0.7849

Fold 2 Results:
  Accuracy:  0.7849
  Precision: 0.8055
  Recall:    0.7633
  F1-Score:  0.7735
  Time:      713.41s

Training fold 3...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [02:46<00:00,  2.77it/s, loss=1.4]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.55it/s]


Train Loss: 3.3801
Val Loss: 2.2929
Val Accuracy: 0.6104

Epoch 2/3


Training: 100%|██████████| 460/460 [02:45<00:00,  2.77it/s, loss=0.238]
Validation: 100%|██████████| 230/230 [00:27<00:00,  8.49it/s]


Train Loss: 1.5846
Val Loss: 1.0699
Val Accuracy: 0.7735

Epoch 3/3


Training: 100%|██████████| 460/460 [02:46<00:00,  2.77it/s, loss=0.338]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.54it/s]


Train Loss: 0.9182
Val Loss: 0.9038
Val Accuracy: 0.7947

Fold 3 Results:
  Accuracy:  0.7947
  Precision: 0.8074
  Recall:    0.7863
  F1-Score:  0.7887
  Time:      714.94s

FINAL K-FOLD CROSS-VALIDATION RESULTS

Accuracy:  0.7873 ± 0.0054
Precision: 0.7979 ± 0.0122
Recall:    0.7691 ± 0.0123
F1-Score:  0.7737 ± 0.0122

Total Time: 2155.24s
Average Time per Fold: 718.41s


### Ablation Studies: Weight decay 0.01 -> Weight decay 0.1

In [None]:
# --- Configuration ---
# Ablation run: WEIGHT_DECAY is 0.1 here (the section heading lists 0.01 as the reference value).
EPOCHS = 3
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.1
SEED = 42  # drives both the fold split and the per-fold torch RNG

# --- Step 6: Cross-Validation Setup ---
kfold = KFold(n_splits=3, shuffle=True, random_state=SEED)

# Integer ids 0..n_classes-1 in label_encoder order. Passing them as `labels=` keeps
# `target_names` aligned even when a validation fold happens to miss a class.
class_ids = np.arange(len(label_encoder.classes_))

# One dict of metrics per fold; the summary at the bottom is derived from this list.
fold_results = []

# --- Step 7: K-Fold Cross-Validation ---
for fold_no, (train_index, val_index) in enumerate(kfold.split(X_train), start=1):
    print(f"\nTraining fold {fold_no}...")

    # Re-seed torch for every fold so the classifier-head initialisation and the
    # DataLoader shuffling order are repeatable across runs.
    torch.manual_seed(SEED + fold_no)

    # Track time for this fold
    start_time = time.time()

    # Split data
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Tokenize each split; anything longer than MAX_LENGTH tokens is truncated.
    train_encodings = tokenizer(
        list(X_train_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    val_encodings = tokenizer(
        list(X_val_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    # Create datasets
    train_dataset = torch.utils.data.TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(y_train_fold.values)
    )

    val_dataset = torch.utils.data.TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        torch.tensor(y_val_fold.values)
    )

    # Create dataloaders (only the training batches are shuffled)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # --- Load Model (PyTorch version) ---
    # A fresh pretrained model per fold so no weights leak between folds.
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    ).to(device)

    # --- Optimizer & Scheduler ---
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=WEIGHT_DECAY)

    # NOTE(review): start_factor=1e-6 -> end_factor=1.0 spread over every training step
    # is a pure linear ramp-up: the LR only reaches LEARNING_RATE on the last step and
    # never decays. Left unchanged so this run stays comparable with the other runs in
    # the notebook; confirm whether warmup followed by decay was intended.
    total_steps = len(train_loader) * EPOCHS
    scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer,
        start_factor=1e-6,
        end_factor=1.0,
        total_iters=total_steps
    )

    # --- Training Loop ---
    for epoch in range(EPOCHS):
        print(f'\nEpoch {epoch + 1}/{EPOCHS}')

        # Training
        model.train()
        total_train_loss = 0.0
        train_pbar = tqdm(train_loader, desc='Training')

        for batch in train_pbar:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            batch_loss = loss.item()
            total_train_loss += batch_loss

            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()  # stepped per batch, matching total_steps above

            train_pbar.set_postfix({'loss': batch_loss})

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids, attention_mask, labels = [b.to(device) for b in batch]

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_val_loss += outputs.loss.item()

                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Accuracy: {val_accuracy:.4f}')

    # --- Final Evaluation ---
    # The weights have not changed since the last epoch's validation sweep (eval mode,
    # unshuffled val_loader), so those predictions already are the final ones; a second
    # pass over val_loader would only repeat them. Requires EPOCHS >= 1.
    final_preds, final_labels = all_preds, all_labels

    # Calculate metrics
    accuracy = accuracy_score(final_labels, final_preds)
    report = classification_report(
        final_labels,
        final_preds,
        labels=class_ids,
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0
    )

    fold_result = {
        'fold': fold_no,
        'accuracy': accuracy,
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall'],
        'f1_score': report['macro avg']['f1-score'],
        'time': time.time() - start_time,
        'report': report
    }

    fold_results.append(fold_result)

    print(f"\n{'='*60}")
    print(f"Fold {fold_no} Results:")
    print(f"  Accuracy:  {fold_result['accuracy']:.4f}")
    print(f"  Precision: {fold_result['precision']:.4f}")
    print(f"  Recall:    {fold_result['recall']:.4f}")
    print(f"  F1-Score:  {fold_result['f1_score']:.4f}")
    print(f"  Time:      {fold_result['time']:.2f}s")
    print(f"{'='*60}")

    # Clean up memory. The optimizer (and the scheduler through it) still references
    # the parameters, so all three must go before the next fold loads a new model;
    # deleting only `model` would keep the old weights and optimizer state alive.
    del model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# --- Final Results ---
print(f"\n{'='*60}")
print("FINAL K-FOLD CROSS-VALIDATION RESULTS")
print(f"{'='*60}")

accuracies = [f['accuracy'] for f in fold_results]
precisions = [f['precision'] for f in fold_results]
recalls = [f['recall'] for f in fold_results]
f1_scores = [f['f1_score'] for f in fold_results]
fold_times = [f['time'] for f in fold_results]

# Mean ± population standard deviation (np.std default) across folds.
print(f"\nAccuracy:  {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}")
print(f"Recall:    {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")
print(f"F1-Score:  {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

print(f"\nTotal Time: {sum(fold_times):.2f}s")
print(f"Average Time per Fold: {np.mean(fold_times):.2f}s")



Training fold 1...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [02:46<00:00,  2.77it/s, loss=3.57]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.59it/s]


Train Loss: 3.6321
Val Loss: 3.1723
Val Accuracy: 0.3488

Epoch 2/3


Training: 100%|██████████| 460/460 [02:45<00:00,  2.77it/s, loss=1.13]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.57it/s]


Train Loss: 2.4392
Val Loss: 1.6497
Val Accuracy: 0.7310

Epoch 3/3


Training: 100%|██████████| 460/460 [02:45<00:00,  2.77it/s, loss=0.554]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.53it/s]


Train Loss: 1.3201
Val Loss: 1.0786
Val Accuracy: 0.7727

Fold 1 Results:
  Accuracy:  0.7727
  Precision: 0.7922
  Recall:    0.7610
  F1-Score:  0.7628
  Time:      714.69s

Training fold 2...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [02:46<00:00,  2.77it/s, loss=3.75]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.58it/s]


Train Loss: 3.6482
Val Loss: 3.1350
Val Accuracy: 0.4005

Epoch 2/3


Training: 100%|██████████| 460/460 [02:45<00:00,  2.77it/s, loss=1.74]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.55it/s]


Train Loss: 2.4146
Val Loss: 1.6082
Val Accuracy: 0.7365

Epoch 3/3


Training: 100%|██████████| 460/460 [02:45<00:00,  2.77it/s, loss=0.641]
Validation: 100%|██████████| 230/230 [00:26<00:00,  8.56it/s]


Train Loss: 1.3055
Val Loss: 1.0592
Val Accuracy: 0.7746

Fold 2 Results:
  Accuracy:  0.7746
  Precision: 0.7949
  Recall:    0.7434
  F1-Score:  0.7435
  Time:      713.75s

Training fold 3...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3


Training: 100%|██████████| 460/460 [02:46<00:00,  2.77it/s, loss=3.5]
Validation:  37%|███▋      | 86/230 [00:10<00:16,  8.56it/s]

### Ablation Studies: Epochs 3 -> Epochs 4

In [None]:
# --- Configuration ---
# Ablation run: EPOCHS is 4 here (the section heading lists 3 as the reference value).
EPOCHS = 4
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
SEED = 42  # drives both the fold split and the per-fold torch RNG

# --- Step 6: Cross-Validation Setup ---
kfold = KFold(n_splits=3, shuffle=True, random_state=SEED)

# Integer ids 0..n_classes-1 in label_encoder order. Passing them as `labels=` keeps
# `target_names` aligned even when a validation fold happens to miss a class.
class_ids = np.arange(len(label_encoder.classes_))

# One dict of metrics per fold; the summary at the bottom is derived from this list.
fold_results = []

# --- Step 7: K-Fold Cross-Validation ---
for fold_no, (train_index, val_index) in enumerate(kfold.split(X_train), start=1):
    print(f"\nTraining fold {fold_no}...")

    # Re-seed torch for every fold so the classifier-head initialisation and the
    # DataLoader shuffling order are repeatable across runs.
    torch.manual_seed(SEED + fold_no)

    # Track time for this fold
    start_time = time.time()

    # Split data
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Tokenize each split; anything longer than MAX_LENGTH tokens is truncated.
    train_encodings = tokenizer(
        list(X_train_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    val_encodings = tokenizer(
        list(X_val_fold),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )

    # Create datasets
    train_dataset = torch.utils.data.TensorDataset(
        train_encodings['input_ids'],
        train_encodings['attention_mask'],
        torch.tensor(y_train_fold.values)
    )

    val_dataset = torch.utils.data.TensorDataset(
        val_encodings['input_ids'],
        val_encodings['attention_mask'],
        torch.tensor(y_val_fold.values)
    )

    # Create dataloaders (only the training batches are shuffled)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # --- Load Model (PyTorch version) ---
    # A fresh pretrained model per fold so no weights leak between folds.
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels
    ).to(device)

    # --- Optimizer & Scheduler ---
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8, weight_decay=WEIGHT_DECAY)

    # NOTE(review): start_factor=1e-6 -> end_factor=1.0 spread over every training step
    # is a pure linear ramp-up: the LR only reaches LEARNING_RATE on the last step and
    # never decays. Left unchanged so this run stays comparable with the other runs in
    # the notebook; confirm whether warmup followed by decay was intended.
    total_steps = len(train_loader) * EPOCHS
    scheduler = torch.optim.lr_scheduler.LinearLR(
        optimizer,
        start_factor=1e-6,
        end_factor=1.0,
        total_iters=total_steps
    )

    # --- Training Loop ---
    for epoch in range(EPOCHS):
        print(f'\nEpoch {epoch + 1}/{EPOCHS}')

        # Training
        model.train()
        total_train_loss = 0.0
        train_pbar = tqdm(train_loader, desc='Training')

        for batch in train_pbar:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            batch_loss = loss.item()
            total_train_loss += batch_loss

            loss.backward()
            # Clip the global gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()  # stepped per batch, matching total_steps above

            train_pbar.set_postfix({'loss': batch_loss})

        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_loader, desc='Validation'):
                input_ids, attention_mask, labels = [b.to(device) for b in batch]

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                total_val_loss += outputs.loss.item()

                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Accuracy: {val_accuracy:.4f}')

    # --- Final Evaluation ---
    # The weights have not changed since the last epoch's validation sweep (eval mode,
    # unshuffled val_loader), so those predictions already are the final ones; a second
    # pass over val_loader would only repeat them. Requires EPOCHS >= 1.
    final_preds, final_labels = all_preds, all_labels

    # Calculate metrics
    accuracy = accuracy_score(final_labels, final_preds)
    report = classification_report(
        final_labels,
        final_preds,
        labels=class_ids,
        target_names=label_encoder.classes_,
        output_dict=True,
        zero_division=0
    )

    fold_result = {
        'fold': fold_no,
        'accuracy': accuracy,
        'precision': report['macro avg']['precision'],
        'recall': report['macro avg']['recall'],
        'f1_score': report['macro avg']['f1-score'],
        'time': time.time() - start_time,
        'report': report
    }

    fold_results.append(fold_result)

    print(f"\n{'='*60}")
    print(f"Fold {fold_no} Results:")
    print(f"  Accuracy:  {fold_result['accuracy']:.4f}")
    print(f"  Precision: {fold_result['precision']:.4f}")
    print(f"  Recall:    {fold_result['recall']:.4f}")
    print(f"  F1-Score:  {fold_result['f1_score']:.4f}")
    print(f"  Time:      {fold_result['time']:.2f}s")
    print(f"{'='*60}")

    # Clean up memory. The optimizer (and the scheduler through it) still references
    # the parameters, so all three must go before the next fold loads a new model;
    # deleting only `model` would keep the old weights and optimizer state alive.
    del model, optimizer, scheduler
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# --- Final Results ---
print(f"\n{'='*60}")
print("FINAL K-FOLD CROSS-VALIDATION RESULTS")
print(f"{'='*60}")

accuracies = [f['accuracy'] for f in fold_results]
precisions = [f['precision'] for f in fold_results]
recalls = [f['recall'] for f in fold_results]
f1_scores = [f['f1_score'] for f in fold_results]
fold_times = [f['time'] for f in fold_results]

# Mean ± population standard deviation (np.std default) across folds.
print(f"\nAccuracy:  {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f} ± {np.std(precisions):.4f}")
print(f"Recall:    {np.mean(recalls):.4f} ± {np.std(recalls):.4f}")
print(f"F1-Score:  {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

print(f"\nTotal Time: {sum(fold_times):.2f}s")
print(f"Average Time per Fold: {np.mean(fold_times):.2f}s")


In [None]:
# After k-fold cross-validation: train one final model on the whole training split
# (all data the folds were drawn from) and score it once on the held-out test split.

# --- Configuration ---
# Declared here, like in the ablation cells, so this cell does not silently inherit
# EPOCHS / LEARNING_RATE / WEIGHT_DECAY from whichever ablation cell ran last.
# Values are the reference settings named in the ablation headings
# (lr 2e-5, weight decay 0.01, 3 epochs); edit them to train a different final recipe.
EPOCHS = 3
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
SEED = 42

# Seed torch so the classifier-head initialisation and batch shuffling are repeatable.
torch.manual_seed(SEED)

final_model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels
).to(device)

# NOTE(review): the cross-validation cells step a LinearLR scheduler every batch, while
# this cell trains at a constant LR. Confirm which recipe the final model should follow.
final_optimizer = AdamW(
    final_model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-8,
    weight_decay=WEIGHT_DECAY
)

# Tokenize the entire training data
final_train_encodings = tokenizer(
    list(X_train),
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
    return_tensors='pt'
)

final_train_dataset = torch.utils.data.TensorDataset(
    final_train_encodings['input_ids'],
    final_train_encodings['attention_mask'],
    torch.tensor(y_train.values)
)

final_train_loader = torch.utils.data.DataLoader(final_train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Train the model on the entire training data
final_model.train()
for epoch in range(EPOCHS):
    print(f'\nTraining Epoch {epoch + 1}/{EPOCHS} on Full Data')

    total_train_loss = 0.0
    train_pbar = tqdm(final_train_loader, desc='Training')

    for batch in train_pbar:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        final_optimizer.zero_grad()

        outputs = final_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(final_model.parameters(), 1.0)
        final_optimizer.step()

    avg_train_loss = total_train_loss / len(final_train_loader)
    print(f'Train Loss: {avg_train_loss:.4f}')

# Final evaluation on the test set using the model trained on all the data
final_model.eval()

# Tokenize the test set
final_test_encodings = tokenizer(
    list(X_test),
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
    return_tensors='pt'
)

# Score the test set in mini-batches: pushing every test example through BERT in a
# single forward pass makes peak memory grow with the size of the test set.
final_test_dataset = torch.utils.data.TensorDataset(
    final_test_encodings['input_ids'],
    final_test_encodings['attention_mask'],
    torch.tensor(y_test.values)
)
final_test_loader = torch.utils.data.DataLoader(final_test_dataset, batch_size=BATCH_SIZE)

batch_predictions = []
batch_labels = []

with torch.no_grad():
    for batch in tqdm(final_test_loader, desc='Testing'):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        outputs = final_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Keep results on the CPU so nothing accumulates on the device.
        batch_predictions.append(torch.argmax(logits, dim=1).cpu())
        batch_labels.append(labels.cpu())

predictions = torch.cat(batch_predictions)      # predicted class ids, test-set order
final_test_labels = torch.cat(batch_labels)     # true class ids, same order

# Evaluate the model's performance on the test set
y_true = final_test_labels.numpy()
y_pred = predictions.numpy()
test_accuracy = accuracy_score(y_true, y_pred)

# Print test accuracy and classification report
print(f"Test Accuracy: {test_accuracy:.4f}")

# Report per-class metrics under the class names, as the cross-validation cells do.
# `labels=` pins the id -> name mapping even if a class is absent from the test split.
class_ids = np.arange(len(label_encoder.classes_))
report = classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    output_dict=True,
    zero_division=0
)

# `report` keeps the dict form for programmatic use; the printed version is the
# plain-text table, which stays readable with many classes.
print("\nClassification Report on Test Set:")
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0
))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training Epoch 1/3 on Full Data


Training: 100%|██████████| 689/689 [04:18<00:00,  2.66it/s]


Train Loss: 2.0730

Training Epoch 2/3 on Full Data


Training: 100%|██████████| 689/689 [04:20<00:00,  2.65it/s]


Train Loss: 0.9004

Training Epoch 3/3 on Full Data


Training: 100%|██████████| 689/689 [04:19<00:00,  2.65it/s]


Train Loss: 0.6674
Test Accuracy: 0.8098

Classification Report on Test Set:
{'0': {'precision': 0.8333333333333334, 'recall': 0.8823529411764706, 'f1-score': 0.8571428571428571, 'support': 34.0}, '1': {'precision': 0.7297297297297297, 'recall': 0.9310344827586207, 'f1-score': 0.8181818181818182, 'support': 29.0}, '2': {'precision': 0.8421052631578947, 'recall': 0.6956521739130435, 'f1-score': 0.7619047619047619, 'support': 23.0}, '3': {'precision': 0.6388888888888888, 'recall': 0.71875, 'f1-score': 0.6764705882352942, 'support': 32.0}, '4': {'precision': 0.782608695652174, 'recall': 0.6, 'f1-score': 0.6792452830188679, 'support': 30.0}, '5': {'precision': 0.8076923076923077, 'recall': 0.84, 'f1-score': 0.8235294117647058, 'support': 25.0}, '6': {'precision': 0.7391304347826086, 'recall': 0.5666666666666667, 'f1-score': 0.6415094339622641, 'support': 30.0}, '7': {'precision': 0.8857142857142857, 'recall': 0.9393939393939394, 'f1-score': 0.9117647058823529, 'support': 33.0}, '8': {'prec

In [None]:
# Free-text resume snippet to push through the fine-tuned classifier (a one-item batch).
custom_text = ["Dynamic and results-driven Human Resources professional with [X] years of experience in recruitment, employee relations, and organizational development. Proven expertise in managing end-to-end recruitment processes, onboarding, performance management, and implementing HR policies to ensure a positive work culture. Strong communication and problem-solving skills with a focus on enhancing employee engagement and supporting business objectives."]

# Encode the snippet: truncate to MAX_LENGTH tokens, pad, and return PyTorch tensors.
custom_encodings = tokenizer(
    custom_text,
    max_length=MAX_LENGTH,
    padding=True,
    truncation=True,
    return_tensors='pt'
)

# Both model inputs have to live on the same device as final_model.
input_ids, attention_mask = (
    custom_encodings[key].to(device) for key in ('input_ids', 'attention_mask')
)

# Inference only: evaluation mode and no gradient tracking.
final_model.eval()
with torch.no_grad():
    outputs = final_model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    # Highest-scoring class index for the single input, as a plain Python int.
    prediction = logits.argmax(dim=1).item()

# Translate the class index into its name via the label encoder.
predicted_label = label_encoder.classes_[prediction]

# Show the result
print(f"Predicted Label: {predicted_label}")


Predicted Label: Human Resources
