In [2]:
#@title Import
import kagglehub

import os
import time
import warnings
import itertools
from copy import deepcopy

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.profiler import profile, record_function, ProfilerActivity

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
    BertConfig
)

from peft import LoraConfig, get_peft_model, TaskType

import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score
)
from sklearn.utils.class_weight import compute_class_weight

warnings.filterwarnings('ignore')


In [3]:
#@title Time and memory
def get_gpu_memory_mb():
    if torch.cuda.is_available():
        return torch.cuda.memory_allocated() / 1024**2
    return 0

def get_peak_gpu_memory_mb():
    if torch.cuda.is_available():
        return torch.cuda.max_memory_allocated() / 1024**2
    return 0

def reset_peak_memory():
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()

In [4]:
#@title load dataset
# Set cache directory (just the directory, not including specific files)
# Don't include the filename or 'versions/1/train.tsv' in the path
os.environ['KAGGLE_HUB_CACHE'] = '/root/.cache/kagglehub'

# Download the dataset
path = kagglehub.dataset_download("doanquanvietnamca/liar-dataset")

# CORRECT way to join paths - don't start with '/'
train_path = os.path.join(path, 'train.tsv')
valid_path = os.path.join(path, 'valid.tsv')
test_path = os.path.join(path, 'test.tsv')

print("Path to dataset directory:", path)
print("Path to train.tsv file:", train_path)
print("Path to valid.tsv file:", valid_path)
print("Path to test.tsv file:", test_path)


Using Colab cache for faster access to the 'liar-dataset' dataset.
Path to dataset directory: /kaggle/input/liar-dataset
Path to train.tsv file: /kaggle/input/liar-dataset/train.tsv
Path to valid.tsv file: /kaggle/input/liar-dataset/valid.tsv
Path to test.tsv file: /kaggle/input/liar-dataset/test.tsv


In [5]:
#@title Profiler function

def train_epoch_with_profiler(model, dataloader, optimizer, scheduler, criterion, device, prof):
    model.train()
    total_loss = 0
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    # Profile the first 5 batches
    for batch_idx, batch in enumerate(dataloader):
        if batch_idx < 5:
            with record_function("data_loading"):
                input_ids = batch['input_ids'].to(device, non_blocking=True)
                attention_mask = batch['attention_mask'].to(device, non_blocking=True)
                labels = batch['label'].to(device, non_blocking=True)

            optimizer.zero_grad()

            with record_function("forward_pass"):
                with torch.amp.autocast(device_type='cuda', dtype=dtype):
                    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                    loss = criterion(outputs.logits, labels)

            with record_function("backward_pass"):
                loss.backward()

            with record_function("optimizer_step"):
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()

            total_loss += loss.item()
            prof.step()
        else:
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['label'].to(device, non_blocking=True)

            optimizer.zero_grad()

            with torch.amp.autocast(device_type='cuda', dtype=dtype):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

    end_time = time.time()
    epoch_time = end_time - start_time
    peak_memory = torch.cuda.max_memory_allocated() / 1024**2  # MB

    return total_loss / len(dataloader), epoch_time, peak_memory

In [6]:
#@title Profiler function for baseline (no amp)


def train_epoch_with_profiler_baseline(model, dataloader, optimizer, scheduler, criterion, device, prof):
    model.train()
    total_loss = 0

    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    # Profile the first 5 batches
    for batch_idx, batch in enumerate(dataloader):
        if batch_idx < 5:
            with record_function("data_loading"):
                input_ids = batch['input_ids'].to(device, non_blocking=True)
                attention_mask = batch['attention_mask'].to(device, non_blocking=True)
                labels = batch['label'].to(device, non_blocking=True)

            optimizer.zero_grad()

            with record_function("forward_pass"):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, labels)

            with record_function("backward_pass"):
                loss.backward()

            with record_function("optimizer_step"):
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
                scheduler.step()

            total_loss += loss.item()
            prof.step()
        else:
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['label'].to(device, non_blocking=True)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

    end_time = time.time()
    epoch_time = end_time - start_time
    peak_memory = torch.cuda.max_memory_allocated() / 1024**2

    return total_loss / len(dataloader), epoch_time, peak_memory

In [7]:
#@title Only baseline

# Model Configuration
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 256
BATCH_SIZE = 128
LR = 2e-5
EPOCHS = 10
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_WORKERS = 8

TRAIN_PATH = '/kaggle/input/liar-dataset/train.tsv'
VALID_PATH = '/kaggle/input/liar-dataset/valid.tsv'
TEST_PATH  = '/kaggle/input/liar-dataset/test.tsv'

#1. Dataset Class
class TextualizedLIARDataset(Dataset):
    def __init__(self, tsv_path, tokenizer, max_len=128):
        self.df = pd.read_csv(tsv_path, sep='\t', header=None, names=[
            "id", "label", "statement", "subject", "speaker", "speaker_job",
            "state", "party", "barely_true_counts", "false_counts",
            "half_true_counts", "mostly_true_counts", "pants_on_fire_counts",
            "context"
        ])

        self.df.dropna(subset=['statement'], inplace=True)

        # Label logic: False/Pants-fire/Barely-true = 0 (Fake)
        self.label_map = {
            "pants-fire": 0, "false": 0, "barely-true": 0,
            "half-true": 1, "mostly-true": 1, "true": 1
        }

        self.df['label'] = self.df['label'].map(self.label_map)
        self.df.dropna(subset=['label'], inplace=True)
        self.df['label'] = self.df['label'].astype(int)

        text_cols = ['statement', 'subject', 'speaker', 'party', 'state', 'speaker_job', 'context']
        for col in text_cols:
            self.df[col] = self.df[col].fillna("Unknown")

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df.reset_index(drop=True, inplace=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        metadata_str = (
            f"Speaker: {row['speaker']} | "
            f"Job: {row['speaker_job']} | "
            f"Party: {row['party']} | "
            f"State: {row['state']} | "
            f"Context: {row['context']} | "
            f"Subject: {row['subject']}"
        )
        final_text = f"{metadata_str} [SEP] Statement: {row['statement']}"

        encoded = self.tokenizer(
            final_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(row['label'], dtype=torch.long)
        }

#2. Training and Evaluation Functions
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    #dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    reset_peak_memory()
    start_time = time.time()


    for batch in dataloader:
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    end_time = time.time()
    epoch_time = end_time - start_time

    peak_memory = get_peak_gpu_memory_mb()

    return total_loss / len(dataloader), epoch_time, peak_memory

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    reset_peak_memory()
    start_time = time.time()


    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    eval_time = time.time() - start_time
    peak_memory = get_peak_gpu_memory_mb()

    return total_loss / len(dataloader), accuracy_score(all_labels, all_preds), all_labels, all_preds, eval_time, peak_memory

#Main training
def run_official_split_training():
    torch.manual_seed(42)
    np.random.seed(42)
    print(f"Using device: {DEVICE} | Model: {MODEL_NAME}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    print("Loading Official Datasets (Train / Valid / Test)...")
    if not os.path.exists(TRAIN_PATH):
        print(f"Error: Path {TRAIN_PATH} not found.")
        return

    train_dataset = TextualizedLIARDataset(TRAIN_PATH, tokenizer, max_len=MAX_LEN)
    valid_dataset = TextualizedLIARDataset(VALID_PATH, tokenizer, max_len=MAX_LEN)
    test_dataset  = TextualizedLIARDataset(TEST_PATH, tokenizer, max_len=MAX_LEN)

    # Build DataLoader
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
    test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    print(f"Stats: Train={len(train_dataset)}, Valid={len(valid_dataset)}, Test={len(test_dataset)}")

    # Get Class Weights (directly from train_dataset)
    print("Calculating class weights from Training set...")
    train_labels = train_dataset.df['label'].values

    # Automatically compute class weights
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

    print(f"Class Weights: {class_weights} (Index 0 is Fake, Index 1 is True)")

    # Define the weighted loss
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

    # Model Loading and Configuration (Dropout 0.3)
    print("Loading model with increased dropout...")
    config = BertConfig.from_pretrained(MODEL_NAME)
    config.hidden_dropout_prob = 0.3
    config.attention_probs_dropout_prob = 0.3
    config.num_labels = 2

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

    # calculate the num of parameters
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"trainable params: {trainable:,} || all params: {total:,} || trainable%: {100*trainable/total:.4f}")

    model.to(DEVICE)

    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)

    total_train_time = 0
    epoch_times = []
    epoch_memories = []


    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)

    best_val_f1 = 0

    print("\nStarting Training on Official Split...")

    for epoch in range(1, EPOCHS + 1):

      if epoch == 2:
        print("Profiling enabled for epoch 2 ...")

        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
            profile_memory=True,
            with_stack=False
        ) as prof:
            train_loss, train_time, train_memory = train_epoch_with_profiler_baseline(
                model, train_loader, optimizer, scheduler, criterion, DEVICE, prof
            )


        print("PROFILER SUMMARY (Epoch 2)")


        print("\nTop Operations by CPU Time:")
        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=15))

        print("\nTop Operations by CUDA Time:")
        try:
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))
        except:
            print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=15))

        print("\nTop Operations by Memory:")
        print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=15))

        trace_file = "profiler_trace_baselineonly_epoch1.json"
        prof.export_chrome_trace(trace_file)
        print(f"\n Chrome trace saved to: {trace_file}")
        print("  Download and open in chrome://tracing")
      else:
        train_loss, train_time, train_memory = train_epoch(
          model, train_loader, optimizer, scheduler, criterion, DEVICE
        )

      val_loss, val_acc, val_labels, val_preds, val_time, val_memory = evaluate(
          model, valid_loader, criterion, DEVICE
      )

      total_train_time += train_time
      epoch_times.append(train_time)
      epoch_memories.append(train_memory)

      report_dict = classification_report(val_labels, val_preds, output_dict=True)
      macro_f1 = report_dict['macro avg']['f1-score']
      fake_recall = report_dict['0']['recall']

      print(f"Epoch {epoch}/{EPOCHS} | Val Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | Macro-F1: {macro_f1:.4f} | Fake Recall: {fake_recall:.4f}|  T_Time: {train_time:.1f}s | T_Mem: {train_memory:.0f}MB | V_Time: {val_time:.2f}s | V_Memory: {val_memory:.1f} MB")

      if macro_f1 > best_val_f1:
          best_val_f1 = macro_f1
          torch.save(model.state_dict(), 'best_baselineonly_model.pth')
          print(" -> Best baselineamp model updated!")

    print(f"Total Training Time: {total_train_time:.1f}s")
    print(f"Avg Epoch Time: {sum(epoch_times)/len(epoch_times):.1f}s")
    print(f"Avg Peak Memory: {sum(epoch_memories)/len(epoch_memories):.0f}MB")

    # automatically run best model on Official Test Set after training
    print("\n========== FINAL TEST RESULT (Official Test Set) ==========")

    model.load_state_dict(torch.load('best_baselineonly_model.pth'))
    test_loss, test_acc, test_labels, test_preds, test_time, test_memory = evaluate(model, test_loader, criterion, DEVICE)
    print(classification_report(test_labels, test_preds, target_names=['Fake (0)', 'True (1)']))

if __name__ == "__main__":
    run_official_split_training()

Using device: cuda | Model: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Loading Official Datasets (Train / Valid / Test)...
Stats: Train=10240, Valid=1284, Test=1267
Calculating class weights from Training set...
Class Weights: [1.14081996 0.89012517] (Index 0 is Fake, Index 1 is True)
Loading model with increased dropout...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 109,483,778 || all params: 109,483,778 || trainable%: 100.0000

Starting Training on Official Split...
Epoch 1/10 | Val Loss: 0.6858 | Acc: 0.5927 | Macro-F1: 0.5680 | Fake Recall: 0.3685|  T_Time: 85.5s | T_Mem: 21032MB | V_Time: 3.74s | V_Memory: 2780.7 MB
 -> Best baselineamp model updated!
Profiling enabled for epoch 2 ...
PROFILER SUMMARY (Epoch 2)

Top Operations by CPU Time:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  --

In [8]:
#@title rebuild for baseline only

# 1. Re-prepare environment and data

# Redefine configuration
MODEL_NAME = 'bert-base-uncased'
BATCH_SIZE = 128
MAX_LEN = 256
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
VALID_PATH = '/kaggle/input/liar-dataset/valid.tsv'
TEST_PATH =  '/kaggle/input/liar-dataset/test.tsv'

print("Re-loading Tokenizer and Dataloaders...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Re-instantiate datasets (ensure TextualizedLIARDataset class has been run earlier)
valid_dataset = TextualizedLIARDataset(VALID_PATH, tokenizer, max_len=MAX_LEN)
test_dataset  = TextualizedLIARDataset(TEST_PATH, tokenizer, max_len=MAX_LEN)

# Re-instantiate DataLoader
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)


# 2. Define function to find optimal threshold

def find_optimal_threshold(model, dataloader, device):
    model.eval()
    all_probs = []
    all_labels = []

    print("Running inference on Validation Set...")
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Softmax to obtain probabilities
            probs = F.softmax(outputs.logits, dim=1)
            # Extract probability for Label 1 (True)
            true_probs = probs[:, 1].cpu().numpy()

            all_probs.extend(true_probs)
            all_labels.extend(labels.cpu().numpy())

    all_probs = np.array(all_probs)
    all_labels = np.array(all_labels)

    # Iterate to find best F1 score
    best_threshold = 0.5
    best_f1 = 0

    thresholds = np.arange(0.1, 0.95, 0.05)

    print(f"\n{'Threshold':<10} | {'Macro F1':<10} | {'Fake Recall':<12} | {'True Recall':<12}")
    print("-" * 60)

    for thresh in thresholds:
        preds = (all_probs > thresh).astype(int)

        report = classification_report(all_labels, preds, output_dict=True)
        macro_f1 = report['macro avg']['f1-score']
        fake_recall = report['0']['recall']
        true_recall = report['1']['recall']

        print(f"{thresh:.2f}       | {macro_f1:.4f}     | {fake_recall:.4f}       | {true_recall:.4f}")

        if macro_f1 > best_f1:
            best_f1 = macro_f1
            best_threshold = thresh

    print(f"\nBest Threshold found: {best_threshold:.2f}")
    return best_threshold


# 3. Run optimization

# Load model
print("\nLoading model weights from 'best_baselineonly_model.pth'...")
# Must reinitialize model structure before loading weights
config = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).config
config.num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model.load_state_dict(torch.load('best_baselineonly_model.pth'))
model.to(DEVICE)

# 1. Search for best threshold on Validation Set
best_thresh = find_optimal_threshold(model, valid_loader, DEVICE)

# 2. Apply to Test Set
print(f"\nApplying Threshold {best_thresh:.2f} to Test Set...")
model.eval()
test_probs = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = F.softmax(outputs.logits, dim=1)
        test_probs.extend(probs[:, 1].cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_probs = np.array(test_probs)
final_preds = (test_probs > best_thresh).astype(int)

print("\n========== OPTIMIZED TEST RESULT ==========")
print(classification_report(test_labels, final_preds, target_names=['Fake (0)', 'True (1)']))


Re-loading Tokenizer and Dataloaders...

Loading model weights from 'best_baselineonly_model.pth'...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running inference on Validation Set...

Threshold  | Macro F1   | Fake Recall  | True Recall 
------------------------------------------------------------
0.10       | 0.3422     | 0.0000       | 1.0000
0.15       | 0.3511     | 0.0081       | 1.0000
0.20       | 0.3764     | 0.0325       | 0.9970
0.25       | 0.4105     | 0.0682       | 0.9895
0.30       | 0.4669     | 0.1347       | 0.9731
0.35       | 0.5244     | 0.2143       | 0.9506
0.40       | 0.5793     | 0.3068       | 0.9192
0.45       | 0.6237     | 0.3880       | 0.8982
0.50       | 0.6443     | 0.4578       | 0.8503
0.55       | 0.6651     | 0.5422       | 0.7934
0.60       | 0.6672     | 0.6201       | 0.7141
0.65       | 0.6588     | 0.7013       | 0.6198
0.70       | 0.6420     | 0.7776       | 0.5240
0.75       | 0.6037     | 0.8328       | 0.4177
0.80       | 0.5421     | 0.8896       | 0.2889
0.85       | 0.4826     | 0.9578       | 0.1781
0.90       | 0.3722     | 0.9935       | 0.0464

Best Threshold found: 0.60



In [None]:
#@title baseline+Amp


# Model Configuration
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 256
BATCH_SIZE = 128
LR = 2e-5
EPOCHS = 10
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_WORKERS = 8

TRAIN_PATH = '/kaggle/input/liar-dataset/train.tsv'
VALID_PATH = '/kaggle/input/liar-dataset/valid.tsv'
TEST_PATH  = '/kaggle/input/liar-dataset/test.tsv'

#1. Dataset Class
class TextualizedLIARDataset(Dataset):
    def __init__(self, tsv_path, tokenizer, max_len=128):
        self.df = pd.read_csv(tsv_path, sep='\t', header=None, names=[
            "id", "label", "statement", "subject", "speaker", "speaker_job",
            "state", "party", "barely_true_counts", "false_counts",
            "half_true_counts", "mostly_true_counts", "pants_on_fire_counts",
            "context"
        ])

        self.df.dropna(subset=['statement'], inplace=True)

        #Label logic：False/Pants-fire/Barely-true = 0 (Fake)
        self.label_map = {
            "pants-fire": 0, "false": 0, "barely-true": 0,
            "half-true": 1, "mostly-true": 1, "true": 1
        }

        self.df['label'] = self.df['label'].map(self.label_map)
        self.df.dropna(subset=['label'], inplace=True)
        self.df['label'] = self.df['label'].astype(int)

        text_cols = ['statement', 'subject', 'speaker', 'party', 'state', 'speaker_job', 'context']
        for col in text_cols:
            self.df[col] = self.df[col].fillna("Unknown")

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df.reset_index(drop=True, inplace=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        metadata_str = (
            f"Speaker: {row['speaker']} | "
            f"Job: {row['speaker_job']} | "
            f"Party: {row['party']} | "
            f"State: {row['state']} | "
            f"Context: {row['context']} | "
            f"Subject: {row['subject']}"
        )
        final_text = f"{metadata_str} [SEP] Statement: {row['statement']}"

        encoded = self.tokenizer(
            final_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(row['label'], dtype=torch.long)
        }

#2. Training and Evaluation Functions
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    reset_peak_memory()
    start_time = time.time()


    for batch in dataloader:
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda', dtype=dtype):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    end_time = time.time()
    epoch_time = end_time - start_time

    peak_memory = get_peak_gpu_memory_mb()

    return total_loss / len(dataloader), epoch_time, peak_memory

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    reset_peak_memory()
    start_time = time.time()


    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    eval_time = time.time() - start_time
    peak_memory = get_peak_gpu_memory_mb()

    return total_loss / len(dataloader), accuracy_score(all_labels, all_preds), all_labels, all_preds, eval_time, peak_memory

#Main training
def run_official_split_training():
    torch.manual_seed(42)
    np.random.seed(42)
    print(f"Using device: {DEVICE} | Model: {MODEL_NAME}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    print("Loading Official Datasets (Train / Valid / Test)...")
    if not os.path.exists(TRAIN_PATH):
        print(f"Error: Path {TRAIN_PATH} not found.")
        return

    train_dataset = TextualizedLIARDataset(TRAIN_PATH, tokenizer, max_len=MAX_LEN)
    valid_dataset = TextualizedLIARDataset(VALID_PATH, tokenizer, max_len=MAX_LEN)
    test_dataset  = TextualizedLIARDataset(TEST_PATH, tokenizer, max_len=MAX_LEN)

    # Build DataLoader
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
    test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

    print(f"Stats: Train={len(train_dataset)}, Valid={len(valid_dataset)}, Test={len(test_dataset)}")

    # Get Class Weights (directly from train_dataset)
    print("Calculating class weights from Training set...")
    train_labels = train_dataset.df['label'].values

    # Automatically compute class weights
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

    print(f"Class Weights: {class_weights} (Index 0 is Fake, Index 1 is True)")

    # Define the weighted loss
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

    # Model Loading and Configuration (Dropout 0.3)
    print("Loading model with increased dropout...")
    config = BertConfig.from_pretrained(MODEL_NAME)
    config.hidden_dropout_prob = 0.3
    config.attention_probs_dropout_prob = 0.3
    config.num_labels = 2

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)

    # calculate the num of parameters
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"trainable params: {trainable:,} || all params: {total:,} || trainable%: {100*trainable/total:.4f}")

    model.to(DEVICE)

    optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)

    total_train_time = 0
    epoch_times = []
    epoch_memories = []


    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)

    best_val_f1 = 0

    print("\nStarting Training on Official Split...")

    profiler_data = {
    'cpu_time': [],
    'cuda_time': [],
    'memory': []
    }

    for epoch in range(1, EPOCHS + 1):

      if epoch == 2:
        print("Profiling enabled for epoch 2 ...")

        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
            profile_memory=True,
            with_stack=False
        ) as prof:
            train_loss, train_time, train_memory = train_epoch_with_profiler(
                model, train_loader, optimizer, scheduler, criterion, DEVICE, prof
            )

        print("PROFILER SUMMARY (Epoch 2)")


        print("\nTop Operations by CPU Time:")
        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=15))

        print("\nTop Operations by CUDA Time:")
        try:
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))
        except:
            print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=15))

        print("\nTop Operations by Memory:")
        print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=15))

        trace_file = "profiler_trace_epoch1.json"# profile records
        prof.export_chrome_trace(trace_file)
        print(f"\ Chrome trace saved to: {trace_file}")
        print("  Download and open in chrome://tracing")
      else:
        train_loss, train_time, train_memory = train_epoch(
          model, train_loader, optimizer, scheduler, criterion, DEVICE
        )

      val_loss, val_acc, val_labels, val_preds, val_time, val_memory = evaluate(
          model, valid_loader, criterion, DEVICE
      )

      total_train_time += train_time
      epoch_times.append(train_time)
      epoch_memories.append(train_memory)

      report_dict = classification_report(val_labels, val_preds, output_dict=True)
      macro_f1 = report_dict['macro avg']['f1-score']
      fake_recall = report_dict['0']['recall']

      print(f"Epoch {epoch}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | Macro-F1: {macro_f1:.4f} | Fake Recall: {fake_recall:.4f}|  T_Time: {train_time:.1f}s | T_Mem: {train_memory:.0f}MB | V_Time: {val_time:.2f}s | V_Memory: {val_memory:.1f} MB")

      if macro_f1 > best_val_f1:
          best_val_f1 = macro_f1
          torch.save(model.state_dict(), 'best_baselineamp_model.pth')
          print(" -> Best baselineamp model updated!")


    print(f"Total Training Time: {total_train_time:.1f}s")
    print(f"Avg Epoch Time: {sum(epoch_times)/len(epoch_times):.1f}s")
    print(f"Avg Peak Memory: {sum(epoch_memories)/len(epoch_memories):.0f}MB")

    # automatically run best model on Official Test Set after training
    print("\n========== FINAL TEST RESULT (Official Test Set) ==========")
    model.load_state_dict(torch.load('best_baselineamp_model.pth'))
    test_loss, test_acc, test_labels, test_preds, test_time, test_memory = evaluate(model, test_loader, criterion, DEVICE)
    print(classification_report(test_labels, test_preds, target_names=['Fake (0)', 'True (1)']))

if __name__ == "__main__":
    run_official_split_training()

Using device: cuda | Model: bert-base-uncased
Loading Official Datasets (Train / Valid / Test)...
Stats: Train=10240, Valid=1284, Test=1267
Calculating class weights from Training set...
Class Weights: [1.14081996 0.89012517] (Index 0 is Fake, Index 1 is True)
Loading model with increased dropout...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 109,483,778 || all params: 109,483,778 || trainable%: 100.0000

Starting Training on Official Split...
Epoch 1/10 | Train Loss: 0.6994 | Val Loss: 0.6917 | Acc: 0.5576 | Macro-F1: 0.4643 | Fake Recall: 0.1461|  T_Time: 14.1s | T_Mem: 16057MB | V_Time: 3.81s | V_Memory: 4475.1 MB
 -> Best baselineamp model updated!
Profiling enabled for epoch 2 ...
PROFILER SUMMARY (Epoch 2)

Top Operations by CPU Time:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ---------

In [None]:
#@title rebuild for baseline + amp

# 1. Re-prepare environment and data

# Redefine configuration
MODEL_NAME = 'bert-base-uncased'
BATCH_SIZE = 128
MAX_LEN = 256
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
VALID_PATH = '/kaggle/input/liar-dataset/valid.tsv'
TEST_PATH =  '/kaggle/input/liar-dataset/test.tsv'

print("Re-loading Tokenizer and Dataloaders...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Re-instantiate datasets (make sure TextualizedLIARDataset class has been executed earlier)
valid_dataset = TextualizedLIARDataset(VALID_PATH, tokenizer, max_len=MAX_LEN)
test_dataset  = TextualizedLIARDataset(TEST_PATH, tokenizer, max_len=MAX_LEN)

# Re-instantiate DataLoaders
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

# 2. Define function to search for best threshold
def find_optimal_threshold(model, dataloader, device):
    model.eval()
    all_probs = []
    all_labels = []

    print("Running inference on Validation Set...")
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Use softmax to obtain probabilities
            probs = F.softmax(outputs.logits, dim=1)
            # Extract probability of label 1 (True)
            true_probs = probs[:, 1].cpu().numpy()

            all_probs.extend(true_probs)
            all_labels.extend(labels.cpu().numpy())

    all_probs = np.array(all_probs)
    all_labels = np.array(all_labels)

    # Iterate to find best F1 score
    best_threshold = 0.5
    best_f1 = 0

    thresholds = np.arange(0.1, 0.95, 0.05)

    print(f"\n{'Threshold':<10} | {'Macro F1':<10} | {'Fake Recall':<12} | {'True Recall':<12}")
    print("-" * 60)

    for thresh in thresholds:
        preds = (all_probs > thresh).astype(int)

        report = classification_report(all_labels, preds, output_dict=True)
        macro_f1 = report['macro avg']['f1-score']
        fake_recall = report['0']['recall']
        true_recall = report['1']['recall']

        print(f"{thresh:.2f}       | {macro_f1:.4f}     | {fake_recall:.4f}       | {true_recall:.4f}")

        if macro_f1 > best_f1:
            best_f1 = macro_f1
            best_threshold = thresh

    print(f"\nBest Threshold found: {best_threshold:.2f}")
    return best_threshold


# 3. Run optimization

# Load model
print("\nLoading model weights from 'best_baselineamp_model.pth'...")
# Must reinitialize model architecture before loading weights
config = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).config
config.num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config)
model.load_state_dict(torch.load('best_baselineamp_model.pth'))
model.to(DEVICE)

# 1. Find best threshold on Validation Set
best_thresh = find_optimal_threshold(model, valid_loader, DEVICE)

# 2. Apply optimal threshold to Test Set
print(f"\nApplying Threshold {best_thresh:.2f} to Test Set...")
model.eval()
test_probs = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = F.softmax(outputs.logits, dim=1)
        test_probs.extend(probs[:, 1].cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_probs = np.array(test_probs)
final_preds = (test_probs > best_thresh).astype(int)

print("\n========== OPTIMIZED TEST RESULT ==========")
print(classification_report(test_labels, final_preds, target_names=['Fake (0)', 'True (1)']))


Re-loading Tokenizer and Dataloaders...

Loading model weights from 'best_baselineamp_model.pth'...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running inference on Validation Set...

Threshold  | Macro F1   | Fake Recall  | True Recall 
------------------------------------------------------------
0.10       | 0.3582     | 0.0146       | 1.0000
0.15       | 0.3845     | 0.0406       | 0.9955
0.20       | 0.4392     | 0.1006       | 0.9820
0.25       | 0.4865     | 0.1575       | 0.9731
0.30       | 0.5153     | 0.2013       | 0.9536
0.35       | 0.5534     | 0.2549       | 0.9461
0.40       | 0.5832     | 0.3052       | 0.9311
0.45       | 0.6060     | 0.3523       | 0.9102
0.50       | 0.6333     | 0.4221       | 0.8728
0.55       | 0.6436     | 0.4805       | 0.8204
0.60       | 0.6546     | 0.5373       | 0.7769
0.65       | 0.6605     | 0.6055       | 0.7156
0.70       | 0.6464     | 0.6623       | 0.6317
0.75       | 0.6331     | 0.7597       | 0.5225
0.80       | 0.6043     | 0.8409       | 0.4132
0.85       | 0.5463     | 0.9058       | 0.2859
0.90       | 0.4568     | 0.9643       | 0.1452

Best Threshold found: 0.65



In [None]:
#@title LORA
!pip install peft --break-system-packages



In [None]:
#@title grid search outside function

def grid_search_lora_bert_model(
    model_name,
    train_loader,
    valid_loader,
    param_grid,
    tokenizer,
    device,
    class_weights_tensor,
    epochs_per_config=2,
    verbose=True
):
    """
    Perform Grid Search for BERT + LoRA model
    """

    # Generate all parameter combinations
    keys = param_grid.keys()
    values = param_grid.values()
    param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

    print(f"\n{'='*70}")
    print("GRID SEARCH START")
    print(f"{'='*70}")
    print(f"Total configurations to search: {len(param_combinations)}")
    print(f"Epochs per configuration: {epochs_per_config}")
    print(f"{'='*70}\n")

    results = []
    best_f1 = 0
    best_config = None
    best_model_state = None
    last_r = None

    for idx, params in enumerate(param_combinations, 1):
        # Print current configuration (compact format)
        param_str = " | ".join([f"{k}={v}" for k, v in params.items()])
        print(f"[{idx}/{len(param_combinations)}] {param_str}")

        # Re-create DataLoader if batch_size is part of parameters
        if 'batch_size' in params:
            current_train_loader = DataLoader(
                train_loader.dataset,
                batch_size=params['batch_size'],
                shuffle=True,
                num_workers=train_loader.num_workers,
                pin_memory=True
            )
            current_valid_loader = DataLoader(
                valid_loader.dataset,
                batch_size=params['batch_size'],
                shuffle=False,
                num_workers=valid_loader.num_workers,
                pin_memory=True
            )
        else:
            current_train_loader = train_loader
            current_valid_loader = valid_loader

        # Create base BERT classification model
        base_model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=2
        )

        # Configure LoRA
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=params.get('lora_r', 8),
            lora_alpha=params.get('lora_alpha', 32),
            lora_dropout=params.get('lora_dropout', 0.01),
            target_modules=["query", "value"],
            bias="none",
            inference_mode=False
        )

        # Apply LoRA
        model = get_peft_model(base_model, lora_config)

        # Print trainable parameters when LoRA rank changes
        if verbose:
            current_r = params.get('lora_r', 8)
            if current_r != last_r:
                model.print_trainable_parameters()
                last_r = current_r

        model.to(device)

        # Create optimizer
        optimizer = optim.AdamW(
            model.parameters(),
            lr=params.get('learning_rate', 2e-5),
            weight_decay=params.get('weight_decay', 0.01)
        )

        # Learning rate scheduler
        total_steps = len(current_train_loader) * epochs_per_config
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(0.1 * total_steps),
            num_training_steps=total_steps
        )

        # Loss function
        criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

        # Training loop (no detailed epoch-level printing)
        best_epoch_f1 = 0
        best_epoch_acc = 0
        best_epoch_loss = float('inf')

        for epoch in range(1, epochs_per_config + 1):
            train_loss, train_time, train_memory = train_epoch(
                model, current_train_loader, optimizer, scheduler, criterion, device
            )
            val_loss, val_acc, val_labels, val_preds, val_time, val_memory = evaluate(
                model, current_valid_loader, criterion, device
            )

            report_dict = classification_report(
                val_labels, val_preds, output_dict=True, zero_division=0
            )
            macro_f1 = report_dict['macro avg']['f1-score']

            # Track best epoch result under this configuration
            if macro_f1 > best_epoch_f1:
                best_epoch_f1 = macro_f1
                best_epoch_acc = val_acc
                best_epoch_loss = val_loss

        # Print results for this configuration (one-line summary)
        print(
            f" Val Loss: {best_epoch_loss:.4f} | Val Acc: {best_epoch_acc:.4f} | Val F1: {best_epoch_f1:.4f}",
            end=""
        )

        # Store results
        result = {
            'params': params.copy(),
            'best_val_f1': best_epoch_f1,
            'best_val_acc': best_epoch_acc,
            'best_val_loss': best_epoch_loss,
            'final_train_time': train_time,
            'final_memory': train_memory
        }
        results.append(result)

        # Update global best
        if best_epoch_f1 > best_f1:
            best_f1 = best_epoch_f1
            best_config = params.copy()
            best_model_state = deepcopy(model.state_dict())
            print("NEW BEST!")
        else:
            print()  # newline

        # Memory cleanup
        del model, base_model, optimizer, scheduler
        torch.cuda.empty_cache()

    # Print final summary
    print("\nGRID SEARCH COMPLETE\n")
    print("Top 5 Configurations (sorted by F1 score):")

    results_sorted = sorted(results, key=lambda x: x['best_val_f1'], reverse=True)

    for i, r in enumerate(results_sorted[:5], 1):
        param_str = " | ".join([f"{k}={v}" for k, v in r['params'].items()])
        print(f"{i}. F1: {r['best_val_f1']:.4f} | Acc: {r['best_val_acc']:.4f} | Loss: {r['best_val_loss']:.4f}")
        print(f"   {param_str}")

    print("\nBEST CONFIGURATION:")
    print(f"Best Validation F1: {best_f1:.4f}")
    param_str = " | ".join([f"{k}={v}" for k, v in best_config.items()])
    print(f"Parameters: {param_str}\n")

    return {
        'all_results': results,
        'best_config': best_config,
        'best_f1': best_f1,
        'best_model_state': best_model_state
    }


In [None]:
#@title Lora grid search before final training

def run_lora_training_with_grid_search():
    torch.manual_seed(42)
    np.random_seed(42)
    print(f"Using device: {DEVICE} | Model: {MODEL_NAME}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    print("Loading Official Datasets (Train / Valid / Test)...")
    if not os.path.exists(TRAIN_PATH):
        print(f"Error: Path {TRAIN_PATH} not found.")
        return

    train_dataset = TextualizedLIARDataset(TRAIN_PATH, tokenizer, max_len=MAX_LEN)
    valid_dataset = TextualizedLIARDataset(VALID_PATH, tokenizer, max_len=MAX_LEN)
    test_dataset  = TextualizedLIARDataset(TEST_PATH, tokenizer, max_len=MAX_LEN)

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )

    print(f"Stats: Train={len(train_dataset)}, Valid={len(valid_dataset)}, Test={len(test_dataset)}")

    # Calculate class weights
    print("Calculating class weights from Training set...")
    train_labels = train_dataset.df['label'].values
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)
    print(f"Class Weights: {class_weights}")

    # Define LoRA Parameter Grid
    param_grid = {
        'learning_rate': [1e-5, 2e-5, 3e-5],
        'lora_r': [4, 8, 16],
        'lora_alpha': [16, 32],
        'lora_dropout': [0.0, 0.1],
        'weight_decay': [0.01]
        # 'batch_size': [64, 128]  # enable this if you want to grid-search batch_size
    }

    # Run Grid Search
    print("Starting Grid Search for LoRA Model...")

    search_results = grid_search_lora_bert_model(
        model_name=MODEL_NAME,
        train_loader=train_loader,
        valid_loader=valid_loader,
        param_grid=param_grid,
        tokenizer=tokenizer,
        device=DEVICE,
        class_weights_tensor=class_weights_tensor,
        epochs_per_config=2,  # train 2 epochs per configuration
        verbose=True
    )

    # Retrain Full Model with Best Configuration
    print("Training final model with best LoRA configuration...")

    best_params = search_results['best_config']

    print("\nBest Configuration Selected:")
    print(f"  Learning Rate:  {best_params.get('learning_rate', 2e-5)}")
    print(f"  LoRA Rank (r):  {best_params.get('lora_r', 8)}")
    print(f"  LoRA Alpha:     {best_params.get('lora_alpha', 32)}")
    print(f"  LoRA Dropout:   {best_params.get('lora_dropout', 0.01)}")
    print(f"  Weight Decay:   {best_params.get('weight_decay', 0.01)}")
    print(f"  Best Val F1:    {search_results['best_f1']:.4f}")

    return best_params


In [None]:
#@title get best params

grid_search_output = run_lora_training_with_grid_search()

Using device: cuda | Model: bert-base-uncased
Loading Official Datasets (Train / Valid / Test)...
Stats: Train=10240, Valid=1284, Test=1267
Calculating class weights from Training set...
Class Weights: [1.14081996 0.89012517]
Starting Grid Search for LoRA Model...

GRID SEARCH START
Total configurations to search: 36
Epochs per configuration: 2

[1/36] learning_rate=1e-05 | lora_r=4 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 148,994 || all params: 109,632,772 || trainable%: 0.1359
 Val Loss: 0.6903 | Val Acc: 0.5350 | Val F1: 0.5329NEW BEST!
[2/36] learning_rate=1e-05 | lora_r=4 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6973 | Val Acc: 0.4930 | Val F1: 0.4795
[3/36] learning_rate=1e-05 | lora_r=4 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6946 | Val Acc: 0.4782 | Val F1: 0.4202
[4/36] learning_rate=1e-05 | lora_r=4 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6947 | Val Acc: 0.4961 | Val F1: 0.4959
[5/36] learning_rate=1e-05 | lora_r=8 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700
 Val Loss: 0.7017 | Val Acc: 0.4540 | Val F1: 0.4524
[6/36] learning_rate=1e-05 | lora_r=8 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6926 | Val Acc: 0.5101 | Val F1: 0.5092
[7/36] learning_rate=1e-05 | lora_r=8 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6939 | Val Acc: 0.5000 | Val F1: 0.4743
[8/36] learning_rate=1e-05 | lora_r=8 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6922 | Val Acc: 0.5421 | Val F1: 0.5418NEW BEST!
[9/36] learning_rate=1e-05 | lora_r=16 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 591,362 || all params: 110,075,140 || trainable%: 0.5372
 Val Loss: 0.6901 | Val Acc: 0.5343 | Val F1: 0.5058
[10/36] learning_rate=1e-05 | lora_r=16 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6946 | Val Acc: 0.5467 | Val F1: 0.5419NEW BEST!
[11/36] learning_rate=1e-05 | lora_r=16 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6966 | Val Acc: 0.4922 | Val F1: 0.4780
[12/36] learning_rate=1e-05 | lora_r=16 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6920 | Val Acc: 0.5522 | Val F1: 0.5425NEW BEST!
[13/36] learning_rate=2e-05 | lora_r=4 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 148,994 || all params: 109,632,772 || trainable%: 0.1359
 Val Loss: 0.6919 | Val Acc: 0.5600 | Val F1: 0.5463NEW BEST!
[14/36] learning_rate=2e-05 | lora_r=4 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6898 | Val Acc: 0.5421 | Val F1: 0.5404
[15/36] learning_rate=2e-05 | lora_r=4 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6907 | Val Acc: 0.5584 | Val F1: 0.5524NEW BEST!
[16/36] learning_rate=2e-05 | lora_r=4 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6926 | Val Acc: 0.5483 | Val F1: 0.5264
[17/36] learning_rate=2e-05 | lora_r=8 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700
 Val Loss: 0.6922 | Val Acc: 0.5296 | Val F1: 0.4980
[18/36] learning_rate=2e-05 | lora_r=8 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6948 | Val Acc: 0.4626 | Val F1: 0.4189
[19/36] learning_rate=2e-05 | lora_r=8 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6966 | Val Acc: 0.5374 | Val F1: 0.5277
[20/36] learning_rate=2e-05 | lora_r=8 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6940 | Val Acc: 0.5117 | Val F1: 0.4903
[21/36] learning_rate=2e-05 | lora_r=16 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 591,362 || all params: 110,075,140 || trainable%: 0.5372
 Val Loss: 0.6955 | Val Acc: 0.4938 | Val F1: 0.4936
[22/36] learning_rate=2e-05 | lora_r=16 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6938 | Val Acc: 0.5101 | Val F1: 0.5059
[23/36] learning_rate=2e-05 | lora_r=16 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6943 | Val Acc: 0.4782 | Val F1: 0.4751
[24/36] learning_rate=2e-05 | lora_r=16 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6935 | Val Acc: 0.5140 | Val F1: 0.4792
[25/36] learning_rate=3e-05 | lora_r=4 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 148,994 || all params: 109,632,772 || trainable%: 0.1359
 Val Loss: 0.6915 | Val Acc: 0.5132 | Val F1: 0.5108
[26/36] learning_rate=3e-05 | lora_r=4 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6909 | Val Acc: 0.5055 | Val F1: 0.4858
[27/36] learning_rate=3e-05 | lora_r=4 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6919 | Val Acc: 0.5413 | Val F1: 0.4960
[28/36] learning_rate=3e-05 | lora_r=4 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6958 | Val Acc: 0.5187 | Val F1: 0.5167
[29/36] learning_rate=3e-05 | lora_r=8 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700
 Val Loss: 0.6941 | Val Acc: 0.5055 | Val F1: 0.5032
[30/36] learning_rate=3e-05 | lora_r=8 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6904 | Val Acc: 0.5600 | Val F1: 0.5562NEW BEST!
[31/36] learning_rate=3e-05 | lora_r=8 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6904 | Val Acc: 0.5600 | Val F1: 0.5597NEW BEST!
[32/36] learning_rate=3e-05 | lora_r=8 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6930 | Val Acc: 0.5179 | Val F1: 0.5100
[33/36] learning_rate=3e-05 | lora_r=16 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 591,362 || all params: 110,075,140 || trainable%: 0.5372
 Val Loss: 0.6956 | Val Acc: 0.4540 | Val F1: 0.4512
[34/36] learning_rate=3e-05 | lora_r=16 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6922 | Val Acc: 0.5358 | Val F1: 0.5313
[35/36] learning_rate=3e-05 | lora_r=16 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6922 | Val Acc: 0.5148 | Val F1: 0.5115
[36/36] learning_rate=3e-05 | lora_r=16 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Val Loss: 0.6923 | Val Acc: 0.5506 | Val F1: 0.5391


GRID SEARCH COMPLETE

Top 5 Configurations (sorted by F1 score):
1. F1: 0.5597 | Acc: 0.5600 | Loss: 0.6904
   learning_rate=3e-05 | lora_r=8 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01
2. F1: 0.5562 | Acc: 0.5600 | Loss: 0.6904
   learning_rate=3e-05 | lora_r=8 | lora_alpha=16 | lora_dropout=0.1 | weight_decay=0.01
3. F1: 0.5524 | Acc: 0.5584 | Loss: 0.6907
   learning_rate=2e-05 | lora_r=4 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01
4. F1: 0.5463 | Acc: 0.5600 | Loss: 0.6919
   learning_rate=2e-05 | lora_r=4 | lora_alpha=16 | lora_dropout=0.0 | weight_decay=0.01
5. F1: 0.5425 | Acc: 0.5522 | Loss: 0.6920
   learning_rate=1e-05 | lora_r=16 | lora_alpha=32 | lora_dropout=0.1 | weight_decay=0.01


BEST CONFIGURATION:
Best Validation F1: 0.5597
Parameters: learning_rate=3e-05 | lora_r=8 | lora_alpha=32 | lora_dropout=0.0 | weight_decay=0.01


Training final model with best LoRA configuration...

Best Configurati

In [None]:
#@title model with lora + amp

#Configuration
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 256
BATCH_SIZE = 128    # A100 has large VRAM
LR = 2e-5
EPOCHS = 10
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
NUM_WORKERS = 8

#Modification Point 1: Define official dataset paths
TRAIN_PATH = '/kaggle/input/liar-dataset/train.tsv'
VALID_PATH = '/kaggle/input/liar-dataset/valid.tsv'
TEST_PATH  = '/kaggle/input/liar-dataset/test.tsv'

#1. Dataset Class (unchanged)
class TextualizedLIARDataset(Dataset):
    def __init__(self, tsv_path, tokenizer, max_len=128):
        self.df = pd.read_csv(tsv_path, sep='\t', header=None, names=[
            "id", "label", "statement", "subject", "speaker", "speaker_job",
            "state", "party", "barely_true_counts", "false_counts",
            "half_true_counts", "mostly_true_counts", "pants_on_fire_counts",
            "context"
        ])

        self.df.dropna(subset=['statement'], inplace=True)

        # Label logic: False / Pants-fire / Barely-true = 0 (Fake)
        self.label_map = {
            "pants-fire": 0, "false": 0, "barely-true": 0,
            "half-true": 1, "mostly-true": 1, "true": 1
        }

        self.df['label'] = self.df['label'].map(self.label_map)
        self.df.dropna(subset=['label'], inplace=True)
        self.df['label'] = self.df['label'].astype(int)

        text_cols = ['statement', 'subject', 'speaker', 'party', 'state', 'speaker_job', 'context']
        for col in text_cols:
            self.df[col] = self.df[col].fillna("Unknown")

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.df.reset_index(drop=True, inplace=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        metadata_str = (
            f"Speaker: {row['speaker']} | "
            f"Job: {row['speaker_job']} | "
            f"Party: {row['party']} | "
            f"State: {row['state']} | "
            f"Context: {row['context']} | "
            f"Subject: {row['subject']}"
        )
        final_text = f"{metadata_str} [SEP] Statement: {row['statement']}"

        encoded = self.tokenizer(
            final_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )

        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(row['label'], dtype=torch.long)
        }

#2. Training and Evaluation Functions
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

    reset_peak_memory()
    start_time = time.time()


    for batch in dataloader:
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True)

        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda', dtype=dtype):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    end_time = time.time()
    epoch_time = end_time - start_time

    peak_memory = get_peak_gpu_memory_mb()

    return total_loss / len(dataloader), epoch_time, peak_memory

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    reset_peak_memory()
    start_time = time.time()


    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    eval_time = time.time() - start_time
    peak_memory = get_peak_gpu_memory_mb()

    return total_loss / len(dataloader), accuracy_score(all_labels, all_preds), all_labels, all_preds, eval_time, peak_memory

#Main training
def run_official_split_training():
    torch.manual_seed(42)
    np.random.seed(42)
    print(f"Using device: {DEVICE} | Model: {MODEL_NAME}")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    print("Loading Official Datasets (Train / Valid / Test)...")
    if not os.path.exists(TRAIN_PATH):
        print(f"Error: Path {TRAIN_PATH} not found.")
        return

    best_params = grid_search_output

    train_dataset = TextualizedLIARDataset(TRAIN_PATH, tokenizer, max_len=MAX_LEN)
    valid_dataset = TextualizedLIARDataset(VALID_PATH, tokenizer, max_len=MAX_LEN)
    test_dataset  = TextualizedLIARDataset(TEST_PATH, tokenizer, max_len=MAX_LEN)

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=True
    )

    print(f"Stats: Train={len(train_dataset)}, Valid={len(valid_dataset)}, Test={len(test_dataset)}")

    print("Calculating class weights from Training set...")
    train_labels = train_dataset.df['label'].values
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)
    print(f"Class Weights: {class_weights}")
    # LoRA
    print("Loading base model with LoRA...")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=best_params.get('lora_r', 8),
        lora_alpha=best_params.get('lora_alpha', 32),
        lora_dropout=best_params.get('lora_dropout', 0.01),
        target_modules=["query", "value"],
        bias="none",
        inference_mode=False
    )


    print("Final LoRA Configuration")
    print(f"  LoRA Rank (r):        {lora_config.r}")
    print(f"  LoRA Alpha:           {lora_config.lora_alpha}")
    print(f"  LoRA Dropout:         {lora_config.lora_dropout}")
    print(f"  Target Modules:       {lora_config.target_modules}")
    print(f"  Bias:                 {lora_config.bias}")
    print(f"  Inference Mode:       {lora_config.inference_mode}")
    print(f"  Task Type:            {lora_config.task_type}")

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
    model.to(DEVICE)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=best_params.get('learning_rate', 2e-5),
        weight_decay=best_params.get('weight_decay', 0.01)
    )



    print("Optimizer Configuration")
    print(f"  Learning Rate:        {optimizer.get('learning_rate', 2e-5)}")
    print(f"  Weight Decay:         {optimizer.get('weight_decay', 0.01)}")

    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

    total_train_time = 0
    epoch_times = []
    epoch_memories = []


    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)

    best_val_f1 = 0

    print("\nStarting Training on Official Split...")

    profiler_data = {
    'cpu_time': [],
    'cuda_time': [],
    'memory': []
    }

    for epoch in range(1, EPOCHS + 1):

      if epoch == 2:
        print("Profiling enabled for epoch 2 ...")

        with profile(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
            profile_memory=True,
            with_stack=False
        ) as prof:
            train_loss, train_time, train_memory = train_epoch_with_profiler(
                model, train_loader, optimizer, scheduler, criterion, DEVICE, prof
            )


        print("PROFILER SUMMARY (Epoch 2)")


        print("\nTop Operations by CPU Time:")
        print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=15))

        print("\nTop Operations by CUDA Time:")
        try:
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))
        except:
            print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=15))

        print("\nTop Operations by Memory:")
        print(prof.key_averages().table(sort_by="self_cuda_memory_usage", row_limit=15))

        trace_file = "profiler_trace_lora_epoch1.json"
        prof.export_chrome_trace(trace_file)
        print(f"\n Chrome trace saved to: {trace_file}")
        print("  Download and open in chrome://tracing")
      else:
        train_loss, train_time, train_memory = train_epoch(
          model, train_loader, optimizer, scheduler, criterion, DEVICE
        )

      val_loss, val_acc, val_labels, val_preds, val_time, val_memory = evaluate(
          model, valid_loader, criterion, DEVICE
      )

      total_train_time += train_time
      epoch_times.append(train_time)
      epoch_memories.append(train_memory)

      report_dict = classification_report(val_labels, val_preds, output_dict=True)
      macro_f1 = report_dict['macro avg']['f1-score']
      fake_recall = report_dict['0']['recall']

      print(f"Epoch {epoch}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | Macro-F1: {macro_f1:.4f} | Fake Recall: {fake_recall:.4f}|  T_Time: {train_time:.1f}s | T_Mem: {train_memory:.0f}MB | V_Time: {val_time:.2f}s | V_Memory: {val_memory:.1f} MB")

      if macro_f1 > best_val_f1:
          best_val_f1 = macro_f1
          torch.save(model.state_dict(), 'best_lora_model.pth')
          print(" -> Best model updated!")

    print(f"Total Training Time: {total_train_time:.1f}s")
    print(f"Avg Epoch Time: {sum(epoch_times)/len(epoch_times):.1f}s")
    print(f"Avg Peak Memory: {sum(epoch_memories)/len(epoch_memories):.0f}MB")

    # automatically run best model on Official Test Set after training
    print("\n========== FINAL TEST RESULT (Official Test Set) ==========")
    model.load_state_dict(torch.load('best_lora_model.pth'))
    test_loss, test_acc, test_labels, test_preds, test_time, test_memory = evaluate(model, test_loader, criterion, DEVICE)
    print(classification_report(test_labels, test_preds, target_names=['Fake (0)', 'True (1)']))

if __name__ == "__main__":

    run_official_split_training()

Using device: cuda | Model: bert-base-uncased
Loading Official Datasets (Train / Valid / Test)...
Stats: Train=10240, Valid=1284, Test=1267
Calculating class weights from Training set...
Class Weights: [1.14081996 0.89012517]
Loading base model with LoRA...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Final LoRA Configuration
  LoRA Rank (r):        8
  LoRA Alpha:           32
  LoRA Dropout:         0.0
  Target Modules:       {'query', 'value'}
  Bias:                 none
  Inference Mode:       False
  Task Type:            TaskType.SEQ_CLS
trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700
Optimizer Configuration
  Learning Rate:        3e-05
  Weight Decay:         0.01

Starting Training on Official Split...
Epoch 1/10 | Train Loss: 0.6981 | Val Loss: 0.6901 | Acc: 0.5467 | Macro-F1: 0.5430 | Fake Recall: 0.4756|  T_Time: 13.3s | T_Mem: 13532MB | V_Time: 4.71s | V_Memory: 4909.0 MB
 -> Best model updated!
Profiling enabled for epoch 2 ...
PROFILER SUMMARY (Epoch 2)

Top Operations by CPU Time:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
        

In [None]:
#@title rebuild for lora + amp

# 1. Re-prepare environment and data

# Redefine configuration
MODEL_NAME = 'bert-base-uncased'
BATCH_SIZE = 128
MAX_LEN = 256
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
VALID_PATH = '/kaggle/input/liar-dataset/valid.tsv'
TEST_PATH =  '/kaggle/input/liar-dataset/test.tsv'

print("Re-loading Tokenizer and Dataloaders...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Re-instantiate datasets (ensure TextualizedLIARDataset class has already been run above)
valid_dataset = TextualizedLIARDataset(VALID_PATH, tokenizer, max_len=MAX_LEN)
test_dataset  = TextualizedLIARDataset(TEST_PATH,  tokenizer, max_len=MAX_LEN)

# Re-instantiate DataLoader
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

# 2. Define function for searching best threshold

def find_optimal_threshold(model, dataloader, device):
    model.eval()
    all_probs = []
    all_labels = []

    print("Running inference on Validation Set...")
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Softmax to obtain probabilities
            probs = F.softmax(outputs.logits, dim=1)
            # Extract probability of label 1 (True)
            true_probs = probs[:, 1].cpu().numpy()

            all_probs.extend(true_probs)
            all_labels.extend(labels.cpu().numpy())

    all_probs = np.array(all_probs)
    all_labels = np.array(all_labels)

    # Iterate to find best F1 score
    best_threshold = 0.5
    best_f1 = 0

    thresholds = np.arange(0.1, 0.95, 0.05)

    print(f"\n{'Threshold':<10} | {'Macro F1':<10} | {'Fake Recall':<12} | {'True Recall':<12}")
    print("-" * 60)

    for thresh in thresholds:
        preds = (all_probs > thresh).astype(int)

        report = classification_report(all_labels, preds, output_dict=True)
        macro_f1 = report['macro avg']['f1-score']
        fake_recall = report['0']['recall']
        true_recall = report['1']['recall']

        print(f"{thresh:.2f}       | {macro_f1:.4f}     | {fake_recall:.4f}       | {true_recall:.4f}")

        if macro_f1 > best_f1:
            best_f1 = macro_f1
            best_threshold = thresh

    print(f"\nBest Threshold found: {best_threshold:.2f}")
    return best_threshold

# 3. Run optimization

# Load model
print("\nLoading model weights from 'best_lora_model.pth'...")
# Must reinitialize model structure before loading weights
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

best_params = grid_search_output
lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=best_params.get('lora_r', 8),
        lora_alpha=best_params.get('lora_alpha', 32),
        lora_dropout=best_params.get('lora_dropout', 0.01),
        target_modules=["query", "value"],
        bias="none",
        inference_mode=False
    )

model = get_peft_model(base_model, lora_config)

# Load fine-tuned weights
model.load_state_dict(torch.load('best_lora_model.pth'))

model.to(DEVICE)

best_thresh = find_optimal_threshold(model, valid_loader, DEVICE)

# 2. Apply to Test Set
print(f"\nApplying Threshold {best_thresh:.2f} to Test Set...")
model.eval()
test_probs = []
test_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['label'].to(DEVICE)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = F.softmax(outputs.logits, dim=1)
        test_probs.extend(probs[:, 1].cpu().numpy())
        test_labels.extend(labels.cpu().numpy())

test_probs = np.array(test_probs)
final_preds = (test_probs > best_thresh).astype(int)

print("\n========== OPTIMIZED TEST RESULT ==========")
print(classification_report(test_labels, final_preds, target_names=['Fake (0)', 'True (1)']))


Re-loading Tokenizer and Dataloaders...

Loading model weights from 'best_lora_model.pth'...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running inference on Validation Set...

Threshold  | Macro F1   | Fake Recall  | True Recall 
------------------------------------------------------------
0.10       | 0.3422     | 0.0000       | 1.0000
0.15       | 0.3422     | 0.0000       | 1.0000
0.20       | 0.3422     | 0.0000       | 1.0000
0.25       | 0.3422     | 0.0000       | 1.0000
0.30       | 0.3458     | 0.0032       | 1.0000
0.35       | 0.4148     | 0.0731       | 0.9880
0.40       | 0.5186     | 0.2289       | 0.9072
0.45       | 0.5752     | 0.3847       | 0.7934
0.50       | 0.5966     | 0.5244       | 0.6707
0.55       | 0.5832     | 0.6266       | 0.5434
0.60       | 0.5725     | 0.7451       | 0.4281
0.65       | 0.5535     | 0.8506       | 0.3278
0.70       | 0.4703     | 0.9221       | 0.1781
0.75       | 0.3718     | 0.9919       | 0.0464
0.80       | 0.3259     | 1.0000       | 0.0015
0.85       | 0.3242     | 1.0000       | 0.0000
0.90       | 0.3242     | 1.0000       | 0.0000

Best Threshold found: 0.50

