In [None]:
# Data Loading Modules
import os
import pandas as pd
from datasets import Dataset

def load_maritime_corpus(data_path="./Data/maritime_corpus.csv"):
    """Load the maritime corpus CSV into a DataFrame.

    Args:
        data_path: Path of the CSV file to read. Defaults to
            ``./Data/maritime_corpus.csv``, so existing zero-argument calls
            behave exactly as before.

    Returns:
        pandas.DataFrame holding the file contents.

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` when the file is
            missing.
    """
    df = pd.read_csv(data_path)
    # Report the actual file name so the message stays truthful for custom paths.
    print(f"Loaded {os.path.basename(data_path)}: {len(df)} samples")
    print(f"Columns: {list(df.columns)}")
    return df

def load_news_corpus(data_path="./Data/News_corpus.csv"):
    """Load the news corpus CSV into a DataFrame.

    Args:
        data_path: Path of the CSV file to read. Defaults to
            ``./Data/News_corpus.csv``, so existing zero-argument calls
            behave exactly as before.

    Returns:
        pandas.DataFrame holding the file contents.

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` when the file is
            missing.
    """
    df = pd.read_csv(data_path)
    # Report the actual file name so the message stays truthful for custom paths.
    print(f"Loaded {os.path.basename(data_path)}: {len(df)} samples")
    print(f"Columns: {list(df.columns)}")
    return df

def load_annotation(data_path="./Data/Annotation.csv"):
    """Load the annotation CSV into a DataFrame.

    Args:
        data_path: Path of the CSV file to read. Defaults to
            ``./Data/Annotation.csv``, so existing zero-argument calls
            behave exactly as before.

    Returns:
        pandas.DataFrame holding the file contents.

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` when the file is
            missing.
    """
    df = pd.read_csv(data_path)
    # Report the actual file name so the message stays truthful for custom paths.
    print(f"Loaded {os.path.basename(data_path)}: {len(df)} samples")
    print(f"Columns: {list(df.columns)}")
    return df


In [None]:
!pip -q install "transformers>=4.41.0" "datasets>=2.19.0" "accelerate>=0.30.0" "scikit-learn>=1.3.0" "pandas>=2.0.0" "openpyxl>=3.1.0" --upgrade

import os, random, numpy as np, torch
import pandas as pd

from sklearn.metrics import classification_report, confusion_matrix
from collections import Counter

import torch.nn as nn
from torch.optim import AdamW
from torch.nn.functional import softmax

from datasets import Dataset, Features, Value, ClassLabel
from transformers import (
    AutoTokenizer, AutoModel, Trainer, TrainingArguments
)

## DAPT

In [None]:
# === DAPT Data Loading ===
maritime_df = load_maritime_corpus()
# 90/10 train/test split; the fixed seed keeps the partition stable across reruns.
dataset = Dataset.from_pandas(maritime_df).train_test_split(test_size=0.1, seed=42)

# === Tokenization ===
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling

model_ckpt = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize_function(examples):
    """Tokenize the "sentence" column of a batch, truncating to 128 tokens."""
    sentences = examples["sentence"]
    return tokenizer(sentences, truncation=True, max_length=128)

# The raw text column is dropped once it has been converted to token ids.
tokenized = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["sentence"],
)

# === Data Collator ===
# Masked-language-modelling collator: mlm=True with mlm_probability=0.15.
# No padding is requested at tokenization time, so batching is left to this collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# === Training arguments ===
# Checkpoints, logs and the final model all live under this directory.
dapt_output_dir = "./Model/dapt_model"
training_args = TrainingArguments(
    output_dir=dapt_output_dir,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 sequences per optimizer step, per device
    eval_accumulation_steps=1,
    num_train_epochs=2,
    warmup_ratio=0.06,
    logging_strategy="steps",
    logging_steps=50,
    logging_dir=f"{dapt_output_dir}/logs",
    report_to="none",
    save_total_limit=2,
    dataloader_pin_memory=False,
    dataloader_num_workers=0,
    # NOTE(review): per the TrainingArguments docs this field is not consumed by
    # Trainer itself; a checkpoint path has to be passed to trainer.train(...)
    # for a resume to actually happen -- confirm against the installed version.
    resume_from_checkpoint=None,
)

# === Load model ===
# Starts from the public base checkpoint named in model_ckpt.
model = AutoModelForMaskedLM.from_pretrained(model_ckpt)

# === Compute metrics for MLM accuracy ===
import numpy as np
def compute_metrics(eval_preds):
    """Compute masked-token accuracy for MLM evaluation.

    Args:
        eval_preds: ``(logits, labels)`` pair. ``logits`` carries the
            vocabulary scores on its last axis; ``labels`` holds target token
            ids, with -100 at every position that must not be scored.

    Returns:
        dict with a single ``"accuracy"`` entry (plain Python float). When no
        position is scored the accuracy is reported as 0.0 instead of NaN.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # -100 marks all ignored positions: tokens the MLM collator did not select
    # for prediction as well as padding, not padding alone.
    mask = labels != -100
    scored = int(mask.sum())
    if scored == 0:
        # Avoid a 0/0 division (NaN plus a RuntimeWarning) on a batch with no targets.
        return {"accuracy": 0.0}
    correct = int(((predictions == labels) & mask).sum())
    return {"accuracy": correct / scored}

# === Trainer with progress bar ===
from tqdm.auto import tqdm
import torch
from transformers.trainer_utils import get_last_checkpoint

# Despite the name, evaluation uses the complete held-out split (no sub-sampling).
small_eval_dataset = tokenized["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=small_eval_dataset,
    # NOTE(review): recent transformers releases deprecate `tokenizer=` in favour
    # of `processing_class=` -- confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# === Training with progress tracking ===
print(" Starting DAPT training...")
print(f"Training samples: {len(tokenized['train'])}")
print(f"Evaluation samples: {len(small_eval_dataset)}")

# Check for existing checkpoints.
# get_last_checkpoint picks the highest step NUMERICALLY; a plain sorted() would
# rank "checkpoint-500" above "checkpoint-1000".
import os
latest_checkpoint = (
    get_last_checkpoint(dapt_output_dir) if os.path.isdir(dapt_output_dir) else None
)
if latest_checkpoint is not None:
    print(f" Found existing checkpoint: {latest_checkpoint}")
    print(" Training will resume from latest checkpoint")
else:
    print(" No existing checkpoints found, starting fresh training")

# Initial evaluation. This scores the weights currently in memory (the base
# model), even when a checkpoint is about to be restored by train().
initial_metrics = trainer.evaluate(eval_dataset=small_eval_dataset)
print("\n Initial Evaluation (before training):")
for key, value in initial_metrics.items():
    print(f"  {key}: {value:.4f}")

# Training with progress bar. The checkpoint path must be handed to train()
# explicitly; without it training always restarts from step 0.
trainer.train(resume_from_checkpoint=latest_checkpoint)

# Final evaluation
final_metrics = trainer.evaluate(eval_dataset=small_eval_dataset)
print("\n Final Evaluation (after training):")
for key, value in final_metrics.items():
    print(f"  {key}: {value:.4f}")

# Save model and tokenizer so the next stage can load them from dapt_output_dir.
trainer.save_model(dapt_output_dir)
tokenizer.save_pretrained(dapt_output_dir)
print(f"\n DAPT model saved to: {dapt_output_dir}")

## TAPT

In [None]:
# === TAPT Data Loading ===
news_df = load_news_corpus()
# 90/10 train/test split; the fixed seed keeps the partition stable across reruns.
dataset = Dataset.from_pandas(news_df).train_test_split(test_size=0.1, seed=42)

def tapt_tokenize(example):
    """Tokenize the "text" column of a batch, truncating to 512 tokens."""
    texts = example["text"]
    return tokenizer(texts, truncation=True, max_length=512)

# The raw text column is dropped once it has been converted to token ids.
tokenized = dataset.map(
    tapt_tokenize,
    batched=True,
    remove_columns=["text"],
)

# === TAPT TrainingArguments ===
# Checkpoints, logs and the final model all live under this directory.
tapt_output_dir = "./Model/tapt_model"
tapt_args = TrainingArguments(
    output_dir=tapt_output_dir,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,  # a multiple of eval_steps, so every saved checkpoint has an eval score
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # 16 x 4 = 64 sequences per optimizer step, per device
    eval_accumulation_steps=1,
    num_train_epochs=2,
    warmup_ratio=0.06,
    logging_strategy="steps",
    logging_steps=50,
    logging_dir=f"{tapt_output_dir}/logs",
    report_to="none",
    save_total_limit=2,
    # Best checkpoint = lowest eval_loss (greater_is_better=False).
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    dataloader_pin_memory=False,
    dataloader_num_workers=0,
    # NOTE(review): per the TrainingArguments docs this field is not consumed by
    # Trainer itself; a checkpoint path has to be passed to trainer.train(...)
    # for a resume to actually happen -- confirm against the installed version.
    resume_from_checkpoint=None,
)

# === Load DAPT model as base ===
# Continues pre-training from the weights saved under dapt_output_dir.
model = AutoModelForMaskedLM.from_pretrained(dapt_output_dir)

# === Accuracy compute_metrics function ===
def compute_metrics(eval_preds):
    """Compute masked-token accuracy for MLM evaluation.

    Args:
        eval_preds: ``(logits, labels)`` pair. ``logits`` carries the
            vocabulary scores on its last axis; ``labels`` holds target token
            ids, with -100 at every position that must not be scored.

    Returns:
        dict with a single ``"accuracy"`` entry (plain Python float). When no
        position is scored the accuracy is reported as 0.0 instead of NaN.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # -100 marks all ignored positions: tokens the MLM collator did not select
    # for prediction as well as padding, not padding alone.
    mask = labels != -100
    scored = int(mask.sum())
    if scored == 0:
        # Avoid a 0/0 division (NaN plus a RuntimeWarning) on a batch with no targets.
        return {"accuracy": 0.0}
    correct = int(((predictions == labels) & mask).sum())
    return {"accuracy": correct / scored}

# === TAPT Trainer ===
from transformers.trainer_utils import get_last_checkpoint

# Despite the name, evaluation uses the complete held-out split (no sub-sampling).
small_eval_dataset = tokenized["test"]
trainer = Trainer(
    model=model,
    args=tapt_args,
    train_dataset=tokenized["train"],
    eval_dataset=small_eval_dataset,
    # NOTE(review): recent transformers releases deprecate `tokenizer=` in favour
    # of `processing_class=` -- confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# === Training with progress tracking ===
print("Starting TAPT training...")
print(f"Training samples: {len(tokenized['train'])}")
print(f"Evaluation samples: {len(small_eval_dataset)}")

# Check for existing checkpoints.
# get_last_checkpoint picks the highest step NUMERICALLY; a plain sorted() would
# rank "checkpoint-500" above "checkpoint-1000".
import os
latest_checkpoint = (
    get_last_checkpoint(tapt_output_dir) if os.path.isdir(tapt_output_dir) else None
)
if latest_checkpoint is not None:
    print(f"Found existing checkpoint: {latest_checkpoint}")
    print("Training will resume from latest checkpoint")
else:
    print("No existing checkpoints found, starting fresh training")

# Initial evaluation. This scores the weights currently in memory (the DAPT
# model), even when a checkpoint is about to be restored by train().
initial_metrics = trainer.evaluate(eval_dataset=small_eval_dataset)
print("\nInitial Evaluation (before training):")
for key, value in initial_metrics.items():
    print(f"  {key}: {value:.4f}")

# Training with progress bar. The checkpoint path must be handed to train()
# explicitly; without it training always restarts from step 0.
trainer.train(resume_from_checkpoint=latest_checkpoint)

# Final evaluation
final_metrics = trainer.evaluate(eval_dataset=small_eval_dataset)
print("\nFinal Evaluation (after training):")
for key, value in final_metrics.items():
    print(f"  {key}: {value:.4f}")

# Save model and tokenizer so the classifier stage can load them from tapt_output_dir.
trainer.save_model(tapt_output_dir)
tokenizer.save_pretrained(tapt_output_dir)
print(f"\nTAPT model saved to: {tapt_output_dir}")

## Classifier

In [None]:
# === Classifier Data Loading ===
annotation_df = load_annotation()

# Cap every class at its first 500 rows (drop the .head(500) to use all data).
label_df = annotation_df.groupby('label').head(500).reset_index(drop=True)
label_df['label'] = label_df['label'].astype(int)  # 0=irrelevant, 1=rise, 2=fall

id2label = {0: "irrelevant", 1: "rise", 2: "fall"}

# Derive the two binary targets of the two-stage setup from the 3-way label.
# label_rel: 0=irrelevant, 1=relevant.
label_df['label_rel'] = label_df['label'].map(lambda lbl: int(lbl != 0))
# label_dir: 0=rise, 1=fall. Irrelevant rows also get 0 here; the model masks
# them out of the direction loss.
label_df['label_dir'] = label_df['label'].map(lambda lbl: int(lbl == 2))

# Inverse-frequency class weights for the relevance head.
rel_counts = Counter(label_df['label_rel'].tolist())
total = len(label_df)
class_weights = [total / rel_counts[cls] for cls in (0, 1)]
print("Class counts (rel):", dict(rel_counts), "-> weights:", class_weights)

# Build Hugging Face Dataset with explicit ClassLabel columns (needed for stratification).
features = Features({
    "sentence": Value("string"),
    "label": ClassLabel(names=[id2label[idx] for idx in (0, 1, 2)]),
    "label_rel": ClassLabel(names=["irrelevant", "relevant"]),
    "label_dir": ClassLabel(names=["rise", "fall"])
})
full_ds = Dataset.from_pandas(label_df, features=features)

# Stratified 80/20 split for test, then 80/20 of the remainder for validation.
split_kwargs = dict(test_size=0.2, seed=42, stratify_by_column="label")
ds = full_ds.train_test_split(**split_kwargs)
tmp = ds["train"].train_test_split(**split_kwargs)
dataset = dict(train=tmp["train"], valid=tmp["test"], test=ds["test"])

for split_name, split_ds in dataset.items():
    print(f"{split_name}: {len(split_ds)} samples")

# === Reproducibility ===
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) RNGs and pin cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for seed_fn in seeders:
        seed_fn(seed)
    # Deterministic kernels and no autotuner, so repeated runs pick the same algorithms.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
# Seed every RNG once, before any model or dataloader is created below.
set_seed(42)

# Record library versions and GPU availability alongside the results.
print("Torch:", torch.__version__, "| Transformers:", __import__("transformers").__version__)
print("CUDA available:", torch.cuda.is_available())

In [None]:
# === Model Definition ===
class TwoStageModel(nn.Module):
    """
    A shared encoder with two binary classification heads:
      - relevance head: irrelevant vs relevant
      - direction head: rise vs fall (its loss only counts relevant samples)

    Args:
        base_model_name: name or path handed to ``AutoModel.from_pretrained``.
        hidden_size: input width of both heads; it has to match the width of
            the encoder's ``last_hidden_state`` (768 by default).
        rel_class_weights: optional per-class weights for the relevance loss.
        num_layers_to_freeze: encoder layers whose index is below this value
            are frozen (``requires_grad = False``).
    """
    def __init__(self, base_model_name, hidden_size=768, rel_class_weights=None, num_layers_to_freeze=6):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        self.dropout = nn.Dropout(0.1)
        self.rel_head = nn.Linear(hidden_size, 2)  # relevance classification
        self.dir_head = nn.Linear(hidden_size, 2)  # direction classification

        # Freeze the first N encoder layers. Parameters that do not belong to a
        # numbered layer (embeddings, pooler, ...) are left trainable.
        for name, param in self.encoder.named_parameters():
            layer_num = self._layer_index(name)
            if layer_num is not None and layer_num < num_layers_to_freeze:
                param.requires_grad = False

        # Loss functions
        if rel_class_weights is not None:
            self.rel_loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(rel_class_weights, dtype=torch.float32))
        else:
            self.rel_loss_fn = nn.CrossEntropyLoss()
        self.dir_loss_fn = nn.CrossEntropyLoss()

    @staticmethod
    def _layer_index(param_name):
        """Return the layer number encoded in ``param_name``, or None.

        Only names containing ``encoder.layer.<digits>`` yield a number; any
        other name is reported as None explicitly instead of relying on a
        blanket ``except`` to hide parsing failures.
        """
        marker = "encoder.layer."
        if marker not in param_name:
            return None
        token = param_name.split(marker)[1].split(".")[0]
        return int(token) if token.isdecimal() else None

    def forward(self, input_ids, attention_mask,
                labels=None,  # accepted but not used
                labels_rel=None,
                labels_dir=None):
        """Run the encoder and both heads.

        Returns:
            dict with ``loss`` (None unless ``labels_rel`` is given),
            ``logits_rel`` and ``logits_dir``.
        """
        enc_out = self.encoder(input_ids, attention_mask=attention_mask).last_hidden_state
        cls = self.dropout(enc_out[:, 0])  # representation of the first token
        logits_rel = self.rel_head(cls)
        logits_dir = self.dir_head(cls)

        loss = None
        if labels_rel is not None:
            # relevance loss always computed
            loss_rel = self.rel_loss_fn(logits_rel, labels_rel)
            loss_dir = 0.0
            if labels_dir is not None:
                # only compute direction loss for relevant samples; a batch
                # without any relevant sample contributes no direction loss
                mask = labels_rel == 1
                if mask.any():
                    loss_dir = self.dir_loss_fn(logits_dir[mask], labels_dir[mask])
            loss = loss_rel + loss_dir

        return {"loss": loss, "logits_rel": logits_rel, "logits_dir": logits_dir}

# === Tokenization ===
tapt_dir = "./Model/tapt_model"
tokenizer = AutoTokenizer.from_pretrained(tapt_dir)

def tokenize(batch):
    """Tokenize the "sentence" column, padding/truncating every row to 128 tokens."""
    sentences = batch["sentence"]
    return tokenizer(sentences, truncation=True, padding="max_length", max_length=128)

# Tokenize each split, rename the binary targets to the keyword names the
# model's forward() expects, and expose the needed columns as torch tensors.
tokenized = {}
for split, split_ds in dataset.items():
    encoded = split_ds.map(tokenize, batched=True, remove_columns=["sentence"])
    encoded = (
        encoded
        .rename_column("label_rel", "labels_rel")
        .rename_column("label_dir", "labels_dir")
    )
    encoded.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels_rel", "labels_dir", "label"],
    )
    tokenized[split] = encoded

# === Custom Trainer ===
class TwoStageTrainer(Trainer):
    """Trainer variant that feeds both label tensors to the two-head model."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Pull the two targets out of ``inputs`` and return the model's loss.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just ``loss``. Extra keyword arguments are accepted and ignored.
        """
        rel_targets = inputs.pop("labels_rel")
        dir_targets = inputs.pop("labels_dir")
        outputs = model(labels_rel=rel_targets, labels_dir=dir_targets, **inputs)
        loss = outputs["loss"]
        if return_outputs:
            return (loss, outputs)
        return loss

# === Training Arguments ===
# NOTE(review): a hand-built AdamW with its own per-group learning rates is
# attached to the trainer further down, so learning_rate / weight_decay / optim
# here may not drive the actual optimizer -- confirm.
args = TrainingArguments(
    output_dir="./Model/classifier",
    eval_strategy="epoch",   # evaluate at each epoch
    save_strategy="epoch",   # one checkpoint per epoch, aligned with evaluation
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    learning_rate=1e-5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_strategy="epoch",
    logging_dir="./logs",    # unlike DAPT/TAPT, logs are not placed under output_dir
    report_to="none",
    seed=42,
    optim="adamw_torch",
    fp16=torch.cuda.is_available(),   # use mixed precision if GPU
    dataloader_pin_memory=False,
    dataloader_num_workers=0,
    # Checkpoint resume settings
    # NOTE(review): None does NOT mean auto-resume. Per the TrainingArguments
    # docs this field is not consumed by Trainer itself; a checkpoint path has
    # to be passed to trainer.train(...) for a resume to happen -- confirm.
    resume_from_checkpoint=None,
)

# === Model + Custom Optimizer ===
model = TwoStageModel(
    base_model_name=tapt_dir,
    rel_class_weights=class_weights,
    num_layers_to_freeze=9,
)

# Split the trainable parameters into encoder weights and everything else
# (the two classification heads) so each group can get its own learning rate.
trainable_named = [(name, param) for name, param in model.named_parameters() if param.requires_grad]
encoder_params = [param for name, param in trainable_named if name.startswith("encoder.")]
classifier_params = [param for name, param in trainable_named if not name.startswith("encoder.")]

param_groups = [
    dict(params=encoder_params, lr=1e-5),
    dict(params=classifier_params, lr=3e-5),  # higher LR for the freshly initialised heads
]
optimizer = AdamW(param_groups, weight_decay=0.01)

trainer = TwoStageTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["valid"],
    tokenizer=tokenizer,
)
# Attach the custom optimizer after construction so the trainer uses it as-is.
trainer.optimizer = optimizer

# === Training with progress tracking ===
from transformers.trainer_utils import get_last_checkpoint

print("Starting Classifier training...")
print(f"Training samples: {len(tokenized['train'])}")
print(f"Validation samples: {len(tokenized['valid'])}")
print(f"Test samples: {len(tokenized['test'])}")

# Check for existing checkpoints.
# get_last_checkpoint picks the highest step NUMERICALLY; a plain sorted() would
# rank "checkpoint-500" above "checkpoint-1000".
import os
classifier_output_dir = "./Model/classifier"
latest_checkpoint = (
    get_last_checkpoint(classifier_output_dir) if os.path.isdir(classifier_output_dir) else None
)
if latest_checkpoint is not None:
    print(f"Found existing checkpoint: {latest_checkpoint}")
    print("Training will resume from latest checkpoint")
else:
    print("No existing checkpoints found, starting fresh training")

# Initial evaluation. This scores the weights currently in memory, even when a
# checkpoint is about to be restored by train().
initial_metrics = trainer.evaluate(eval_dataset=tokenized["valid"])
print("\nInitial Evaluation (before training):")
for key, value in initial_metrics.items():
    print(f"  {key}: {value:.4f}")

# Training with progress bar. The checkpoint path must be handed to train()
# explicitly; without it training always restarts from step 0.
train_result = trainer.train(resume_from_checkpoint=latest_checkpoint)
print("Best model path:", trainer.state.best_model_checkpoint)

# Final evaluation
final_metrics = trainer.evaluate(eval_dataset=tokenized["valid"])
print("\nFinal Evaluation (after training):")
for key, value in final_metrics.items():
    print(f"  {key}: {value:.4f}")

# Persist the model currently held by the trainer plus the tokenizer, mirroring
# the DAPT/TAPT stages; previously the message below was printed without any
# explicit save having taken place.
trainer.save_model(classifier_output_dir)
tokenizer.save_pretrained(classifier_output_dir)
print(f"\nClassifier model saved to: {classifier_output_dir}")


In [None]:
# === Evaluation Helpers ===
@torch.no_grad()
def collect_logits(model, ds, trainer):
    """Run ``model`` over ``ds`` and gather both heads' logits plus the targets.

    Batches come from ``trainer.get_eval_dataloader(ds)``; only ``input_ids``
    and ``attention_mask`` are forwarded to the model.

    Returns:
        Tuple of four NumPy arrays: relevance logits, direction logits,
        relevance labels, direction labels.
    """
    loader = trainer.get_eval_dataloader(ds)
    model.eval()
    target_device = next(model.parameters()).device
    model_keys = ("input_ids", "attention_mask")

    gathered = {"rel": [], "dir": [], "y_rel": [], "y_dir": []}
    for batch in loader:
        # Targets are removed from the batch and kept on CPU; they never reach the model.
        gathered["y_rel"].append(batch.pop("labels_rel").cpu())
        gathered["y_dir"].append(batch.pop("labels_dir").cpu())
        model_inputs = {
            key: tensor.to(target_device)
            for key, tensor in batch.items()
            if key in model_keys
        }
        outputs = model(**model_inputs)
        gathered["rel"].append(outputs["logits_rel"].cpu())
        gathered["dir"].append(outputs["logits_dir"].cpu())

    return tuple(
        torch.cat(gathered[name]).numpy() for name in ("rel", "dir", "y_rel", "y_dir")
    )

def predict_two_stage(model, ds, trainer, tau_rel=0.5):
    """Predict 0/1/2 (irrelevant/rise/fall) with a relevance threshold.

    A sample is labelled irrelevant (0) when its "relevant" probability is
    below ``tau_rel``; otherwise the direction head decides between rise (1)
    and fall (2), with ties going to rise.
    """
    rel_logits, dir_logits, _, _ = collect_logits(model, ds, trainer)
    rel_probs = softmax(torch.tensor(rel_logits), dim=1).numpy()
    dir_probs = softmax(torch.tensor(dir_logits), dim=1).numpy()

    def decide(p_rel, p_dir):
        # Stage 1: relevance gate. Stage 2: direction argmax.
        if p_rel[1] < tau_rel:
            return 0  # irrelevant
        return 1 if p_dir[0] >= p_dir[1] else 2  # rise/fall

    return np.array([decide(p_rel, p_dir) for p_rel, p_dir in zip(rel_probs, dir_probs)])

# === Evaluate on Test Set ===
# Uncalibrated predictions with the default 0.5 relevance threshold.
test_preds = predict_two_stage(model, tokenized["test"], trainer, tau_rel=0.5)
# Ground truth is read from the un-tokenized split.
# NOTE(review): this assumes the eval dataloader yields rows in dataset order -- confirm.
y_true = np.array(dataset["test"]["label"])
print("== Test Results (no calibration) ==")
print(classification_report(y_true, test_preds, target_names=[id2label[i] for i in range(3)], digits=3))
print(confusion_matrix(y_true, test_preds))

# === Temperature Scaling ===
def fit_temperature(logits_np, labels_np, max_iter=50):
    """Fit a single softmax temperature with L-BFGS.

    Minimises the cross-entropy of ``logits / T`` against ``labels`` starting
    from ``T = 1``.

    Args:
        logits_np: array-like of logits, one row per sample.
        labels_np: array-like of integer class ids, one per sample.
        max_iter: maximum number of L-BFGS iterations.

    Returns:
        The fitted temperature as a Python float. Falls back to 1.0 (no
        scaling) when there are no samples to fit on, or when the optimiser
        ends on a non-finite or non-positive value, since dividing logits by
        such a temperature would corrupt every downstream probability.
    """
    logits = torch.tensor(logits_np, dtype=torch.float32)
    labels = torch.tensor(labels_np, dtype=torch.long)
    if logits.numel() == 0 or labels.numel() == 0:
        # Cross-entropy over zero samples is NaN and would poison T.
        return 1.0

    T = torch.nn.Parameter(torch.ones(1, dtype=torch.float32))
    optim = torch.optim.LBFGS([T], lr=0.01, max_iter=max_iter)
    ce = nn.CrossEntropyLoss()

    def closure():
        # L-BFGS re-evaluates the objective several times per step.
        optim.zero_grad()
        loss = ce(logits / T, labels)
        loss.backward()
        return loss

    optim.step(closure)
    fitted = float(T.detach().cpu().item())
    # T is optimised without constraints, so guard the result before it is used as a divisor.
    if not bool(torch.isfinite(T.detach()).all()) or fitted <= 0.0:
        return 1.0
    return fitted

# Fit temperature on validation set (never on the test split).
rel_logits_val, dir_logits_val, y_rel_val, y_dir_val = collect_logits(model, tokenized["valid"], trainer)
T_rel = fit_temperature(rel_logits_val, y_rel_val)
# The direction temperature is fit only on rows whose true relevance label is 1,
# matching how the direction loss is masked during training.
mask_val = (y_rel_val == 1)
T_dir = fit_temperature(dir_logits_val[mask_val], y_dir_val[mask_val])
print(f"Temperature scaling: T_rel={T_rel:.3f}, T_dir={T_dir:.3f}")

def predict_two_stage_temp(model, ds, trainer, tau_rel=0.5, T_rel=1.0, T_dir=1.0):
    """Predict 0/1/2 (irrelevant/rise/fall) from temperature-scaled logits.

    Each head's logits are divided by its temperature before the softmax. A
    sample is labelled irrelevant (0) when its "relevant" probability is below
    ``tau_rel``; otherwise the direction head picks rise (1) or fall (2), with
    ties going to rise.
    """
    rel_logits, dir_logits, _, _ = collect_logits(model, ds, trainer)
    scaled_rel = torch.tensor(rel_logits)/T_rel
    scaled_dir = torch.tensor(dir_logits)/T_dir
    rel_probs = softmax(scaled_rel, dim=1).numpy()
    dir_probs = softmax(scaled_dir, dim=1).numpy()

    def decide(p_rel, p_dir):
        # Stage 1: relevance gate. Stage 2: direction argmax.
        if p_rel[1] < tau_rel:
            return 0
        return 1 if p_dir[0] >= p_dir[1] else 2

    return np.array([decide(p_rel, p_dir) for p_rel, p_dir in zip(rel_probs, dir_probs)])

# === Final Test Results with Temperature Scaling ===
# Same threshold as the uncalibrated run; only the temperatures differ, so the
# two reports are directly comparable.
test_preds_temp = predict_two_stage_temp(model, tokenized["test"], trainer, tau_rel=0.5, T_rel=T_rel, T_dir=T_dir)
print("\n== Test Results (with temperature scaling) ==")
print(classification_report(y_true, test_preds_temp, target_names=[id2label[i] for i in range(3)], digits=3))
print(confusion_matrix(y_true, test_preds_temp))


## Manual Checkpoint Resume Guide

In [None]:
"""
To resume from a specific checkpoint, pass its path to `trainer.train(...)`.
Setting `resume_from_checkpoint` on the TrainingArguments object alone does not
make `Trainer.train()` resume.

1. For DAPT training:
   trainer.train(resume_from_checkpoint="./Model/dapt_model/checkpoint-1000")

2. For TAPT training:
   trainer.train(resume_from_checkpoint="./Model/tapt_model/checkpoint-500")

3. For Classifier training:
   trainer.train(resume_from_checkpoint="./Model/classifier/checkpoint-<step>")
   (checkpoint directories are numbered by global training step, not by epoch)

Checkpoint files are automatically saved at:
- DAPT: every 1000 steps
- TAPT: every 100 steps
- Classifier: every epoch

To list available checkpoints:
"""
import os

def list_checkpoints(model_dir):
    """List the ``checkpoint-*`` entries found in a model directory.

    Args:
        model_dir: directory to scan.

    Returns:
        List of entry names (not full paths). Entries whose second
        dash-separated field is a number are ordered by that number; any other
        ``checkpoint-*`` entry is kept and placed after them instead of
        raising ``ValueError``. Returns ``[]`` when the directory is missing
        or holds no checkpoints.
    """
    if not os.path.exists(model_dir):
        print(f"Directory {model_dir} does not exist")
        return []

    def sort_key(name):
        # Numeric steps sort numerically (500 before 1000); the rest go last.
        step = name.split('-')[1]
        return (0, int(step)) if step.isdecimal() else (1, 0)

    checkpoints = [d for d in os.listdir(model_dir) if d.startswith('checkpoint-')]
    if checkpoints:
        checkpoints.sort(key=sort_key)
        print(f"Available checkpoints in {model_dir}:")
        for ckpt in checkpoints:
            print(f"  - {ckpt}")
    else:
        print(f"No checkpoints found in {model_dir}")

    return checkpoints

# Report which checkpoints currently exist for each training stage.
# The returned lists hold directory names only, not full paths.
print("=== Checkpoint Status ===")
dapt_checkpoints = list_checkpoints("./Model/dapt_model")
tapt_checkpoints = list_checkpoints("./Model/tapt_model") 
classifier_checkpoints = list_checkpoints("./Model/classifier")

# NOTE(review): for the tip below to work, the path has to be given to
# trainer.train(resume_from_checkpoint=...) -- confirm.
print("\n Tip: Set resume_from_checkpoint to any of the above paths to resume from that specific checkpoint")
