In [None]:
# Import all necessary libraries
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # Use physical GPU 1
import math, random
from dataclasses import dataclass
from typing import List, Dict, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

from transformers import AutoTokenizer, AutoModel, get_scheduler
import torch.optim as optim
import matplotlib.pyplot as plt

# Environment report for this kernel: library version, GPU visibility, working directory.
print("All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # CUDA_VISIBLE_DEVICES was set to "1" above, before torch was imported,
    # so the device reported here is the one exposed by that setting.
    print(f"CUDA device: {torch.cuda.get_device_name()}")
# Data and checkpoint paths used later in the notebook are relative to this directory.
print(f"Current working directory: {os.getcwd()}")

In [None]:
# Configuration class and utility functions
@dataclass
class TrainConfig:
    """Hyper-parameters and paths consumed by the training / ensemble functions.

    Every field has a default, so ``TrainConfig()`` is valid; the training cells
    override selected fields by keyword.
    """
    # Checkpoint filename template; "{}" is filled with the fold index via str.format.
    save_path: str = "roberta_mean_pool_fold_{}.pt"  # For 5fold model naming
    # Model name or local path handed to AutoTokenizer / AutoModel.from_pretrained.
    roberta_name: str = "roberta-base"
    # Forwarded to the Collator as max_length (Collator is defined in another cell).
    max_length: int = 256
    batch_size: int = 8            # batch = groups; each group has 5 posts internally
    epochs: int = 20
    lr_head: float = 1e-3          # learning rate for the classification head / pooling params
    lr_backbone: float = 2e-5      # learning rate for the unfrozen backbone layers
    weight_decay: float = 0.01     # applied to every optimizer parameter group
    warmup_ratio: float = 0.06     # fraction of total scheduler steps used for linear warmup
    unfreeze_last_n_layers: int = 3   # 0=freeze all; e.g., 3=unfreeze last 3 layers
    seed: int = 42
    # Default is computed once, when this class body is executed, not per instance.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    num_workers: int = 2           # DataLoader worker processes
    pin_memory: bool = True        # DataLoader pin_memory flag
    # NOTE(review): no function visible in this notebook reads data_csv; the loading
    # cell uses its own `data_file` variable - confirm whether this field is still needed.
    data_csv: str = "data_with_instance_and_fold_labels.csv"  # Updated to new data file
    pooling_type: str = "mean"   # Only "mean" is accepted by RobertaEnhancedPoolClassifier

def set_seed(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, NumPy's global generator, PyTorch's CPU
    generator and the generators of all CUDA devices.

    Args:
        seed: Integer seed applied to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

print("Configuration class defined!")

In [None]:
# Data loading function
def load_dataframe(csv_path: str, group_size: int = 5) -> pd.DataFrame:
    """Load the training CSV and validate its fixed-size group layout.

    The file is expected to hold consecutive blocks of ``group_size`` rows, each
    block forming one group that shares a single ``suicide_risk`` label.

    Args:
        csv_path: Path of the CSV file to read.
        group_size: Number of consecutive rows per group (default 5, the
            layout the rest of the notebook assumes).

    Returns:
        DataFrame with columns ``text`` (renamed from ``post_sequence``),
        ``suicide_risk`` (int), ``post_created_utc`` (datetime) and
        ``fold_label`` (int), in the original row order.

    Raises:
        KeyError: A required column is missing.
        ValueError: ``group_size`` is not positive, the row count is not a
            multiple of ``group_size``, or a group mixes different labels.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be a positive integer, got {group_size}")

    df = pd.read_csv(csv_path)
    required_cols = ["post_sequence", "suicide_risk", "post_created_utc", "fold_label"]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Missing required columns: {missing_cols}")

    # Keep required columns and rename text column
    df = df[required_cols].rename(columns={"post_sequence": "text"})

    # Convert timestamp
    df["post_created_utc"] = pd.to_datetime(df["post_created_utc"])

    # Basic validation
    if len(df) % group_size != 0:
        raise ValueError(
            f"Row count {len(df)} is not a multiple of {group_size}. "
            "Ensure original order is not shuffled."
        )

    # Ensure every group carries a single label: lay the labels out as
    # (n_groups, group_size) and compare each row against its first column.
    # NaN labels compare unequal to themselves and are therefore rejected too.
    label_grid = df["suicide_risk"].to_numpy().reshape(-1, group_size)
    mismatched = (label_grid != label_grid[:, :1]).any(axis=1)
    if mismatched.any():
        g = int(np.argmax(mismatched))  # first offending group
        raise ValueError(f"Group {g} has inconsistent labels: {label_grid[g].tolist()}")

    # Force labels to int
    df["suicide_risk"] = df["suicide_risk"].astype(int)
    df["fold_label"] = df["fold_label"].astype(int)
    return df

print("Data loading function defined!")

In [None]:
        # Calculate relative time intervals (in hours)
        time_intervals = []
        off = 0
        for size in sizes:
            group_timestamps = flat_timestamps[off:off+size]
            # Calculate hour difference for each timestamp relative to the first timestamp
            base_time = group_timestamps[0]
            intervals = [(ts - base_time).total_seconds() / 3600 for ts in group_timestamps]
            time_intervals.extend(intervals)
            off += size
        
        return enc, labels, torch.tensor(sizes, dtype=torch.long), torch.tensor(time_intervals, dtype=torch.float32)

print("Dataset class defined!")

In [None]:
# Mean Pooling implementation

class MeanPooling(nn.Module):
    """Average consecutive row slices of a stacked embedding matrix, one slice per group."""

    def __init__(self, hidden_size=768):
        # hidden_size is accepted but never read; the module has no parameters.
        super().__init__()

    def forward(self, cls_batch: torch.Tensor, group_sizes: torch.Tensor, time_intervals: torch.Tensor = None):
        """Return one mean vector per group.

        Args:
            cls_batch: 2-D tensor whose rows are the per-item embeddings of all
                groups, concatenated in group order.
            group_sizes: 1-D integer tensor; entry ``i`` is the number of rows
                belonging to group ``i``.
            time_intervals: Ignored; present so all pooling layers share one
                call signature.

        Returns:
            Tensor with one row per entry of ``group_sizes``.
        """
        pooled_rows = []
        start = 0
        for size in group_sizes.tolist():
            stop = start + size
            group_mean = cls_batch[start:stop].mean(dim=0, keepdim=True)
            pooled_rows.append(group_mean)
            start = stop
        return torch.cat(pooled_rows, dim=0)

print("Mean Pooling defined!")

In [None]:
# RoBERTa Mean Pooling classifier
class RobertaEnhancedPoolClassifier(nn.Module):
    """Transformer encoder + per-group mean pooling + small MLP head.

    Each item of a group is encoded separately, the first-token hidden state
    of every item is taken, those vectors are averaged per group, and the
    averaged vector is classified by the head.

    Args:
        roberta_name: Model name or path passed to ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
        pooling_type: Must be ``"mean"``.

    Raises:
        ValueError: ``pooling_type`` is anything other than ``"mean"``.
    """

    def __init__(self, roberta_name="roberta-base", num_classes=4, pooling_type="mean"):
        super().__init__()
        # Fail fast: reject an unsupported pooling type *before* the backbone
        # weights are loaded, so a config typo does not cost a full model load.
        if pooling_type != "mean":
            raise ValueError(f"Only mean pooling is supported, but got: {pooling_type}")
        self.pooling_type = pooling_type

        self.backbone = AutoModel.from_pretrained(roberta_name)
        hidden = self.backbone.config.hidden_size  # 768 for roberta-base
        self.pooling = MeanPooling(hidden)

        # hidden -> 128 -> 16 -> num_classes
        self.head = nn.Sequential(
            nn.Linear(hidden, 128),
            nn.BatchNorm1d(128),
            nn.GELU(),
            nn.Linear(128, 16),
            nn.BatchNorm1d(16),
            nn.GELU(),
            nn.Linear(16, num_classes),
        )

    def forward(self, encodings, group_sizes: torch.Tensor, time_intervals: torch.Tensor = None):
        """Compute group-level logits.

        Args:
            encodings: Mapping with ``"input_ids"`` and ``"attention_mask"``
                tensors covering every item of every group in the batch.
            group_sizes: 1-D integer tensor, number of items per group.
            time_intervals: Forwarded to the pooling layer, which ignores it.

        Returns:
            Logits with one row per group and ``num_classes`` columns.
        """
        out = self.backbone(input_ids=encodings["input_ids"], attention_mask=encodings["attention_mask"])
        first_token = out.last_hidden_state[:, 0, :]  # one vector per item

        pooled = self.pooling(first_token, group_sizes, time_intervals)  # one vector per group

        return self.head(pooled)

def set_backbone_trainable(model: RobertaEnhancedPoolClassifier, unfreeze_last_n_layers: int = 0):
    """Freeze the whole backbone, then re-enable gradients for its last encoder layers.

    Args:
        model: Classifier whose ``backbone`` exposes ``encoder.layer``.
        unfreeze_last_n_layers: How many trailing encoder layers stay trainable;
            values <= 0 leave the entire backbone frozen.
    """
    # Start from a fully frozen backbone.
    for param in model.backbone.parameters():
        param.requires_grad = False

    if unfreeze_last_n_layers > 0:
        trailing_layers = model.backbone.encoder.layer[-unfreeze_last_n_layers:]
        # Re-enable gradients only for parameters inside the trailing layers.
        for param in (p for block in trailing_layers for p in block.parameters()):
            param.requires_grad = True

print("Classifier model defined!")

In [None]:
# Training-related utility functions
def compute_class_weights(y: np.ndarray, num_classes: int) -> torch.Tensor:
    """Compute "balanced" class weights for an imbalanced label vector.

    Uses the balanced heuristic ``n_samples / (num_classes * count_c)``.
    Unlike calling sklearn's ``compute_class_weight`` with a fixed class list,
    this does not fail when some class has no samples in ``y`` (which can
    happen for a rare class in a single fold); such classes get weight 1.0.

    Args:
        y: Integer class labels in ``[0, num_classes)``.
        num_classes: Total number of classes; length of the returned tensor.

    Returns:
        float32 tensor of shape ``(num_classes,)``.

    Raises:
        ValueError: A label lies outside ``[0, num_classes)``.
    """
    labels = np.asarray(y).astype(np.int64).ravel()
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise ValueError(
            f"Labels must lie in [0, {num_classes}), got range "
            f"[{labels.min()}, {labels.max()}]"
        )

    counts = np.bincount(labels, minlength=num_classes).astype(np.float64)
    weights = np.ones(num_classes, dtype=np.float64)  # neutral weight for absent classes
    present = counts > 0
    weights[present] = labels.size / (num_classes * counts[present])
    return torch.tensor(weights, dtype=torch.float32)

@torch.no_grad()
def evaluate(model, loader, device, criterion=None) -> Dict[str, float]:
    """Score ``model`` on every batch of ``loader`` with gradients disabled.

    Batches are either ``(enc, labels, sizes, time_intervals)`` or the older
    three-element form without ``time_intervals``. The model is switched to
    eval mode and left in that mode.

    Args:
        model: Callable as ``model(enc, sizes, time_intervals)`` returning logits.
        loader: Iterable of batches.
        device: Device the batch tensors are moved to.
        criterion: Optional loss; when given, a sample-count-weighted mean of
            the per-batch loss values is reported.

    Returns:
        Dict with ``"weighted_f1"`` and, if ``criterion`` was given and at
        least one sample was seen, ``"loss"``.
    """
    model.eval()
    logit_chunks, label_chunks = [], []
    loss_sum, seen = 0.0, 0

    for batch in loader:
        if len(batch) == 4:  # current format, carries time information
            enc, labels, sizes, time_intervals = batch
            time_intervals = time_intervals.to(device)
        else:  # legacy format without time information
            enc, labels, sizes = batch
            time_intervals = None

        enc = {name: tensor.to(device) for name, tensor in enc.items()}
        labels, sizes = labels.to(device), sizes.to(device)

        logits = model(enc, sizes, time_intervals)
        logit_chunks.append(logits.detach().cpu())
        label_chunks.append(labels.detach().cpu())

        if criterion is None:
            continue
        batch_n = labels.size(0)
        loss_sum += criterion(logits, labels).item() * batch_n
        seen += batch_n

    y_pred = torch.cat(logit_chunks).argmax(dim=1).numpy()
    y_true = torch.cat(label_chunks).numpy()

    metrics = {"weighted_f1": float(f1_score(y_true, y_pred, average="weighted"))}
    if criterion is not None and seen > 0:
        metrics["loss"] = float(loss_sum / seen)
    return metrics

print("Evaluation function defined!")

In [None]:
# 5fold cross-validation training function
from tqdm import tqdm
from sklearn.metrics import classification_report

def _unpack_group_batch(batch, device):
    """Move one collated batch to ``device``; returns (enc, labels, sizes, time_intervals)."""
    if len(batch) == 4:  # current format, carries time information
        enc, labels, sizes, time_intervals = batch
        time_intervals = time_intervals.to(device)
    else:  # legacy format without time information
        enc, labels, sizes = batch
        time_intervals = None
    enc = {k: v.to(device) for k, v in enc.items()}
    return enc, labels.to(device), sizes.to(device), time_intervals


def train_single_fold(df: pd.DataFrame, cfg: TrainConfig, fold_idx: int, num_classes: int = 4) -> Dict[str, object]:
    """Train and validate one cross-validation fold.

    Groups are blocks of 5 consecutive rows of ``df``. Groups whose first row
    has ``fold_label == fold_idx`` form the validation set; all others are
    used for training. The checkpoint with the best validation weighted F1 is
    restored, written to ``cfg.save_path.format(fold_idx)`` and used for the
    final classification report.

    Args:
        df: Frame with ``fold_label`` and ``suicide_risk`` columns, laid out in
            5-row groups (the layout ``load_dataframe`` validates).
        cfg: Training configuration.
        fold_idx: Fold used as validation set.
        num_classes: Number of target classes.

    Returns:
        Dict with keys ``fold_idx``, ``best_f1``, ``model_path`` (``None`` if no
        checkpoint was written, e.g. ``cfg.epochs == 0``), ``val_predictions``,
        ``val_labels`` and ``classification_report``.

    Raises:
        ValueError: The fold leaves the training or validation split empty.
    """
    set_seed(cfg.seed)
    device = cfg.device
    print(f"Training fold {fold_idx}, using device: {device}")

    # Split data based on fold_label (one label per group, read from its first row)
    G = len(df) // 5
    group_ids = np.arange(G)
    first_rows = df.iloc[: G * 5 : 5]
    group_fold_labels = first_rows["fold_label"].to_numpy().astype(int)
    group_labels = first_rows["suicide_risk"].to_numpy().astype(int)

    # Current fold as validation set, remaining folds as training set
    is_val = group_fold_labels == fold_idx
    g_tr, g_val = group_ids[~is_val], group_ids[is_val]
    y_tr, y_val = group_labels[~is_val], group_labels[is_val]
    if len(g_tr) == 0 or len(g_val) == 0:
        raise ValueError(
            f"Fold {fold_idx} yields an empty split "
            f"(train groups: {len(g_tr)}, validation groups: {len(g_val)})"
        )

    print(f"Fold {fold_idx}: Training set {len(g_tr)} groups, validation set {len(g_val)} groups")
    print(f"Training set label distribution: {np.bincount(y_tr)}")
    print(f"Validation set label distribution: {np.bincount(y_val)}")

    # Create tokenizer and data loaders
    tok = AutoTokenizer.from_pretrained(cfg.roberta_name)
    collate = Collator(tok, max_length=cfg.max_length)

    ds_tr = GroupedTextDataset(df, g_tr)
    ds_val = GroupedTextDataset(df, g_val)

    # The head contains BatchNorm1d, which raises in train mode when a batch
    # holds a single group. Drop the trailing batch only in that exact case.
    drop_singleton_tail = len(ds_tr) > cfg.batch_size and len(ds_tr) % cfg.batch_size == 1
    dl_tr = DataLoader(ds_tr, batch_size=cfg.batch_size, shuffle=True, drop_last=drop_singleton_tail,
                       num_workers=cfg.num_workers, pin_memory=cfg.pin_memory, collate_fn=collate)
    dl_val = DataLoader(ds_val, batch_size=cfg.batch_size, shuffle=False,
                        num_workers=cfg.num_workers, pin_memory=cfg.pin_memory, collate_fn=collate)

    # Create model
    model = RobertaEnhancedPoolClassifier(cfg.roberta_name, num_classes=num_classes,
                                          pooling_type=cfg.pooling_type).to(device)
    set_backbone_trainable(model, cfg.unfreeze_last_n_layers)

    # Set up optimizer: separate learning rates for backbone and head/pooling
    params = []
    if any(p.requires_grad for p in model.backbone.parameters()):
        params.append({"params": [p for p in model.backbone.parameters() if p.requires_grad],
                       "lr": cfg.lr_backbone, "weight_decay": cfg.weight_decay})
    params.append({"params": list(model.head.parameters()), "lr": cfg.lr_head, "weight_decay": cfg.weight_decay})
    params.append({"params": list(model.pooling.parameters()), "lr": cfg.lr_head, "weight_decay": cfg.weight_decay})

    opt = optim.AdamW(params)

    # Learning rate scheduler; step count follows the loader so it stays
    # correct when the trailing batch is dropped.
    steps_per_epoch = max(1, len(dl_tr))
    total_steps = cfg.epochs * steps_per_epoch
    warmup_steps = int(cfg.warmup_ratio * total_steps)

    sch = get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

    # Class-weighted loss to counter label imbalance
    class_w = compute_class_weights(y_tr, num_classes).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_w)

    # Training loop with early stopping on validation weighted F1
    best_f1, best_state, patience, bad = -1.0, None, 5, 0

    print(f"Starting training for Fold {fold_idx}...")
    for ep in tqdm(range(1, cfg.epochs+1), desc=f"Fold {fold_idx} Training"):
        model.train()

        for batch in dl_tr:
            enc, labels, sizes, time_intervals = _unpack_group_batch(batch, device)

            opt.zero_grad(set_to_none=True)
            logits = model(enc, sizes, time_intervals)
            loss = criterion(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            opt.step()
            sch.step()

        # Evaluate each epoch.
        # NOTE(review): the train-set pass is only used for the log line below
        # but costs a full forward sweep per epoch; it is kept every epoch so
        # the sequence of loader iterations is unchanged.
        tr_eval = evaluate(model, dl_tr, device, criterion)
        val_eval = evaluate(model, dl_val, device, criterion)

        if ep % 5 == 0:  # Print every 5 epochs
            print(f"  Epoch {ep:02d} | Train Loss {tr_eval['loss']:.4f} WF1 {tr_eval['weighted_f1']:.4f} | "
                  f"Val Loss {val_eval['loss']:.4f} WF1 {val_eval['weighted_f1']:.4f}")

        # Early stopping check
        if val_eval["weighted_f1"] > best_f1 + 1e-6:
            best_f1 = val_eval["weighted_f1"]
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                print(f"  Early stopping at epoch {ep} (best Val WF1={best_f1:.4f})")
                break

    # Restore and save the best model; the path stays None if nothing was saved
    model_save_path = None
    if best_state is not None:
        model.load_state_dict(best_state, strict=True)
        model_save_path = cfg.save_path.format(fold_idx)
        torch.save(best_state, model_save_path)
        print(f"  Model saved to: {model_save_path}")

    # Validation set predictions with the restored weights
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in dl_val:
            enc, labels, sizes, time_intervals = _unpack_group_batch(batch, device)

            logits = model(enc, sizes, time_intervals)
            preds = logits.argmax(dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Classification report: pass the label list explicitly so the report
    # works for any num_classes and when a class is absent from this fold.
    class_ids = list(range(num_classes))
    report = classification_report(all_labels, all_preds,
                                   labels=class_ids,
                                   target_names=[f"class_{c}" for c in class_ids],
                                   digits=4,
                                   zero_division=0)
    print(f"\nFold {fold_idx} validation set classification report:")
    print(report)

    return {
        "fold_idx": fold_idx,
        "best_f1": best_f1,
        "model_path": model_save_path,
        "val_predictions": all_preds,
        "val_labels": all_labels,
        "classification_report": report
    }

print("5fold training function defined!")

In [None]:
# Voting prediction function
# NOTE(review): a later cell defines another function named `ensemble_predict`
# with a different signature; once that cell runs, it replaces this one.
def ensemble_predict(test_df: pd.DataFrame, cfg: TrainConfig, voting_strategy: str = "hard",
                     num_classes: int = 4, num_folds: int = 5):
    """Predict with the per-fold checkpoints and combine them by voting.

    Checkpoints are read from ``cfg.save_path.format(i)`` for
    ``i in range(num_folds)``.

    Args:
        test_df: Test data laid out in 5-row groups, in the format expected by
            ``GroupedTextDataset`` (defined in another cell).
        cfg: Configuration (model name, batch size, checkpoint template, ...).
        voting_strategy: ``"hard"`` (majority vote, ties go to the lowest class
            id) or ``"soft"`` (average of class probabilities).
        num_classes: Number of classes.
        num_folds: Number of fold checkpoints to load.

    Returns:
        Dict with ``predictions``, ``probabilities`` (``None`` for hard
        voting), ``individual_predictions`` of shape ``(num_folds, n_groups)``,
        ``individual_probabilities`` of shape
        ``(num_folds, n_groups, num_classes)`` and ``voting_strategy``.

    Raises:
        ValueError: Unknown ``voting_strategy`` (checked before any model is
            loaded).
    """
    if voting_strategy not in ("hard", "soft"):
        raise ValueError(f"Unknown voting strategy: {voting_strategy}. Choose 'hard' or 'soft'.")

    device = cfg.device
    tok = AutoTokenizer.from_pretrained(cfg.roberta_name)
    collate = Collator(tok, max_length=cfg.max_length)

    # Prepare test data
    G_test = len(test_df) // 5
    ds_test = GroupedTextDataset(test_df, np.arange(G_test))
    dl_test = DataLoader(ds_test, batch_size=cfg.batch_size, shuffle=False,
                         num_workers=cfg.num_workers, pin_memory=cfg.pin_memory, collate_fn=collate)

    all_predictions = []    # per-model predicted class ids
    all_probabilities = []  # per-model class probabilities

    for fold_idx in range(num_folds):
        model_path = cfg.save_path.format(fold_idx)
        print(f"Loading model: {model_path}")

        model = RobertaEnhancedPoolClassifier(cfg.roberta_name, num_classes=num_classes,
                                              pooling_type=cfg.pooling_type).to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.eval()

        fold_preds, fold_probs = [], []
        with torch.no_grad():
            for batch in dl_test:
                # batch[1] holds labels, which are not needed for prediction
                enc = {k: v.to(device) for k, v in batch[0].items()}
                sizes = batch[2].to(device)
                time_intervals = batch[3].to(device) if len(batch) == 4 else None

                logits = model(enc, sizes, time_intervals)
                fold_probs.extend(torch.softmax(logits, dim=1).cpu().numpy())
                fold_preds.extend(logits.argmax(dim=1).cpu().numpy())

        all_predictions.append(fold_preds)
        all_probabilities.append(fold_probs)
        del model  # release this fold's weights before loading the next

    all_predictions = np.array(all_predictions, dtype=np.int64).reshape(num_folds, -1)
    all_probabilities = np.array(all_probabilities).reshape(num_folds, -1, num_classes)

    if voting_strategy == "hard":
        # Count votes per class for every sample; argmax picks the lowest class id on ties
        vote_counts = np.stack([(all_predictions == c).sum(axis=0) for c in range(num_classes)], axis=1)
        final_predictions = vote_counts.argmax(axis=1)
        final_probabilities = None
    else:
        final_probabilities = np.mean(all_probabilities, axis=0)  # (n_samples, n_classes)
        final_predictions = np.argmax(final_probabilities, axis=1)

    print(f"Completed prediction using {voting_strategy} voting strategy")
    print(f"Prediction result distribution: {np.bincount(final_predictions, minlength=num_classes)}")

    return {
        "predictions": final_predictions,
        "probabilities": final_probabilities,
        "individual_predictions": all_predictions,
        "individual_probabilities": all_probabilities,
        "voting_strategy": voting_strategy
    }

print("Voting prediction function defined!")

In [None]:
# Data loading and basic configuration
data_file = "data_with_instance_and_fold_labels.csv"

try:
    df = load_dataframe(data_file)

    # Overall shape and label balance
    print(f"Data loaded successfully!")
    print(f"Data shape: {df.shape}")
    print(f"Total groups: {len(df) // 5}")
    print(f"\nLabel distribution:")
    print(df['suicide_risk'].value_counts().sort_index())

    # Per-fold group counts: one fold label per 5-row group, read from its first row
    print(f"\nFold distribution:")
    G = len(df) // 5
    group_fold_labels = [df.iloc[first_row]["fold_label"] for first_row in range(0, G * 5, 5)]
    fold_counts = pd.Series(group_fold_labels).value_counts().sort_index()
    print(fold_counts)

    # Quick look at the data
    print(f"\nFirst few rows of data:")
    print(df[['text', 'suicide_risk', 'fold_label']].head())

except FileNotFoundError:
    # Missing file: report and continue; later cells check whether `df` exists
    print(f"Error: Data file '{data_file}' not found")
    print("Please ensure the data file is in the current directory")
except Exception as e:
    # Any other failure (bad columns, bad group layout, ...) is reported, not raised
    print(f"Error loading data: {e}")

In [None]:
# Fold 0 training
# Shared configuration for all fold-training cells.
# NOTE(review): roberta_name is an absolute path on one machine - confirm it
# exists wherever this notebook is run.
cfg = TrainConfig(
    roberta_name="/prj0129/jzh4027/IEEE/local_models/models--microsoft--BiomedNLP-BiomedBERT-base-uncased-abstract/snapshots/d673b8835373c6fa116d6d8006b33d48734e305d",
    save_path="bio_mean_pool_fold_{}.pt",
    data_csv="data_with_instance_and_fold_labels.csv",
    pooling_type="mean",
    max_length=512,
    batch_size=32,
    epochs=10,
    lr_head=1e-3,
    lr_backbone=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.06,
    unfreeze_last_n_layers=3,
    seed=42,
)

print(f"Configuration completed! Starting training for Fold 0")
print(f"Device: {cfg.device}")

# Train only if the data-loading cell produced `df`
if 'df' not in locals():
    print("Error: Data not loaded. Please run the data loading cell first.")
else:
    fold_0_result = train_single_fold(df, cfg, fold_idx=0, num_classes=4)
    print(f"\nFold 0 training completed! Best validation F1: {fold_0_result['best_f1']:.4f}")

In [None]:
# Fold 1 training
# Train only if the data-loading cell produced `df`
if 'df' not in locals():
    print("Error: Data not loaded. Please run the data loading cell first.")
else:
    fold_1_result = train_single_fold(df, cfg, fold_idx=1, num_classes=4)
    print(f"\nFold 1 training completed! Best validation F1: {fold_1_result['best_f1']:.4f}")

In [None]:
# Fold 2 training
# Train only if the data-loading cell produced `df`
if 'df' not in locals():
    print("Error: Data not loaded. Please run the data loading cell first.")
else:
    fold_2_result = train_single_fold(df, cfg, fold_idx=2, num_classes=4)
    print(f"\nFold 2 training completed! Best validation F1: {fold_2_result['best_f1']:.4f}")

In [None]:
# Fold 3 training
# Train only if the data-loading cell produced `df`
if 'df' not in locals():
    print("Error: Data not loaded. Please run the data loading cell first.")
else:
    fold_3_result = train_single_fold(df, cfg, fold_idx=3, num_classes=4)
    print(f"\nFold 3 training completed! Best validation F1: {fold_3_result['best_f1']:.4f}")

In [None]:
# Fold 4 training
# Train only if the data-loading cell produced `df`
if 'df' not in locals():
    print("Error: Data not loaded. Please run the data loading cell first.")
else:
    fold_4_result = train_single_fold(df, cfg, fold_idx=4, num_classes=4)
    print(f"\nFold 4 training completed! Best validation F1: {fold_4_result['best_f1']:.4f}")

# Ensemble Prediction on Test Set

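Now we'll load all 5 trained models and run ensemble prediction on the test set (`sdoh_evaluate_on_leaderboard.csv`). The prediction cell below is configured for soft voting (averaging class probabilities across models); set `voting_strategy = 'hard'` there to use majority voting instead.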

In [None]:
def load_test_dataframe(file_path, max_length=256, tokenizer_name="roberta-base", group_size=5):
    """Load the unlabeled test CSV, tokenize every post and split it into groups.

    Args:
        file_path: CSV with at least ``user_id``, ``post_created_utc`` and
            ``post_sequence`` columns, laid out in consecutive blocks of
            ``group_size`` rows.
        max_length: Every text is padded / truncated to this many tokens.
        tokenizer_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
            It must match the backbone the checkpoints were trained with; the
            default preserves the previous hard-coded ``"roberta-base"``.
        group_size: Number of consecutive rows per group.

    Returns:
        List of DataFrames, one per group, each sorted by ``posting_time`` and
        carrying ``posting_text``, ``posting_time`` and a ``tokenized`` column
        holding the tokenizer output for each row.

    Raises:
        ValueError: ``group_size`` is not positive, a required column is
            missing, or the row count is not a multiple of ``group_size``.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be a positive integer, got {group_size}")

    df = pd.read_csv(file_path)

    # Check required columns for test set - using actual column names
    required_cols = ['user_id', 'post_created_utc', 'post_sequence']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")

    print(f"Loaded test dataframe: {len(df)} rows, {len(df.columns)} columns")
    print(f"Columns: {list(df.columns)}")

    # Rename columns to the names used by the downstream test helpers
    df = df.rename(columns={
        'post_created_utc': 'posting_time',
        'post_sequence': 'posting_text'
    })

    # Basic validation - row count must be a whole number of groups
    if len(df) % group_size != 0:
        raise ValueError(
            f"Row count {len(df)} is not a multiple of {group_size}. "
            f"Each group should have exactly {group_size} rows."
        )

    # Tokenize text (validation above runs first so bad files fail before the
    # tokenizer is loaded)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def tokenize_text(text):
        # Missing text is encoded as an empty string
        if pd.isna(text):
            text = ""
        return tokenizer(str(text),
                         max_length=max_length,
                         padding='max_length',
                         truncation=True,
                         return_tensors='pt')

    print("Tokenizing test texts...")
    df['tokenized'] = df['posting_text'].apply(tokenize_text)

    # Convert posting_time to datetime
    df['posting_time'] = pd.to_datetime(df['posting_time'])

    # Split into consecutive groups and order each group chronologically.
    # A stable sort keeps the file order for posts with identical timestamps.
    grouped_data = []
    for start_idx in range(0, len(df), group_size):
        group = df.iloc[start_idx:start_idx + group_size]
        grouped_data.append(group.sort_values('posting_time', kind='mergesort'))

    print(f"Total instances (groups): {len(grouped_data)}")
    return grouped_data

In [None]:
def create_test_dataset(grouped_data):
    """Turn grouped, tokenized test frames into per-group tensor dicts.

    Args:
        grouped_data: Iterable of DataFrames (one per group) with a
            ``tokenized`` column, whose entries expose ``'input_ids'`` and
            ``'attention_mask'`` tensors with a leading batch dimension of 1,
            and a ``posting_time`` datetime column.

    Returns:
        List of dicts, one per group, with keys ``input_ids`` and
        ``attention_mask`` (rows stacked in group order), ``time_intervals``
        (float32 hours elapsed since the group's first row) and ``group_id``
        (position of the group in ``grouped_data``).
    """
    test_data = []

    for group_id, group in enumerate(grouped_data):
        # Stack the per-row encodings into (rows, seq_len) tensors
        tokenized_rows = group['tokenized'].tolist()
        input_ids = torch.stack([tok['input_ids'].squeeze(0) for tok in tokenized_rows])
        attention_mask = torch.stack([tok['attention_mask'].squeeze(0) for tok in tokenized_rows])

        # Hours elapsed since the first post of the group. This matches the
        # training collator, which also measures every timestamp against the
        # group's first timestamp (not against the previous post).
        times = group['posting_time'].tolist()
        base_time = times[0]
        time_intervals = [(ts - base_time).total_seconds() / 3600 for ts in times]

        test_data.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'time_intervals': torch.tensor(time_intervals, dtype=torch.float32),
            'group_id': group_id  # Just use the group index as identifier
        })

    return test_data

In [None]:
def ensemble_predict(test_data, model_paths, device='cuda', voting_strategy='hard',
                     roberta_name="roberta-base", num_classes=4, pooling_type="mean"):
    """
    Perform ensemble prediction using hard or soft voting.

    Models are loaded and run one at a time, so only a single model is held in
    memory at any moment.

    Args:
        test_data: Sequence of per-group dicts with 'input_ids',
            'attention_mask' and 'time_intervals' tensors (as produced by
            create_test_dataset).
        model_paths: Paths of the saved state dicts, one per ensemble member.
        device: Device to use for inference.
        voting_strategy: 'hard' for majority voting (ties go to the lowest
            class id), 'soft' for probability averaging. Case-insensitive.
        roberta_name: Backbone name/path used to rebuild the architecture; it
            must match the backbone the checkpoints were trained with.
        num_classes: Number of output classes of the checkpoints.
        pooling_type: Pooling type of the checkpoints.

    Returns:
        tuple: (ensemble_predictions, individual_predictions, individual_probabilities)
        with shapes (n_instances,), (n_models, n_instances) and
        (n_models, n_instances, num_classes).

    Raises:
        ValueError: Unknown voting strategy or empty ``model_paths`` (both
            checked before any model is loaded).
    """
    strategy = voting_strategy.lower()
    if strategy not in ('hard', 'soft'):
        raise ValueError(f"Unknown voting strategy: {voting_strategy}. Choose 'hard' or 'soft'.")
    model_paths = list(model_paths)
    if not model_paths:
        raise ValueError("model_paths is empty; at least one trained model is required.")
    n_models = len(model_paths)

    all_predictions = []    # one list of class ids per model
    all_probabilities = []  # one list of probability vectors per model

    print("Loading trained models...")
    print("Making predictions...")
    for model_idx, model_path in enumerate(model_paths):
        print(f"Loading model from: {model_path}")

        # Rebuild the architecture, then load the saved weights
        model = RobertaEnhancedPoolClassifier(
            roberta_name=roberta_name,
            num_classes=num_classes,
            pooling_type=pooling_type
        )
        state_dict = torch.load(model_path, map_location=device)
        model.load_state_dict(state_dict)
        model.to(device)
        model.eval()
        print(f"Model {model_idx} loaded successfully")

        print(f"Predicting with model {model_idx}...")
        model_predictions = []
        model_probabilities = []
        with torch.no_grad():
            for batch in test_data:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                time_intervals = batch['time_intervals'].to(device)

                # Each test item is exactly one group, so the batch holds one group
                encodings = {
                    'input_ids': input_ids,
                    'attention_mask': attention_mask
                }
                group_sizes = torch.tensor([len(input_ids)], dtype=torch.long).to(device)

                outputs = model(encodings, group_sizes, time_intervals)
                model_probabilities.append(torch.softmax(outputs, dim=1).cpu().numpy()[0])
                model_predictions.append(torch.argmax(outputs, dim=1).cpu().item())

        all_predictions.append(model_predictions)
        all_probabilities.append(model_probabilities)
        print(f"Model {model_idx} predictions: {len(model_predictions)} instances")
        del model  # release this model before loading the next one

    print(f"\nTotal models loaded: {n_models}")

    # Explicit dtype/shape so the arrays stay well-formed even with no test instances
    all_predictions = np.array(all_predictions, dtype=np.int64).reshape(n_models, -1)
    all_probabilities = np.array(all_probabilities).reshape(n_models, -1, num_classes)

    if strategy == 'hard':
        print(f"\nPerforming hard voting ensemble...")
        # Votes per class for every instance; argmax returns the first
        # (lowest) class id when several classes tie.
        vote_counts = np.stack([(all_predictions == c).sum(axis=0) for c in range(num_classes)], axis=1)
        ensemble_predictions = vote_counts.argmax(axis=1)
    else:
        print(f"\nPerforming soft voting ensemble...")
        # Average probabilities across models, then take the most likely class
        ensemble_probabilities = np.mean(all_probabilities, axis=0)
        ensemble_predictions = np.argmax(ensemble_probabilities, axis=1)

    print(f"Ensemble predictions completed using {voting_strategy} voting: {len(ensemble_predictions)} instances")

    # Print prediction distribution (guard the percentage against zero instances)
    pred_counts = np.bincount(ensemble_predictions, minlength=num_classes)
    denominator = max(len(ensemble_predictions), 1)
    print(f"Prediction distribution:")
    for class_id in range(num_classes):
        percentage = pred_counts[class_id] / denominator * 100
        print(f"  Class {class_id}: {pred_counts[class_id]} instances ({percentage:.1f}%)")

    return ensemble_predictions, all_predictions, all_probabilities

In [None]:
# Load test data
print("Loading test dataset...")
test_file_path = "sdoh_evaluate_on_leaderboard.csv"
test_grouped_data = load_test_dataframe(test_file_path)

# Create test dataset
print("\nCreating test dataset...")
test_data = create_test_dataset(test_grouped_data)
print(f"Test dataset created: {len(test_data)} instances")

# Define model paths
# NOTE(review): the training cells above save checkpoints via cfg.save_path
# ("bio_mean_pool_fold_{}.pt") from a BiomedBERT backbone, while this cell loads
# "roberta_mean_pool_fold_*.pt" and ensemble_predict defaults to a roberta-base
# backbone - confirm which set of checkpoints is intended here.
model_paths = [
    "roberta_mean_pool_fold_0.pt",
    "roberta_mean_pool_fold_1.pt",
    "roberta_mean_pool_fold_2.pt",
    "roberta_mean_pool_fold_3.pt",
    "roberta_mean_pool_fold_4.pt"
]

# Check if all model files exist; report every missing file before failing
print("\nChecking model files...")
missing_paths = []
for path in model_paths:
    if os.path.exists(path):
        print(f"✓ {path} exists")
    else:
        print(f"✗ {path} not found!")
        missing_paths.append(path)
if missing_paths:
    raise FileNotFoundError(f"Model file not found: {', '.join(missing_paths)}")

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

# Perform ensemble prediction
voting_strategy = 'soft'  # 'soft' averages class probabilities; use 'hard' for majority voting
print(f"\nStarting ensemble prediction with {voting_strategy} voting...")
ensemble_predictions, individual_predictions, individual_probabilities = ensemble_predict(
    test_data, model_paths, device, voting_strategy=voting_strategy
)

# Print results
print(f"\n" + "="*50)
print("ENSEMBLE PREDICTION RESULTS")
print("="*50)
print(f"Total test instances: {len(ensemble_predictions)}")
print(f"Voting strategy: {voting_strategy.upper()}")
print(f"Ensemble predictions: {ensemble_predictions}")

# Show some individual model predictions for comparison.
# The table width follows the number of models instead of assuming five.
n_models = len(model_paths)
table_header = " | ".join(["Instance"] + [f"Model{j}" for j in range(n_models)] + ["Ensemble"])
print(f"\nIndividual model predictions (first 10 instances):")
print(table_header)
print("-" * max(65, len(table_header)))
for i in range(min(10, len(ensemble_predictions))):
    cells = [f"{i:8d}"]
    cells += [f"{individual_predictions[j][i]:6d}" for j in range(n_models)]
    cells.append(f"{ensemble_predictions[i]:8d}")
    print(" | ".join(cells))

# If soft voting, show some probability information
if voting_strategy.lower() == 'soft' and individual_probabilities is not None:
    print(f"\nSoft voting - Average probabilities (first 5 instances):")
    print("Instance |   Class 0   |   Class 1   |   Class 2   |   Class 3   | Prediction")
    print("-" * 75)
    avg_probs = np.mean(individual_probabilities, axis=0)  # Average across models
    for i in range(min(5, len(ensemble_predictions))):
        probs = avg_probs[i]
        print(f"{i:8d} | {probs[0]:10.4f} | {probs[1]:10.4f} | {probs[2]:10.4f} | {probs[3]:10.4f} | {ensemble_predictions[i]:10d}")

print(f"\nFinal ensemble predictions array:")
print(ensemble_predictions)

# Save predictions to npy file
np.save('ensemble_predictions.npy', ensemble_predictions)
print(f"\nPredictions saved to: ensemble_predictions.npy")

In [None]:
# Write the ensemble predictions to '5fold_bio.npy'.
np.save(file='5fold_bio.npy', arr=ensemble_predictions)