In [1]:
import sys
import os
import json
import numpy as np
from tqdm.auto import tqdm 
import torch

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from src.utils import load_cleaned_data, load_config, metrics, plot_confusion_matrix
from src.weighted_ensemble_predict import weighted_ensemble_predict

from transformers import (
    XLMRobertaTokenizerFast,
    XLMRobertaForSequenceClassification,
    set_seed,
)

2025-04-25 12:25:12.455920: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-25 12:25:12.466261: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745576712.478276    3977 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745576712.481738    3977 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745576712.491600    3977 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# --- Run selection and configuration -----------------------------------------
RUN_ID_TO_LOAD = "run_20250414_134417"
CONFIG_PATH = "../cfg/xlm_roberta.json"

config = load_config(CONFIG_PATH)

# --- Reproducibility ----------------------------------------------------------
# Seed via the transformers helper and ask cuDNN for deterministic kernels.
set_seed(config["seed"])
torch.backends.cudnn.deterministic = True

# --- Device -------------------------------------------------------------------
# Use the GPU when one is visible, otherwise fall back to the CPU.
# (To force CPU inference, set `device = torch.device("cpu")` here instead.)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    torch.cuda.empty_cache()

# --- Tokenizer ----------------------------------------------------------------
model_name = config["model"]["base_model"]
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)
print(f"Tokenizer {model_name} loaded.")

# --- Artifact locations for the selected run ----------------------------------
model_base_path = config["training"]["output_dir"]
run_output_dir = os.path.join(model_base_path, RUN_ID_TO_LOAD)
# NOTE: `checkpoint_dir` is the path of a JSON *file*, not a directory; the name
# is kept because later cells refer to it.
metrics_path, checkpoint_dir = (
    os.path.join(run_output_dir, artifact_name)
    for artifact_name in ("fold_metrics.json", "fold_checkpoint_paths.json")
)
n_splits = config["cross_validation"]["n_splits"]

Using device: cuda
Tokenizer xlm-roberta-base loaded.


In [3]:
# Load the per-fold evaluation metrics and the per-fold checkpoint locations
# stored as JSON under the selected run's output directory.
with open(metrics_path, "r") as f:
    fold_results = json.load(f)

# NOTE: despite its name, `checkpoint_dir` is the path of a JSON file.
with open(checkpoint_dir, "r") as f:
    fold_checkpoints = json.load(f)

# Build the metric key without nesting identical quotes inside the f-string,
# which is only valid syntax on Python >= 3.12.
metric_name = config["training"]["metric_for_best_model"]
metric_key = f"eval_{metric_name}"
fold_f1_scores = [result[metric_key] for result in fold_results]
print(f"Loaded metrics for {len(fold_f1_scores)} folds from {metrics_path}")
# Report the number of checkpoint entries actually read (not the metrics count).
print(f"Loaded checkpoints for {len(fold_checkpoints)} folds from {checkpoint_dir}")
print(f"Fold F1 scores: {fold_f1_scores}")

# Validate the artifacts *before* the expensive model loading, so that a
# mismatch fails fast with a clear message instead of an IndexError mid-loop.
assert len(fold_checkpoints) == n_splits, f"Expected {n_splits} checkpoint paths, but found {len(fold_checkpoints)}"
assert len(fold_f1_scores) == n_splits, f"Expected {n_splits} metric scores, but found {len(fold_f1_scores)}"

fold_models = []
print(f"\nLoading {n_splits} fold models...")
for fold in range(n_splits):
    model_path = fold_checkpoints[fold]
    model = XLMRobertaForSequenceClassification.from_pretrained(model_path)
    model.to(device)
    model.eval()  # Set to evaluation mode
    fold_models.append(model)
    print(f"Loaded model for fold {fold}.")

assert len(fold_models) == n_splits, f"Expected {n_splits} models, but found {len(fold_models)}"

Loaded metrics for 5 folds from ../models/run_20250414_134417/fold_metrics.json
Loaded checkpoints for 5 folds from ../models/run_20250414_134417/fold_checkpoint_paths.json
Fold F1 scores: [0.8222222222222222, 0.7741935483870968, 0.8, 0.7045454545454546, 0.7956989247311828]

Loading 5 fold models...
Loaded model for fold 0.
Loaded model for fold 1.
Loaded model for fold 2.
Loaded model for fold 3.
Loaded model for fold 4.


In [4]:
# Read the by-publisher evaluation set from the location named in the config.
bypublisher_path = config["data"]["bypublisher_test_data_path"]
test_data = load_cleaned_data(bypublisher_path)

column_names = test_data.columns.tolist()
print("Columns in test data:", column_names)

# Model input text and the reference labels used for evaluation.
X_test, y_test = test_data["full_text"], test_data["label"]
print(f"Loaded bypublisher data: {len(X_test)} samples.")

Columns in test data: ['id', 'title', 'body', 'full_text', 'label', 'domain', 'uppercase_ratio', 'exclamation_count', 'avg_sentence_length']
Loaded bypublisher data: 4000 samples.


In [5]:
# Tokenizer settings do not change between samples, so resolve them once
# instead of re-reading the config on every iteration.
tokenizer_cfg = config["tokenizer"]
tokenize_kwargs = {
    "max_length": config["data"]["max_length"],
    "truncation": tokenizer_cfg["truncation"],
    "padding": tokenizer_cfg["padding"],
    "add_special_tokens": tokenizer_cfg["add_special_tokens"],
    "return_tensors": tokenizer_cfg["return_tensors"],
}

y_pred = []
y_pred_proba = []

print("\nStarting ensemble predictions on test data...")
# Pure inference: disable autograd so no computation graph is recorded for the
# forward passes. This is harmless if `weighted_ensemble_predict` already does
# the same internally (not visible from this notebook).
with torch.no_grad():
    for text in tqdm(X_test, desc="Predicting"):
        # One text at a time, so the ensemble output has a single row.
        inputs = tokenizer(text, **tokenize_kwargs).to(device)

        weighted_probs = weighted_ensemble_predict(inputs, fold_models, fold_f1_scores, device)

        pred_label = torch.argmax(weighted_probs, dim=1).item()
        # Score of class index 1 for this sample. This is NOT the score of the
        # predicted label when class 0 wins.
        # NOTE(review): presumably class 1 is the positive class - confirm.
        positive_class_prob = weighted_probs[0][1].item()

        y_pred.append(pred_label)
        y_pred_proba.append(positive_class_prob)

y_pred = np.array(y_pred)
y_pred_proba = np.array(y_pred_proba)
assert len(y_pred) == len(X_test), f"Expected {len(X_test)} predictions, but found {len(y_pred)}"
print("Ensemble predictions finished.")


Starting ensemble predictions on test data...


Predicting:   0%|          | 0/4000 [00:00<?, ?it/s]

Ensemble predictions finished.


In [6]:
print(f"Predictions shape: {y_pred.shape}")
print(f"Predictions probabilities shape: {y_pred_proba.shape}")
print(f"Test labels shape: {y_test.shape}")

# Make sure the target directory exists; np.save does not create parent folders.
predictions_dir = "../results/predictions"
os.makedirs(predictions_dir, exist_ok=True)

# A model identifier may contain "/" (e.g. "org/model"), which would otherwise
# be interpreted as a sub-directory. Identifiers without "/" are unchanged, so
# existing file names stay the same.
model_tag = config["model"]["base_model"].replace("/", "_")
output_prefix = f"{predictions_dir}/by_publisher_{model_tag}"

# Persist predicted labels, class-1 scores and reference labels side by side.
np.save(f"{output_prefix}_predictions.npy", y_pred)
np.save(f"{output_prefix}_predictions_proba.npy", y_pred_proba)
np.save(f"{output_prefix}_test_labels.npy", y_test)

Predictions shape: (4000,)
Predictions probabilities shape: (4000,)
Test labels shape: (4000,)
