## 1. Imports and Setup

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

from scipy.sparse import load_npz
import xgboost as xgb
from sklearn.metrics import average_precision_score, roc_auc_score

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as `random_state` to the XGBoost models further down.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

## 2. File Path Validation

In [None]:
# Directory holding every input file loaded by this notebook. The path is
# relative, so it is resolved against the kernel's current working directory.
DATA_DIR = Path("../../../data/processed")

def ensure_exists(path):
    """Raise ``FileNotFoundError`` unless something exists at ``path``.

    Args:
        path: Filesystem location to check (``str`` or ``Path``).

    Returns:
        None when the path exists.

    Raises:
        FileNotFoundError: If nothing exists at ``path``; the message names
            the missing path and where it is expected to live.
    """
    if Path(path).exists():
        return

    message = (
        f"Missing required file: {path!s}. "
        "Place it under data/processed or update the path."
    )
    raise FileNotFoundError(message)

# Locations of the inputs this notebook reads, all directly under DATA_DIR.
X_train_full_fp = DATA_DIR.joinpath("X_train_full.npz")
X_test_fp = DATA_DIR.joinpath("X_test.npz")
y_train_fp = DATA_DIR.joinpath("y_train_full.npy")
ids_train_fp = DATA_DIR.joinpath("ids_train_full.npy")
ids_test_fp = DATA_DIR.joinpath("ids_test.npy")
splits_fp = DATA_DIR.joinpath("train_brd4_50k_clean_blocks.parquet")

# Fail fast (on the first missing path) before any loading starts.
required_paths = (
    X_train_full_fp,
    X_test_fp,
    y_train_fp,
    ids_train_fp,
    ids_test_fp,
    splits_fp,
)
for p in required_paths:
    ensure_exists(p)

print("✓ All required files found")

## 3. Load Features, Labels, IDs, and Split Metadata

In [None]:
# Sparse feature matrices saved in SciPy's .npz format.
X_train_full = load_npz(X_train_full_fp)
X_test       = load_npz(X_test_fp)

# Labels and ids saved as NumPy arrays.
y_train_full   = np.load(y_train_fp)
ids_train_full = np.load(ids_train_fp)
ids_test       = np.load(ids_test_fp)

# Split metadata; later cells read its "id" and "split_group" columns.
splits_df = pd.read_parquet(splits_fp)

# Everything downstream selects rows by position (boolean masks built from
# the ids are applied to the feature matrix and the labels), so the arrays
# must agree on row count. Check it once here instead of failing later with
# an obscure indexing error.
n_train_rows = X_train_full.shape[0]
assert y_train_full.shape[0] == n_train_rows, (
    f"y_train_full has {y_train_full.shape[0]} rows, X_train_full has {n_train_rows}"
)
assert ids_train_full.shape[0] == n_train_rows, (
    f"ids_train_full has {ids_train_full.shape[0]} rows, X_train_full has {n_train_rows}"
)
assert ids_test.shape[0] == X_test.shape[0], (
    f"ids_test has {ids_test.shape[0]} rows, X_test has {X_test.shape[0]}"
)
assert X_train_full.shape[1] == X_test.shape[1], (
    f"feature count differs: train={X_train_full.shape[1]}, test={X_test.shape[1]}"
)

missing_split_cols = {"id", "split_group"} - set(splits_df.columns)
assert not missing_split_cols, (
    f"splits file is missing columns: {sorted(missing_split_cols)}"
)

print("X_train_full:", X_train_full.shape)
print("X_test:", X_test.shape)
print("y_train_full:", y_train_full.shape)
print("splits columns:", splits_df.columns.tolist())

## 4. Apply Official Block-Aware Split

**train_in**: Molecules for training

**val_ood**: Out-of-distribution validation (different building blocks)

In [None]:
# Look up the split label of every training row, keeping the original row
# order so the resulting masks line up with X_train_full / y_train_full.
df = pd.DataFrame({"id": ids_train_full})
df = df.merge(
    splits_df[["id", "split_group"]],
    on="id",
    how="left",
    # A duplicated id in splits_df would silently multiply rows and shift
    # every mask position; make pandas raise MergeError instead.
    validate="many_to_one",
)
assert len(df) == X_train_full.shape[0], (
    f"merge changed the row count: {len(df)} vs {X_train_full.shape[0]}"
)

train_mask = df["split_group"] == "train_in"
val_mask   = df["split_group"] == "val_ood"

# Rows whose id is absent from splits_df (NaN after the left join) or that
# carry any other label are used by neither split; report them so the loss
# is visible rather than silent.
n_unassigned = int((~(train_mask | val_mask)).sum())
if n_unassigned:
    print(f"Note: {n_unassigned} training rows are in neither train_in nor val_ood")

if not train_mask.any() or not val_mask.any():
    raise ValueError(
        "Split produced an empty set "
        f"(train_in={int(train_mask.sum())}, val_ood={int(val_mask.sum())}); "
        "check that the ids in the splits file match ids_train_full."
    )

X_tr = X_train_full[train_mask.values]
y_tr = y_train_full[train_mask.values]

X_val = X_train_full[val_mask.values]
y_val = y_train_full[val_mask.values]

print("train_in:", X_tr.shape, "| positives:", y_tr.sum())
print("val_ood:", X_val.shape, "| positives:", y_val.sum())

## 5. Calculate Class Imbalance Weight

In [None]:
# Class counts on train_in only; labels other than 0/1 are counted in neither.
N_pos = int((y_tr == 1).sum())
N_neg = int((y_tr == 0).sum())

# Without positives the ratio is undefined; raise a descriptive error rather
# than letting the division fail with a bare ZeroDivisionError.
if N_pos == 0:
    raise ValueError(
        "train_in contains no positive labels (y == 1); "
        "scale_pos_weight = N_neg / N_pos cannot be computed."
    )

# Negative-to-positive ratio, passed to XGBoost as `scale_pos_weight`.
scale_pos_weight = N_neg / N_pos

print("scale_pos_weight (train_in):", scale_pos_weight)

## 6. STEP 1 — Train XGBoost on train_in Only

In [None]:
# Hyperparameters of the model that is fit on train_in and scored on val_ood.
xgb_params = {
    "objective": "binary:logistic",
    "tree_method": "hist",
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 7,
    "subsample": 0.8,
    "colsample_bytree": 0.6,
    "scale_pos_weight": scale_pos_weight,
    "reg_lambda": 1.0,
    "reg_alpha": 0.0,
    "n_jobs": -1,
    "random_state": RANDOM_STATE,
    "eval_metric": "aucpr",
}

xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_tr, y_tr)

print("XGBoost trained on train_in.")

## 7. STEP 2 — Validate on val_ood (Unseen Building Blocks)

In [None]:
# predict_proba returns one column per class; column index 1 is used here as
# the score for the positive label.
val_proba_all = xgb_clf.predict_proba(X_val)
val_proba_xgb = val_proba_all[:, 1]

# Both metrics are computed from the same scores against the val_ood labels.
ap_val_xgb, roc_val_xgb = (
    metric(y_val, val_proba_xgb)
    for metric in (average_precision_score, roc_auc_score)
)

print(f"XGBoost – Val_OOD AP:  {ap_val_xgb:.6f}")
print(f"XGBoost – Val_OOD AUC: {roc_val_xgb:.6f}")

## 8. STEP 3 — Retrain XGBoost on ALL Data (train_in + val_ood)

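This final model is refit on every labeled molecule and is the one used to predict `X_test`. The val_ood metrics reported above were measured on the train_in-only model, not on this one.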

In [None]:
# Same hyperparameters as the validated model. Note that scale_pos_weight is
# deliberately left at the value computed from train_in, even though this
# model is fit on the full training set.
final_params = dict(
    objective="binary:logistic",
    tree_method="hist",
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.6,
    scale_pos_weight=scale_pos_weight,
    reg_lambda=1.0,
    reg_alpha=0.0,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    eval_metric="aucpr",
)

xgb_final = xgb.XGBClassifier(**final_params)
xgb_final.fit(X_train_full, y_train_full)

print("XGBoost retrained on FULL training set.")

## 9. Generate Test Predictions and Create Submission File

In [None]:
# Score the test set with the model refit on the full training data; column
# index 1 of predict_proba is used as the positive-label score.
test_proba = xgb_final.predict_proba(X_test)[:, 1]

# Ids and scores are paired by position, so they must have the same length.
assert len(ids_test) == len(test_proba), (
    f"ids_test has {len(ids_test)} entries but {len(test_proba)} predictions were produced"
)

submission = pd.DataFrame({
    "id": ids_test,
    "binds": test_proba.astype(float)
})

# Write next to the processed-data directory. Deriving the location from
# DATA_DIR (instead of repeating the "../../../data" literal) keeps inputs and
# outputs under the same root if DATA_DIR is ever changed.
submission_dir = DATA_DIR.parent / "submission_of_models"
submission_dir.mkdir(parents=True, exist_ok=True)

submission_path = submission_dir / "submission_xgboost.csv"
submission.to_csv(submission_path, index=False)

print(f"✓ Submission saved to: {submission_path}")
print(f"Submission shape: {submission.shape}")
submission.head()