## 1. Imports and Setup

In [33]:
import numpy as np
import pandas as pd
from pathlib import Path

from scipy.sparse import load_npz, vstack
import xgboost as xgb
from sklearn.metrics import average_precision_score, roc_auc_score

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# also passed as `random_state` to the XGBoost classifiers further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 2. File Path Validation

In [34]:
# Folder containing the pre-split arrays. The path is relative, so it is
# resolved against the kernel's current working directory.
DATA_DIR = Path("../../../data/processed")

def ensure_exists(path):
    """Verify that ``path`` exists on disk.

    Parameters
    ----------
    path : str or pathlib.Path
        Location to check.

    Returns
    -------
    None
        Returned silently when the path is present.

    Raises
    ------
    FileNotFoundError
        When nothing exists at ``path``; the message names the path and
        suggests where the file is expected.
    """
    candidate = Path(path)
    if candidate.exists():
        return
    raise FileNotFoundError(
        f"Missing required file: {path!s}. "
        "Place it under data/processed or update the path."
    )

# Pre-split inputs, all stored directly under DATA_DIR:
# ECFP fingerprint matrices (.npz) ...
X_train_in_fp = DATA_DIR / "X_train_in_ecfp.npz"
X_val_ood_fp  = DATA_DIR / "X_val_ood_ecfp.npz"
X_test_ood_fp = DATA_DIR / "X_test_ood_ecfp.npz"

# ... and the matching label vectors (.npy).
y_train_in_fp = DATA_DIR / "y_train_in.npy"
y_val_ood_fp  = DATA_DIR / "y_val_ood.npy"
y_test_ood_fp = DATA_DIR / "y_test_ood.npy"

# Checked in train -> val -> test order; the first missing file aborts the cell.
required_files = [
    X_train_in_fp, y_train_in_fp,
    X_val_ood_fp,  y_val_ood_fp,
    X_test_ood_fp, y_test_ood_fp,
]
for required_file in required_files:
    ensure_exists(required_file)

print("✓ All required files found")

✓ All required files found


## 3. Load Pre-Split Data

In [35]:
# Fingerprints are stored as SciPy sparse matrices (.npz); labels as NumPy
# arrays (.npy). Each X/y pair belongs to one pre-computed split.
X_tr   = load_npz(X_train_in_fp)   # train_in fingerprints
y_tr   = np.load(y_train_in_fp)    # train_in labels

X_val  = load_npz(X_val_ood_fp)    # val_ood fingerprints
y_val  = np.load(y_val_ood_fp)     # val_ood labels

X_test = load_npz(X_test_ood_fp)   # test_ood fingerprints
y_test = np.load(y_test_ood_fp)    # test_ood labels

# Fail fast if a feature matrix and its label vector are out of sync (e.g. a
# stale file from an earlier preprocessing run) instead of letting the
# mismatch surface later as an opaque error inside fit() or a metric call.
for split_name, X_split, y_split in (
    ("train_in", X_tr, y_tr),
    ("val_ood", X_val, y_val),
    ("test_ood", X_test, y_test),
):
    assert X_split.shape[0] == len(y_split), (
        f"{split_name}: {X_split.shape[0]} fingerprint rows "
        f"but {len(y_split)} labels"
    )

# All splits must share one feature space for the model to score them.
assert X_tr.shape[1] == X_val.shape[1] == X_test.shape[1], (
    "train_in / val_ood / test_ood fingerprints differ in feature count"
)

print("Train_in shape:", X_tr.shape,  "| Positives:", int(y_tr.sum()))
print("Val_ood  shape:", X_val.shape, "| Positives:", int(y_val.sum()))
print("Test_ood shape:", X_test.shape,"| Positives:", int(y_test.sum()))

Train_in shape: (34819, 2048) | Positives: 119
Val_ood  shape: (7268, 2048) | Positives: 30
Test_ood shape: (7913, 2048) | Positives: 120


## 4. Calculate Class Imbalance Weight

In [36]:
# Count each class by explicit comparison so any label outside {0, 1} is
# not folded into either count.
N_pos = int((y_tr == 1).sum())
N_neg = int((y_tr == 0).sum())

# Without this guard an all-negative label vector would fail with a bare
# ZeroDivisionError that says nothing about the cause.
if N_pos == 0:
    raise ValueError(
        "y_tr contains no positive (label == 1) samples; "
        "cannot compute scale_pos_weight."
    )

# Negative-to-positive ratio, passed to XGBoost as `scale_pos_weight`.
scale_pos_weight = N_neg / N_pos

print("scale_pos_weight (train_in):", scale_pos_weight)

scale_pos_weight (train_in): 291.5966386554622


## 5. STEP 1 — Train XGBoost on train_in

In [None]:
# Hyperparameters for the model fitted on train_in only, gathered in one
# mapping so the full configuration can be read at a glance.
xgb_params = dict(
    objective="binary:logistic",
    tree_method="hist",
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.6,
    # Negative/positive ratio computed from the train_in labels.
    scale_pos_weight=scale_pos_weight,
    reg_lambda=1.0,
    reg_alpha=0.0,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    # NOTE(review): fit() below receives no eval_set, so this metric is
    # presumably never evaluated during training — confirm.
    eval_metric="aucpr",
)

xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_tr, y_tr)

print("XGBoost trained on train_in.")

Parameters: { "min_samples_leaf" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost trained on train_in.


## 6. Evaluate on val_ood (Out-of-Distribution Validation)

In [38]:
# Score val_ood with the train_in-only model. Column 1 is taken as the
# positive-class probability (assumes the usual 0/1 class ordering).
val_class_proba = xgb_clf.predict_proba(X_val)
val_proba_xgb = val_class_proba[:, 1]

# Both metrics are computed from the raw scores, so no decision threshold
# is involved.
roc_val_xgb = roc_auc_score(y_val, val_proba_xgb)
ap_val_xgb  = average_precision_score(y_val, val_proba_xgb)

print(f"XGBoost – Val_OOD AP:  {ap_val_xgb:.6f}")
print(f"XGBoost – Val_OOD AUC: {roc_val_xgb:.6f}")

XGBoost – Val_OOD AP:  0.217187
XGBoost – Val_OOD AUC: 0.890508


## 7. STEP 2 — Retrain on ALL Training Data (train_in + val_ood)

In [39]:
# Append val_ood to train_in: labels are concatenated in the same order in
# which the sparse fingerprint matrices are stacked row-wise.
y_train_all = np.concatenate([y_tr, y_val])
X_train_all = vstack([X_tr, X_val])

n_pos_all = int(y_train_all.sum())
print("Combined TRAIN (train_in + val_ood) shape:",
      X_train_all.shape, "| Positives:", n_pos_all)

# Same configuration as the train_in-only model.
final_params = dict(
    objective="binary:logistic",
    tree_method="hist",
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.6,
    # Deliberately still the ratio computed on train_in, not recomputed
    # for the combined labels.
    scale_pos_weight=scale_pos_weight,
    reg_lambda=1.0,
    reg_alpha=0.0,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    eval_metric="aucpr",
)

xgb_final = xgb.XGBClassifier(**final_params)
xgb_final.fit(X_train_all, y_train_all)

print("XGBoost retrained on TRAIN + VAL (train_in + val_ood).")

Combined TRAIN (train_in + val_ood) shape: (42087, 2048) | Positives: 149
XGBoost retrained on TRAIN + VAL (train_in + val_ood).
XGBoost retrained on TRAIN + VAL (train_in + val_ood).


## 8. STEP 3 — Final Evaluation on test_ood

In [40]:
# Score test_ood with the model retrained on train_in + val_ood. Column 1
# is taken as the positive-class probability (assumes the usual 0/1 class
# ordering).
test_class_proba = xgb_final.predict_proba(X_test)
test_proba_xgb = test_class_proba[:, 1]

# Both metrics are computed from the raw scores, so no decision threshold
# is involved.
roc_test_xgb = roc_auc_score(y_test, test_proba_xgb)
ap_test_xgb  = average_precision_score(y_test, test_proba_xgb)

print(f"XGBoost – Test_OOD AP:  {ap_test_xgb:.6f}")
print(f"XGBoost – Test_OOD AUC: {roc_test_xgb:.6f}")

XGBoost – Test_OOD AP:  0.089750
XGBoost – Test_OOD AUC: 0.734257


## 9. Create Submission File (Optional)

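If you have the test IDs for a Kaggle submission, uncomment the code below to write the submission CSV. The IDs must be in the same row order as `X_test`, because each one is paired with the corresponding entry of `test_proba_xgb`.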

In [41]:
# Template for writing a submission CSV. It is left commented out because it
# needs an ID array that this notebook does not load.
# NOTE(review): `ids_test` presumably has to follow the same row order as
# X_test, since it is paired positionally with `test_proba_xgb` — confirm
# before uncommenting.
#
# ids_test = np.load(DATA_DIR / "ids_test_ood.npy")
#
# submission = pd.DataFrame({
#     "id": ids_test,
#     "binds": test_proba_xgb
# })
#
# submission_dir = Path("../../../data/submission_of_models")
# submission_dir.mkdir(parents=True, exist_ok=True)
#
# submission_path = submission_dir / "submission_xgboost_presplit.csv"
# submission.to_csv(submission_path, index=False)
#
# print(f"✓ Submission saved to: {submission_path}")
# submission.head()

print("Uncomment the code above to generate submission file")

Uncomment the code above to generate submission file
