## 1. Imports and Setup

In [23]:
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.sparse import load_npz, vstack  # vstack to combine sparse matrices

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score

# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# also passed as `random_state` to the Random Forest estimators further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)  # legacy global NumPy RNG

## 2. File Path Validation

In [24]:
# Directory holding the pre-split feature/label files. The path is relative, so
# it is resolved against the kernel's current working directory.
DATA_DIR = Path("../../../data/processed")

def ensure_exists(path):
    """Raise a clear error unless ``path`` points to an existing regular file.

    Parameters
    ----------
    path : str or os.PathLike
        Location of a required input file.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``path``, or if ``path`` exists but is not a
        regular file (for example, a directory with the same name).
    """
    candidate = Path(path)
    if not candidate.exists():
        raise FileNotFoundError(
            f"Required file not found: {path!s}."
            " Place files under data/processed or correct the path."
        )
    # A directory also satisfies exists(); without this check it would pass
    # validation and only fail later, inside the loader, with a vaguer error.
    if not candidate.is_file():
        raise FileNotFoundError(
            f"Required path is not a regular file: {path!s}."
            " Place files under data/processed or correct the path."
        )

# Pre-split ECFP inputs: one feature matrix (.npz) and one label vector (.npy)
# per split, all located under DATA_DIR.
X_train_in_fp, y_train_in_fp = (
    DATA_DIR / "X_train_in_ecfp.npz",
    DATA_DIR / "y_train_in.npy",
)
X_val_ood_fp, y_val_ood_fp = (
    DATA_DIR / "X_val_ood_ecfp.npz",
    DATA_DIR / "y_val_ood.npy",
)
X_test_ood_fp, y_test_ood_fp = (
    DATA_DIR / "X_test_ood_ecfp.npz",
    DATA_DIR / "y_test_ood.npy",
)

# Check every input up front so a missing file is reported before any data is
# loaded; ensure_exists raises on the first path that fails.
required_files = [
    X_train_in_fp, y_train_in_fp,
    X_val_ood_fp, y_val_ood_fp,
    X_test_ood_fp, y_test_ood_fp,
]
for required_file in required_files:
    ensure_exists(required_file)

print("✓ All required files found")

✓ All required files found


## 3. Load Pre-Split Data

In [25]:
# Load each split: a scipy sparse fingerprint matrix (.npz) and its label
# vector (.npy).
X_tr   = load_npz(X_train_in_fp)     # train_in ECFP fingerprints
y_tr   = np.load(y_train_in_fp)

X_val  = load_npz(X_val_ood_fp)      # val_ood ECFP fingerprints
y_val  = np.load(y_val_ood_fp)

X_test = load_npz(X_test_ood_fp)     # test_ood ECFP fingerprints
y_test = np.load(y_test_ood_fp)

# Fail fast on mis-paired or truncated files: every feature matrix needs exactly
# one label per row, and all splits must share the same feature dimension for a
# model fitted on one split to score another.
for split_name, features, labels in (
    ("train_in", X_tr, y_tr),
    ("val_ood", X_val, y_val),
    ("test_ood", X_test, y_test),
):
    assert features.shape[0] == labels.shape[0], (
        f"{split_name}: {features.shape[0]} feature rows "
        f"but {labels.shape[0]} labels"
    )
assert X_tr.shape[1] == X_val.shape[1] == X_test.shape[1], (
    "splits disagree on the number of features: "
    f"{X_tr.shape[1]}, {X_val.shape[1]}, {X_test.shape[1]}"
)

# The label sum is reported as the positive count (presumes 0/1 labels).
print("Train_in shape:", X_tr.shape,  "| Positives:", int(y_tr.sum()))
print("Val_ood  shape:", X_val.shape, "| Positives:", int(y_val.sum()))
print("Test_ood shape:", X_test.shape,"| Positives:", int(y_test.sum()))

Train_in shape: (34819, 2048) | Positives: 119
Val_ood  shape: (7268, 2048) | Positives: 30
Test_ood shape: (7913, 2048) | Positives: 120


## 4. STEP 1 — Train Random Forest on train_in

In [26]:
# Hyperparameters of the model that is fitted on train_in only and then scored
# on val_ood.
rf_params = dict(
    n_estimators=500,
    max_features="sqrt",
    max_depth=None,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# fit() returns the estimator itself, so construction and training can be chained.
rf_clf = RandomForestClassifier(**rf_params).fit(X_tr, y_tr)
print("Random Forest trained on train_in.")

Random Forest trained on train_in.


## 5. Evaluate on val_ood (Out-of-Distribution Validation)

In [27]:
# Score val_ood with the train_in-only model. Column 1 of predict_proba is used
# as the positive-class probability (assumes labels are 0/1 -- TODO confirm).
val_class_proba = rf_clf.predict_proba(X_val)
val_proba_rf = val_class_proba[:, 1]

# Both metrics are threshold-free and computed from the same scores.
roc_val_rf = roc_auc_score(y_val, val_proba_rf)
ap_val_rf = average_precision_score(y_val, val_proba_rf)

print(f"RF – Val_OOD AP:  {ap_val_rf:.6f}")
print(f"RF – Val_OOD AUC: {roc_val_rf:.6f}")

RF – Val_OOD AP:  0.254348
RF – Val_OOD AUC: 0.931579


## 6. STEP 2 — Retrain on ALL Training Data (train_in + val_ood)

In [None]:
# Pool train_in and val_ood into a single training set for the final model.
X_train_all = vstack([X_tr, X_val])
y_train_all = np.concatenate([y_tr, y_val])
assert X_train_all.shape[0] == y_train_all.shape[0], (
    f"combined features have {X_train_all.shape[0]} rows "
    f"but there are {y_train_all.shape[0]} labels"
)

print("Combined TRAIN (train_in + val_ood) shape:",
      X_train_all.shape, "| Positives:", int(y_train_all.sum()))

# NOTE(review): this configuration is NOT the one scored on val_ood in section 5
# (n_estimators=500, min_samples_leaf=5, class_weight="balanced"), so the
# val_ood metrics do not describe these settings. No validation of this
# configuration is visible in the notebook -- confirm the change is intentional,
# or validate it, before relying on the test_ood score.
rf_final = RandomForestClassifier(
    n_estimators=1000,                  # twice the trees of the validated model
    max_features="sqrt",
    max_depth=None,                     # no depth limit
    min_samples_leaf=1,                 # sklearn default; leaves may hold a single sample
    min_samples_split=2,                # sklearn default
    n_jobs=-1,
    class_weight="balanced_subsample",  # class weights recomputed on each tree's bootstrap sample
    random_state=RANDOM_STATE,
)

rf_final.fit(X_train_all, y_train_all)
# Report the tree count from the fitted model so the message cannot drift from
# the n_estimators value above.
print(f"✓ Random Forest trained with {rf_final.n_estimators} trees on ALL data (train_in + val_ood).")

Combined TRAIN (train_in + val_ood) shape: (42087, 2048) | Positives: 149
✓ Random Forest trained with 1000 trees on ALL data (train_in + val_ood).
✓ Random Forest trained with 1000 trees on ALL data (train_in + val_ood).


## 7. STEP 3 — Final Evaluation on test_ood

In [29]:
# Score test_ood with the model retrained on train_in + val_ood. Column 1 of
# predict_proba is used as the positive-class probability (assumes labels are
# 0/1 -- TODO confirm).
test_class_proba = rf_final.predict_proba(X_test)
test_proba_rf = test_class_proba[:, 1]

# Both metrics are threshold-free and computed from the same scores.
roc_test_rf = roc_auc_score(y_test, test_proba_rf)
ap_test_rf = average_precision_score(y_test, test_proba_rf)

print(f"RF – Test_OOD AP:  {ap_test_rf:.6f}")
print(f"RF – Test_OOD AUC: {roc_test_rf:.6f}")

RF – Test_OOD AP:  0.088128
RF – Test_OOD AUC: 0.627460


## 8. Create Submission File (Optional)

If you have the test IDs needed for a Kaggle submission, uncomment the code in the cell below to create the CSV.

In [30]:
# Template for writing a submission CSV. It is kept commented out because it
# needs an array of test IDs, which the cells above do not load.
# NOTE(review): ids_test must line up row-for-row with X_test, because
# test_proba_rf holds one score per row of X_test -- confirm before enabling.
# ids_test = np.load(DATA_DIR / "ids_test_ood.npy")
# 
# submission = pd.DataFrame({
#     "id": ids_test,
#     "binds": test_proba_rf
# })
# 
# submission_dir = Path("../../../data/submission_of_models")
# submission_dir.mkdir(parents=True, exist_ok=True)
# 
# submission_path = submission_dir / "submission_rf_presplit.csv"
# submission.to_csv(submission_path, index=False)
# 
# print(f"✓ Submission saved to: {submission_path}")
# submission.head()

# With the template disabled, this reminder is the cell's only output.
print("Uncomment the code above to generate submission file")

Uncomment the code above to generate submission file
