# Random Forest Classification via cross-validation

In [18]:
# import modules
import os
import pandas as pd
import numpy as np
import random
import pickle

# import scikit-learn modules
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold, RepeatedStratifiedKFold, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay, roc_auc_score, average_precision_score, precision_recall_curve, auc, roc_curve

from collections import Counter

import time

# import visualization modules
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Repository root, resolved as two directory levels above the kernel's working directory.
cwd = os.getcwd()
BASE_DIR = os.path.abspath(os.path.join(cwd, "..", ".."))


def get_data_path(folders, fname):
    """
    Build a normalised path inside the repo.

    Parameters:
    - folders: sequence of directory names below BASE_DIR
    - fname: file name inside the last folder

    Returns:
    - normalised path string BASE_DIR/<folders...>/<fname>
    """
    return os.path.normpath(os.path.join(BASE_DIR, *folders, fname))


file_path_training_data = get_data_path(['output', 'models'], 'training_data.csv')

# Model I. The variable is named like its Model II/III/IV counterparts; the
# shorter spelling is kept as an alias so either name resolves to the same path.
file_RF_model_I_contextualised_reduced = get_data_path(['output', 'models', 'model_I'], 'RF_model_contextualised_reduced.pickle')
file_RF_model_I_context_reduced = file_RF_model_I_contextualised_reduced
file_RF_model_I_full_reduced = get_data_path(['output', 'models', 'model_I'], 'RF_model_full_reduced.pickle')

# Model II. NOTE: the pickle file name says "context", not "contextualised";
# it is left unchanged so files already written under that name are still found.
file_RF_model_II_contextualised_reduced = get_data_path(['output', 'models', 'model_II'], 'RF_model_context_reduced.pickle')
file_RF_model_II_full_reduced = get_data_path(['output', 'models', 'model_II'], 'RF_model_full_reduced.pickle')

# Model III
file_RF_model_III_contextualised_reduced = get_data_path(['output', 'models', 'model_III'], 'RF_model_contextualised_reduced.pickle')
file_RF_model_III_full_reduced = get_data_path(['output', 'models', 'model_III'], 'RF_model_full_reduced.pickle')

# Model IV
file_RF_model_IV_contextualised = get_data_path(['output', 'models', 'model_IV'], 'RF_model_contextualised.pickle')
file_RF_model_IV_full = get_data_path(['output', 'models', 'model_IV'], 'RF_model_full.pickle')

In [3]:
# Load the training table from the CSV path configured earlier and preview its first rows.
training_df = pd.read_csv(file_path_training_data)
training_df.head()

Unnamed: 0,genepair,A1,A2,A1_entrez,A2_entrez,DepMap_ID,cell_line,Gemini_FDR,raw_LFC,SL,...,colocalisation,interact,n_total_ppi,fet_ppi_overlap,gtex_spearman_corr,gtex_min_mean_expr,gtex_max_mean_expr,GEMINI,LFC,SL_new
0,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000022,PATU8988S_PANCREAS,0.998944,0.088856,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.118768,0.088856,False
1,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000307,PK1_PANCREAS,0.986587,0.201704,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.132501,0.201704,False
2,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000632,HS944T_SKIN,1.0,0.069772,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.024593,0.069772,False
3,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000681,A549_LUNG,0.977988,0.379455,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,-0.241323,0.379455,False
4,A3GALT2_ABO,A3GALT2,ABO,127550.0,28.0,ACH-000756,GI1_CENTRAL_NERVOUS_SYSTEM,0.999586,-0.077118,False,...,0.0,False,3.0,0.0,0.114847,0.258739,11.702,0.299715,-0.077118,False


In [4]:
# Feature set 1: the columns passed to the "contextualised" cross-validation runs.
# The order is exactly the order in which the columns are fed to the classifier.
feature_columns_1 = [
    'rMaxExp_A1A2',
    'rMinExp_A1A2',
    'max_ranked_A1A2',
    'min_ranked_A1A2',
    'max_cn',
    'min_cn',
    'Protein_Altering',
    'Damaging',
    'min_sequence_identity',
    'prediction_score',
    'weighted_PPI_essentiality',
    'weighted_PPI_expression',
    'smallest_BP_GO_essentiality',
    'smallest_CC_GO_essentiality',
    'smallest_BP_GO_expression',
    'go_CC_expression',
]

# Name of the label column in the training table
target_column = 'SL_new'

print('num of features:', len(feature_columns_1))

num of features: 16


In [5]:
# Feature set 2 (used by the "full" runs): every column of feature set 1 followed
# by the additional columns below, with 'prediction_score' taken out afterwards.
feature_columns_2 = [
    *feature_columns_1,
    'closest',
    'WGD',
    'family_size',
    'cds_length_ratio',
    'shared_domains',
    'has_pombe_ortholog',
    'has_essential_pombe_ortholog',
    'has_cerevisiae_ortholog',
    'has_essential_cerevisiae_ortholog',
    'conservation_score',
    'mean_age',
    'either_in_complex',
    'mean_complex_essentiality',
    'colocalisation',
    'interact',
    'n_total_ppi',
    'fet_ppi_overlap',
    'gtex_spearman_corr',
    'gtex_min_mean_expr',
    'gtex_max_mean_expr',
]
# Drop the first occurrence; raises ValueError if the column is missing.
del feature_columns_2[feature_columns_2.index('prediction_score')]
print('num of features:', len(feature_columns_2))

num of features: 35


In [6]:
# Load splits_IV from pickle file to determine average train and test set sizes.
# The path is built from BASE_DIR (like every other path in this notebook) instead
# of a machine-specific absolute path, so the cell runs on any checkout of the repo.
# NOTE: pickle.load can execute arbitrary code - only load split files produced by this project.
file_splits_IV = get_data_path(['output', 'models', 'model_IV'], 'splits_IV.pkl')
with open(file_splits_IV, 'rb') as f:
    splits_IV = pickle.load(f)

# Calculate average train and test set sizes; each split is a (train_idx, test_idx) pair
train_sizes = []
test_sizes = []

for train_idx, test_idx in splits_IV:
    train_sizes.append(len(train_idx))
    test_sizes.append(len(test_idx))

avg_train_size = sum(train_sizes) / len(train_sizes)
avg_test_size = sum(test_sizes) / len(test_sizes)

print(f"Average training set size: {avg_train_size:.2f}")
print(f"Average test set size: {avg_test_size:.2f}")

Average training set size: 26396.04
Average test set size: 1649.64


## Model I - Random Selection

In [7]:
def safe_roc_auc(y_true, y_score):
    """
    ROC AUC computed over the samples whose score is not NaN.

    Parameters:
    - y_true: array-like of labels (list, ndarray or pandas Series)
    - y_score: array-like of scores; NaN / None entries are ignored

    Returns:
    - roc_auc_score on the non-NaN samples, or np.nan when fewer than two
      samples remain or the remaining labels contain a single class
    """
    # Coerce once so plain lists and object arrays containing None work as well
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score, dtype=float)

    mask = ~np.isnan(y_score)
    if np.sum(mask) < 2 or len(np.unique(y_true[mask])) < 2:
        return np.nan
    return roc_auc_score(y_true[mask], y_score[mask])

def safe_average_precision(y_true, y_score):
    """
    Average precision computed over the samples whose score is not NaN.

    Parameters:
    - y_true: array-like of binary labels (positives are the entries equal to 1 / True)
    - y_score: array-like of scores; NaN / None entries are ignored

    Returns:
    - average_precision_score on the non-NaN samples, or np.nan when no sample
      remains or none of the remaining samples is positive
    """
    # Coerce once so plain lists and object arrays containing None work as well
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score, dtype=float)

    mask = ~np.isnan(y_score)
    if not mask.any():
        return np.nan

    labels = y_true[mask]
    # Average precision is undefined without positives; report NaN (as
    # safe_roc_auc does for a single-class fold) so that nanmean/nanstd
    # skip the fold instead of averaging in a degenerate value.
    if not np.any(labels == 1):
        return np.nan
    return average_precision_score(labels, y_score[mask])

In [8]:
def validate_stratification(y_train, y_test, fold_num, model_name, tolerance=0.05):
    """
    Report how closely the positive rate of a train/test split follows the
    positive rate of the two sets combined.

    Parameters:
    - y_train, y_test: NumPy-style label arrays (must support .mean())
    - fold_num: fold number, used only in the printed report
    - model_name: model label, used only in the printed report
    - tolerance: largest accepted absolute difference between rates (default 0.05)

    Returns:
    - dict with the three rates, the three absolute differences and the list
      of warning messages (empty when every difference is within tolerance)
    """
    # Positive rates of each part and of both parts together
    rate_train = y_train.mean()
    rate_test = y_test.mean()
    rate_all = np.concatenate([y_train, y_test]).mean()

    # Absolute deviations
    dev_train = abs(rate_train - rate_all)
    dev_test = abs(rate_test - rate_all)
    gap = abs(rate_train - rate_test)

    report = [
        f"  [{model_name} - Fold {fold_num}] Stratification Check:",
        f"    Overall SL rate: {rate_all:.4f}",
        f"    Train SL rate:   {rate_train:.4f} (diff: {dev_train:.4f})",
        f"    Test SL rate:    {rate_test:.4f} (diff: {dev_test:.4f})",
        f"    Train/Test diff: {gap:.4f}",
    ]
    print("\n".join(report))

    # One message per deviation that exceeds the tolerance, in a fixed order
    checks = (
        ("Train set deviation", dev_train),
        ("Test set deviation", dev_test),
        ("Train/Test difference", gap),
    )
    warnings = [
        f"{label} ({value:.4f}) exceeds tolerance ({tolerance})"
        for label, value in checks
        if value > tolerance
    ]

    if not warnings:
        print("Stratification OK")
    else:
        print("WARNINGS:")
        for message in warnings:
            print(f"      - {message}")

    return {
        'train_pos_rate': rate_train,
        'test_pos_rate': rate_test,
        'overall_pos_rate': rate_all,
        'train_diff': dev_train,
        'test_diff': dev_test,
        'train_test_diff': gap,
        'warnings': warnings,
    }

def validate_group_splits(train_groups, test_groups, fold_num, model_name, group_type="groups"):
    """
    Check that no group appears in both the train and the test side of a fold.

    Prints the number of distinct groups on each side and the size of their
    intersection. Returns True when the intersection is empty, False otherwise.
    """
    unique_train, unique_test = set(train_groups), set(test_groups)
    shared = unique_train.intersection(unique_test)

    print(f"  [{model_name} - Fold {fold_num}] Group Validation:")
    print(f"    Train {group_type}: {len(unique_train)}")
    print(f"    Test {group_type}:  {len(unique_test)}")
    print(f"    Overlapping {group_type}: {len(shared)}")

    if not shared:
        print("No group leakage")
        return True

    print(f"GROUP LEAKAGE DETECTED: {shared}")
    return False

In [9]:
def stratified_downsample_indices(y, train_indices, target_n, random_state=42):
    """
    Reduce a set of training indices to `target_n` rows while keeping class balance.

    Parameters:
    - y: array-like of labels for the whole dataset (indexed positionally)
    - train_indices: positional indices of the fold's training rows
    - target_n: number of training rows to keep
    - random_state: seed for the random selection

    Returns:
    - NumPy array of indices taken from `train_indices`; the input indices are
      returned unchanged when there are already at most `target_n` of them
    """
    train_indices = np.asarray(train_indices)
    if train_indices.size <= target_n:
        return train_indices

    y_fold = np.asarray(y)[train_indices]
    classes, counts = np.unique(y_fold, return_counts=True)
    n_drop = train_indices.size - target_n

    if n_drop < classes.size:
        # StratifiedShuffleSplit rejects a split whose left-out part holds fewer
        # samples than there are classes (e.g. 26401 rows capped at 26400).
        # Removing so few rows cannot noticeably shift class proportions, so drop
        # them from the largest class, which keeps every class represented.
        rng = np.random.RandomState(random_state)
        pool = np.flatnonzero(y_fold == classes[np.argmax(counts)])
        if pool.size < n_drop:
            pool = np.arange(train_indices.size)
        dropped = rng.choice(pool, size=n_drop, replace=False)
        keep = np.ones(train_indices.size, dtype=bool)
        keep[dropped] = False
        return train_indices[keep]

    sss = StratifiedShuffleSplit(n_splits=1, train_size=target_n, random_state=random_state)
    # Only the labels matter for the split; the feature matrix is a placeholder
    sub_idx, _ = next(sss.split(np.zeros_like(y_fold), y_fold))
    return train_indices[sub_idx]

In [None]:
def model_contextualised_cross_validation(
    classifier, data, target, splits, verbose=True, model_name="Model",
    downsample_train_to=None, downsample_random_state=42
):
    """
    Cross-validate `classifier` over pre-computed splits and score four
    single-column baselines on the same test folds.

    For every (train, test) pair the training indices are optionally downsampled
    (stratified) to `downsample_train_to` rows, the classifier is refitted on
    them and evaluated on the complete test fold. The columns
    'prediction_score', 'min_sequence_identity', 'rMinExp_A1A2' and
    'min_ranked_A1A2' of `data` are additionally scored on their own.

    Parameters:
    - classifier: estimator exposing fit(X, y) and predict_proba(X)
    - data: DataFrame of features, indexed positionally; must contain the four
      baseline columns listed above
    - target: labels aligned row-by-row with `data` (pandas Series or array-like)
    - splits: sized sequence of (train_indices, test_indices) positional pairs
    - verbose: print ROC AUC and elapsed time after each fold
    - model_name: label used in the printed log
    - downsample_train_to: optional cap on the number of training rows per fold
    - downsample_random_state: seed handed to the downsampling helper

    Returns:
    - dict with per-fold ROC curves and AUC/AP values of the classifier, the
      mean ROC curve and its AUC, the pooled precision/recall curve, mean and
      standard deviation of AUC/AP for each baseline column, the stratification
      reports, the skipped fold numbers and the effective training sizes

    Folds with an empty or single-class test set are skipped and recorded in
    'skipped_folds'.
    """
    print(f"[DEBUG] model_contextualised_cross_validation v4 — downsample={downsample_train_to}")  # signature

    tprs, fprs, aucs = [], [], []
    mean_fpr = np.linspace(0, 1, 100)

    pred_aucs, seq_aucs, gene_expr_aucs, gene_ess_aucs = [], [], [], []
    aps, pred_aps, seq_aps, gene_expr_aps, gene_ess_aps = [], [], [], [], []

    y_real, y_proba = [], []

    stratification_results = []
    skipped_folds = []
    effective_train_sizes = []   # collect sizes

    total_splits = len(splits)
    start_time = time.time()

    # Ensure NumPy array for fast indexing
    y_all = target.values if hasattr(target, "values") else np.asarray(target)

    for fold_num, (train, test) in enumerate(splits, start=1):
        # ----- Print original sizes -----
        print(f"Fold {fold_num}:")
        print(f"  Original train size = {len(train)}, Test size = {len(test)}")

        if len(test) == 0:
            print(f"  Skipping fold {fold_num}: Empty test set")
            skipped_folds.append(fold_num)
            continue

        # ----- Downsample the TRAIN indices if requested -----
        if downsample_train_to is not None:
            train_sub = stratified_downsample_indices(
                y_all, train, target_n=downsample_train_to, random_state=downsample_random_state
            )
        else:
            train_sub = np.asarray(train)

        # ----- Print effective size (what we actually train on) -----
        eff_sz = len(train_sub)
        effective_train_sizes.append(eff_sz)
        print(
            f"  Effective train size after downsampling = {eff_sz}"
            f"{' (cap: ' + str(downsample_train_to) + ')' if downsample_train_to else ''}",
            flush=True
        )

        # Hard guard: when original > cap, ensure we hit exactly the cap
        if downsample_train_to is not None and len(train) > downsample_train_to:
            assert eff_sz == downsample_train_to, f"Expected {downsample_train_to}, got {eff_sz}"

        # Targets for this fold (downsampled train, full test)
        y_train = y_all[train_sub]
        y_test  = y_all[np.asarray(test)]

        # Validate stratification AFTER downsampling
        stratification_result = validate_stratification(y_train, y_test, fold_num, model_name)
        stratification_results.append(stratification_result)

        if np.unique(y_test).size < 2:
            print(f"  Skipping fold {fold_num}: Insufficient positive/negative samples in test set")
            print(f"  Test set classes: {np.unique(y_test)}")
            skipped_folds.append(fold_num)
            continue

        # ----- Fit on downsampled train; evaluate on full test -----
        # y_train is used instead of target.iloc[...] so that array targets work too;
        # the test rows are selected once and reused for the baselines below.
        X_test = data.iloc[test]
        y_pred_proba = classifier.fit(data.iloc[train_sub], y_train) \
                                 .predict_proba(X_test)[:, 1]

        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        tprs.append(tpr); fprs.append(fpr)

        aucs.append(roc_auc_score(y_test, y_pred_proba))
        aps.append(average_precision_score(y_test, y_pred_proba))

        # Baseline feature scores on test
        pred_scores = X_test['prediction_score'].values
        seq_scores = X_test['min_sequence_identity'].values
        expr_scores = X_test['rMinExp_A1A2'].values
        ess_scores = X_test['min_ranked_A1A2'].values

        pred_aucs.append(safe_roc_auc(y_test, pred_scores))
        seq_aucs.append(safe_roc_auc(y_test, seq_scores))
        gene_expr_aucs.append(safe_roc_auc(y_test, expr_scores))
        # 'min_ranked_A1A2' is scored in the reverse direction: 1 - AUC(x) equals AUC(-x)
        gene_ess_aucs.append(1 - safe_roc_auc(y_test, ess_scores))

        pred_aps.append(safe_average_precision(y_test, pred_scores))
        seq_aps.append(safe_average_precision(y_test, seq_scores))
        gene_expr_aps.append(safe_average_precision(y_test, expr_scores))
        # Use the same reversed direction as the AUC above; average precision has
        # no "1 - x" shortcut, so the score itself is negated.
        gene_ess_aps.append(safe_average_precision(y_test, -ess_scores))

        y_real.append(y_test); y_proba.append(y_pred_proba)

        if verbose:
            elapsed_time = time.time() - start_time
            print(f"  ROC AUC = {aucs[-1]:.4f}, Elapsed time = {elapsed_time:.2f} seconds")

    # ----- Aggregate curves/metrics -----
    if len(tprs) > 0:
        # Interpolate every fold's ROC curve onto the common FPR grid before averaging
        mean_tpr = np.mean([np.interp(mean_fpr, fprs[i], tprs[i]) for i in range(len(tprs))], axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
    else:
        mean_tpr, mean_auc, std_auc = np.array([]), np.nan, np.nan

    if len(y_real) > 0 and len(y_proba) > 0:
        # Precision/recall curve over the predictions of all evaluated folds pooled together
        y_real = np.concatenate(y_real); y_proba = np.concatenate(y_proba)
        precision, recall, _ = precision_recall_curve(y_real, y_proba)
    else:
        precision, recall = np.array([]), np.array([])

    # ----- Summary -----
    print(f"\n=== {model_name} Summary ===")
    if not np.isnan(mean_auc):
        print(f"Mean ROC AUC = {mean_auc:.4f} ± {std_auc:.4f}")
    print(f"Successful folds: {len(aucs)}/{total_splits}")
    if skipped_folds:
        print(f"Skipped folds: {skipped_folds}")

    if stratification_results:
        train_diffs = [r['train_diff'] for r in stratification_results]
        test_diffs  = [r['test_diff']  for r in stratification_results]
        print("Stratification quality:")
        print(f"  Mean train deviation: {np.mean(train_diffs):.4f} ± {np.std(train_diffs):.4f}")
        print(f"  Mean test deviation:  {np.mean(test_diffs):.4f} ± {np.std(test_diffs):.4f}")

    return {
        'tprs': tprs, 'fprs': fprs, 'aucs': aucs,
        'mean_tpr': mean_tpr, 'mean_fpr': mean_fpr, 'mean_auc': mean_auc, 'std_auc': std_auc,
        'seq_auc': np.nanmean(seq_aucs), 'seq_std_auc': np.nanstd(seq_aucs),
        'pred_auc': np.nanmean(pred_aucs), 'pred_std_auc': np.nanstd(pred_aucs),
        'gene_expr_auc': np.nanmean(gene_expr_aucs), 'gene_expr_std_auc': np.nanstd(gene_expr_aucs),
        'gene_ess_auc': np.nanmean(gene_ess_aucs), 'gene_ess_std_auc': np.nanstd(gene_ess_aucs),
        'precision': precision, 'recall': recall, 'aps': aps,
        'mean_aps': np.nanmean(aps), 'std_ap': np.nanstd(aps),
        'pred_ap': np.nanmean(pred_aps), 'pred_std_ap': np.nanstd(pred_aps),
        'seq_ap': np.nanmean(seq_aps), 'seq_std_ap': np.nanstd(seq_aps),
        'gene_expr_ap': np.nanmean(gene_expr_aps), 'gene_expr_std_ap': np.nanstd(gene_expr_aps),
        'gene_ess_ap': np.nanmean(gene_ess_aps), 'gene_ess_std_ap': np.nanstd(gene_ess_aps),
        'stratification_results': stratification_results,
        'skipped_folds': skipped_folds,
        'effective_folds': len(aucs),
        'effective_train_sizes': effective_train_sizes,
    }

In [None]:
def model_full_cross_validation(
    classifier, data, target, splits, verbose=True, model_name="Model",
    downsample_train_to=None, downsample_random_state=42
):
    """
    Cross-validate `classifier` over pre-computed splits.

    For every (train, test) pair the training indices are optionally downsampled
    (stratified) to `downsample_train_to` rows, the classifier is refitted on
    them and evaluated on the complete test fold.

    Parameters:
    - classifier: estimator exposing fit(X, y) and predict_proba(X)
    - data: DataFrame of features, indexed positionally
    - target: labels aligned row-by-row with `data` (pandas Series or array-like)
    - splits: sized sequence of (train_indices, test_indices) positional pairs
    - verbose: print ROC AUC and elapsed time after each fold
    - model_name: label used in the printed log
    - downsample_train_to: optional cap on the number of training rows per fold
    - downsample_random_state: seed handed to the downsampling helper

    Returns:
    - dict with per-fold ROC curves and AUC/AP values, the mean ROC curve and
      its AUC, the pooled precision/recall curve, the stratification reports,
      the skipped fold numbers and the effective training sizes

    Folds with an empty or single-class test set are skipped and recorded in
    'skipped_folds'.
    """
    print(f"[DEBUG] model_full_cross_validation v2 — downsample={downsample_train_to}")  # signature

    tprs, fprs, aucs = [], [], []
    mean_fpr = np.linspace(0, 1, 100)

    y_real, y_proba, aps = [], [], []

    stratification_results = []
    skipped_folds = []
    effective_train_sizes = []   # training rows actually used, one entry per non-empty fold

    total_splits = len(splits)
    start_time = time.time()

    # Ensure NumPy for reliable indexing
    y_all = target.values if hasattr(target, "values") else np.asarray(target)

    for fold_num, (train, test) in enumerate(splits, start=1):
        print(f"\n=== {model_name} - Fold {fold_num}/{total_splits} ===")
        print(f"  Original train size = {len(train)}, Test size = {len(test)}")

        if len(test) == 0:
            print(f"  Skipping fold {fold_num}: Empty test set")
            skipped_folds.append(fold_num)
            continue

        # --- Downsample TRAIN if requested ---
        if downsample_train_to is not None:
            train_sub = stratified_downsample_indices(
                y_all, train, target_n=downsample_train_to, random_state=downsample_random_state
            )
        else:
            train_sub = np.asarray(train)

        # Print effective size (what we actually train on)
        eff_sz = len(train_sub)
        effective_train_sizes.append(eff_sz)
        print(
            f"  Effective train size after downsampling = {eff_sz}"
            f"{' (cap: ' + str(downsample_train_to) + ')' if downsample_train_to else ''}",
            flush=True
        )

        # Guard: if the original was larger than the cap, ensure we hit the cap
        if downsample_train_to is not None and len(train) > downsample_train_to:
            assert eff_sz == downsample_train_to, f"Expected {downsample_train_to}, got {eff_sz}"

        # Targets for this fold (downsampled train, full test)
        y_train = y_all[train_sub]
        y_test  = y_all[np.asarray(test)]

        # Validate stratification AFTER downsampling
        stratification_result = validate_stratification(y_train, y_test, fold_num, model_name)
        stratification_results.append(stratification_result)

        if np.unique(y_test).size < 2:
            print(f"  Skipping fold {fold_num}: Insufficient positive/negative samples in test set")
            print(f"  Test set classes: {np.unique(y_test)}")
            skipped_folds.append(fold_num)
            continue

        # --- Train on downsampled train; evaluate on full test ---
        # y_train is used instead of target.iloc[...] so that array targets work too.
        y_pred_proba = classifier.fit(data.iloc[train_sub], y_train) \
                                 .predict_proba(data.iloc[test])[:, 1]

        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
        tprs.append(tpr); fprs.append(fpr)

        aucs.append(safe_roc_auc(y_test, y_pred_proba))
        aps.append(safe_average_precision(y_test, y_pred_proba))

        y_real.append(y_test)
        y_proba.append(y_pred_proba)

        if verbose:
            elapsed_time = time.time() - start_time
            print(f"  ROC AUC = {aucs[-1]:.4f}, Elapsed time = {elapsed_time:.2f} seconds")

    # Aggregate metrics
    if len(tprs) > 0:
        # Interpolate every fold's ROC curve onto the common FPR grid before averaging
        mean_tpr = np.mean([np.interp(mean_fpr, fprs[i], tprs[i]) for i in range(len(tprs))], axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
    else:
        mean_tpr, mean_auc, std_auc = np.array([]), np.nan, np.nan

    if len(y_real) > 0 and len(y_proba) > 0:
        # Precision/recall curve over the predictions of all evaluated folds pooled together
        y_real = np.concatenate(y_real)
        y_proba = np.concatenate(y_proba)
        precision, recall, _ = precision_recall_curve(y_real, y_proba)
    else:
        precision, recall = np.array([]), np.array([])

    # Summary
    print(f"\n=== {model_name} Summary ===")
    if not np.isnan(mean_auc):
        print(f"Mean ROC AUC = {mean_auc:.4f} ± {std_auc:.4f}")
    print(f"Successful folds: {len(aucs)}/{total_splits}")
    if skipped_folds:
        print(f"Skipped folds: {skipped_folds}")

    if stratification_results:
        train_diffs = [r['train_diff'] for r in stratification_results]
        test_diffs  = [r['test_diff']  for r in stratification_results]
        print("Stratification quality:")
        print(f"  Mean train deviation: {np.mean(train_diffs):.4f} ± {np.std(train_diffs):.4f}")
        print(f"  Mean test deviation:  {np.mean(test_diffs):.4f} ± {np.std(test_diffs):.4f}")

    return {
        'tprs': tprs, 'fprs': fprs, 'aucs': aucs,
        'mean_tpr': mean_tpr, 'mean_fpr': mean_fpr, 'mean_auc': mean_auc, 'std_auc': std_auc,
        'precision': precision, 'recall': recall, 'aps': aps,
        'mean_aps': np.nanmean(aps),
        'std_ap': np.nanstd(aps),
        'stratification_results': stratification_results,
        'skipped_folds': skipped_folds,
        'effective_folds': len(aucs),
        'effective_train_sizes': effective_train_sizes,
    }


In [11]:
# Labels and the two feature matrices
target = training_df[target_column]
data_1 = training_df[feature_columns_1]
data_2 = training_df[feature_columns_2]

# Random Forest classifier shared by all cross-validation runs
RF = RandomForestClassifier(
    n_estimators=600,
    max_depth=20,
    max_features=0.2,
    min_samples_leaf=4,
    random_state=42,
)

In [12]:
# Load the repeated Model I splits from the repo's output folder. The path is
# built from BASE_DIR instead of a machine-specific absolute path; pickle is
# already imported in the notebook's import cell.
# NOTE: pickle.load can execute arbitrary code - only load split files produced by this project.
file_repeated_splits_I = get_data_path(['output', 'models', 'model_I'], 'repeated_splits_I.pkl')
with open(file_repeated_splits_I, "rb") as f:
    repeated_splits_I = pickle.load(f)

In [None]:
# Model I, feature set 1: cross-validate the Random Forest over the repeated Model I splits.
model_I_contextualised_reduced = model_contextualised_cross_validation(
    classifier=RF,
    data=training_df[feature_columns_1],
    target=training_df[target_column],
    splits=repeated_splits_I,
    model_name="Model_I_contextualised_repeated",
    # Cap on training rows per fold; presumably chosen to match the average
    # Model IV training-set size (~26396) computed earlier - TODO confirm.
    downsample_train_to=26400,
    downsample_random_state=42,
)

[DEBUG] model_early_cross_validation v4 — downsample=26400
Fold 1:
  Original train size = 32995, Test size = 8249
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_I_early_repeated - Fold 1] Stratification Check:
    Overall SL rate: 0.0232
    Train SL rate:   0.0233 (diff: 0.0000)
    Test SL rate:    0.0232 (diff: 0.0001)
    Train/Test diff: 0.0001
Stratification OK
  ROC AUC = 0.9053, Elapsed time = 35.15 seconds
Fold 2:
  Original train size = 32995, Test size = 8249
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_I_early_repeated - Fold 2] Stratification Check:
    Overall SL rate: 0.0232
    Train SL rate:   0.0232 (diff: 0.0000)
    Test SL rate:    0.0233 (diff: 0.0000)
    Train/Test diff: 0.0001
Stratification OK
  ROC AUC = 0.9123, Elapsed time = 71.48 seconds
Fold 3:
  Original train size = 32995, Test size = 8249
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_I_early_repeated - Fold 3] Stratification

In [None]:
# Model I, feature set 2: same splits and same training-size cap as the feature-set-1 run.
model_I_full_reduced = model_full_cross_validation(
    classifier=RF,
    data=training_df[feature_columns_2],
    target=training_df[target_column],
    splits=repeated_splits_I,
    model_name="Model_I_full_reduced",
    downsample_train_to=26400,  # cap on training rows per fold
    downsample_random_state=42,
    verbose=True
)

[DEBUG] model_late_cross_validation v2 — downsample=26400

=== Model_I_late_reduced - Fold 1/50 ===
  Original train size = 32995, Test size = 8249
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_I_late_reduced - Fold 1] Stratification Check:
    Overall SL rate: 0.0232
    Train SL rate:   0.0233 (diff: 0.0000)
    Test SL rate:    0.0232 (diff: 0.0001)
    Train/Test diff: 0.0001
Stratification OK
  ROC AUC = 0.9169, Elapsed time = 62.55 seconds

=== Model_I_late_reduced - Fold 2/50 ===
  Original train size = 32995, Test size = 8249
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_I_late_reduced - Fold 2] Stratification Check:
    Overall SL rate: 0.0232
    Train SL rate:   0.0232 (diff: 0.0000)
    Test SL rate:    0.0233 (diff: 0.0000)
    Train/Test diff: 0.0001
Stratification OK
  ROC AUC = 0.9252, Elapsed time = 124.13 seconds

=== Model_I_late_reduced - Fold 3/50 ===
  Original train size = 32995, Test size = 8249
  Effective tra

In [None]:
# Save the Model I cross-validation results.
# The path variable defined in the configuration cell is spelled
# `file_RF_model_I_context_reduced`; referencing any other spelling raises NameError.
with open(file_RF_model_I_context_reduced, 'wb') as f:
    pickle.dump(model_I_contextualised_reduced, f)

with open(file_RF_model_I_full_reduced, 'wb') as f:
    pickle.dump(model_I_full_reduced, f)

## Model II
#### Same paralog pairs, different cell lines

In [21]:
# Read the repeated Model II splits. The path is built from BASE_DIR instead of
# a machine-specific absolute path.
# NOTE: pickle.load can execute arbitrary code - only load split files produced by this project.
file_repeated_splits_II = get_data_path(['output', 'models', 'model_II'], 'repeated_splits_II.pkl')
with open(file_repeated_splits_II, 'rb') as f:
    repeated_splits_II = pickle.load(f)

In [22]:
# Check every Model II split for cell lines that occur on both sides.
# The splits are consumed positionally by the cross-validation functions
# (data.iloc / NumPy indexing), so rows are selected with .iloc here as well;
# .loc would only agree while the frame keeps its default integer index.
model_II_leak_free = True
for i, (tr, te) in enumerate(repeated_splits_II, 1):
    train_cells = training_df['cell_line'].iloc[tr].unique()
    test_cells  = training_df['cell_line'].iloc[te].unique()
    model_II_leak_free &= validate_group_splits(train_cells, test_cells, i, "Model II", "cell lines")

# Stop here rather than relying on someone spotting a message in the long log
assert model_II_leak_free, "Model II splits share cell lines between train and test"

  [Model II - Fold 1] Group Validation:
    Train cell lines: 8
    Test cell lines:  2
    Overlapping cell lines: 0
No group leakage
  [Model II - Fold 2] Group Validation:
    Train cell lines: 8
    Test cell lines:  2
    Overlapping cell lines: 0
No group leakage
  [Model II - Fold 3] Group Validation:
    Train cell lines: 8
    Test cell lines:  2
    Overlapping cell lines: 0
No group leakage
  [Model II - Fold 4] Group Validation:
    Train cell lines: 8
    Test cell lines:  2
    Overlapping cell lines: 0
No group leakage
  [Model II - Fold 5] Group Validation:
    Train cell lines: 8
    Test cell lines:  2
    Overlapping cell lines: 0
No group leakage
  [Model II - Fold 6] Group Validation:
    Train cell lines: 8
    Test cell lines:  2
    Overlapping cell lines: 0
No group leakage
  [Model II - Fold 7] Group Validation:
    Train cell lines: 8
    Test cell lines:  2
    Overlapping cell lines: 0
No group leakage
  [Model II - Fold 8] Group Validation:
    Train cell 

In [None]:
# Model II, feature set 1: cross-validate the Random Forest over the repeated Model II splits.
model_II_contextualised_reduced = model_contextualised_cross_validation(
    classifier=RF,
    data=training_df[feature_columns_1],      # feature columns only; every column of `data` is passed to the classifier
    target=training_df[target_column],
    splits=repeated_splits_II,                # group CV splits (the check above reports no cell line shared by train and test)
    model_name="Model_II_contextualised_reduced",
    downsample_train_to=26_400,               # cap on training rows per fold
    downsample_random_state=42,
    verbose=True
)

[DEBUG] model_early_cross_validation v4 — downsample=26400
Fold 1:
  Original train size = 32973, Test size = 8271
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_II_early_reduced - Fold 1] Stratification Check:
    Overall SL rate: 0.0226
    Train SL rate:   0.0266 (diff: 0.0040)
    Test SL rate:    0.0097 (diff: 0.0129)
    Train/Test diff: 0.0170
Stratification OK
  ROC AUC = 0.9554, Elapsed time = 39.67 seconds
Fold 2:
  Original train size = 32978, Test size = 8266
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_II_early_reduced - Fold 2] Stratification Check:
    Overall SL rate: 0.0236
    Train SL rate:   0.0213 (diff: 0.0023)
    Test SL rate:    0.0311 (diff: 0.0075)
    Train/Test diff: 0.0098
Stratification OK
  ROC AUC = 0.9183, Elapsed time = 80.04 seconds
Fold 3:
  Original train size = 33018, Test size = 8226
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_II_early_reduced - Fold 3] Stratification

In [None]:
# Model II, feature set 2: same splits and same training-size cap as the feature-set-1 run.
model_II_full_reduced = model_full_cross_validation(
    classifier=RF,
    data=training_df[feature_columns_2],
    target=training_df[target_column],
    splits=repeated_splits_II,
    model_name="Model_II_full_reduced",
    downsample_train_to=26_400,  # cap on training rows per fold
    downsample_random_state=42,
    verbose=True
)

[DEBUG] model_late_cross_validation v2 — downsample=26400

=== Model_II_late_reduced - Fold 1/50 ===
  Original train size = 32973, Test size = 8271
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_II_late_reduced - Fold 1] Stratification Check:
    Overall SL rate: 0.0226
    Train SL rate:   0.0266 (diff: 0.0040)
    Test SL rate:    0.0097 (diff: 0.0129)
    Train/Test diff: 0.0170
Stratification OK
  ROC AUC = 0.9660, Elapsed time = 63.88 seconds

=== Model_II_late_reduced - Fold 2/50 ===
  Original train size = 32978, Test size = 8266
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_II_late_reduced - Fold 2] Stratification Check:
    Overall SL rate: 0.0236
    Train SL rate:   0.0213 (diff: 0.0023)
    Test SL rate:    0.0311 (diff: 0.0075)
    Train/Test diff: 0.0098
Stratification OK
  ROC AUC = 0.9323, Elapsed time = 116.17 seconds

=== Model_II_late_reduced - Fold 3/50 ===
  Original train size = 33018, Test size = 8226
  Effectiv

In [None]:
# Persist both Model II result dictionaries, one pickle file each
for out_path, cv_results in (
    (file_RF_model_II_contextualised_reduced, model_II_contextualised_reduced),
    (file_RF_model_II_full_reduced, model_II_full_reduced),
):
    with open(out_path, 'wb') as f:
        pickle.dump(cv_results, f)

## Model III
#### Different paralog pairs, same cell lines

In [28]:
# Read the repeated Model III splits. The path is built from BASE_DIR instead of
# a machine-specific absolute path.
# NOTE: pickle.load can execute arbitrary code - only load split files produced by this project.
file_repeated_splits_III = get_data_path(['output', 'models', 'model_III'], 'repeated_splits_III.pkl')
with open(file_repeated_splits_III, 'rb') as f:
    repeated_splits_III = pickle.load(f)

In [29]:
# Check every Model III split for gene pairs that occur on both sides.
# The splits are consumed positionally by the cross-validation functions
# (data.iloc / NumPy indexing), so rows are selected with .iloc here as well;
# .loc would only agree while the frame keeps its default integer index.
model_III_leak_free = True
for i, (tr, te) in enumerate(repeated_splits_III, 1):
    train_pairs = training_df['genepair'].iloc[tr].unique()
    test_pairs  = training_df['genepair'].iloc[te].unique()
    model_III_leak_free &= validate_group_splits(train_pairs, test_pairs, i, "Model III", "genepair")

# Stop here rather than relying on someone spotting a message in the long log
assert model_III_leak_free, "Model III splits share gene pairs between train and test"

  [Model III - Fold 1] Group Validation:
    Train genepair: 3337
    Test genepair:  833
    Overlapping genepair: 0
No group leakage
  [Model III - Fold 2] Group Validation:
    Train genepair: 3334
    Test genepair:  836
    Overlapping genepair: 0
No group leakage
  [Model III - Fold 3] Group Validation:
    Train genepair: 3336
    Test genepair:  834
    Overlapping genepair: 0
No group leakage
  [Model III - Fold 4] Group Validation:
    Train genepair: 3336
    Test genepair:  834
    Overlapping genepair: 0
No group leakage
  [Model III - Fold 5] Group Validation:
    Train genepair: 3337
    Test genepair:  833
    Overlapping genepair: 0
No group leakage
  [Model III - Fold 6] Group Validation:
    Train genepair: 3337
    Test genepair:  833
    Overlapping genepair: 0
No group leakage
  [Model III - Fold 7] Group Validation:
    Train genepair: 3333
    Test genepair:  837
    Overlapping genepair: 0
No group leakage
  [Model III - Fold 8] Group Validation:
    Train gene

In [None]:
# Model III, feature set 1: cross-validate the Random Forest over the repeated Model III splits.
model_III_contextualised_reduced = model_contextualised_cross_validation(
    classifier=RF,
    data=training_df[feature_columns_1],
    target=training_df[target_column],
    splits=repeated_splits_III,
    model_name="Model_III_contextualised_reduced",
    downsample_train_to=26_400,  # cap on training rows per fold
    downsample_random_state=42,
    verbose=True
)

[DEBUG] model_early_cross_validation v4 — downsample=26400
Fold 1:
  Original train size = 33027, Test size = 8217
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_III_early_reduced - Fold 1] Stratification Check:
    Overall SL rate: 0.0234
    Train SL rate:   0.0223 (diff: 0.0011)
    Test SL rate:    0.0269 (diff: 0.0035)
    Train/Test diff: 0.0046
Stratification OK
  ROC AUC = 0.8733, Elapsed time = 40.77 seconds
Fold 2:
  Original train size = 32987, Test size = 8257
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_III_early_reduced - Fold 2] Stratification Check:
    Overall SL rate: 0.0232
    Train SL rate:   0.0234 (diff: 0.0002)
    Test SL rate:    0.0224 (diff: 0.0008)
    Train/Test diff: 0.0010
Stratification OK
  ROC AUC = 0.9057, Elapsed time = 79.34 seconds
Fold 3:
  Original train size = 32975, Test size = 8269
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_III_early_reduced - Fold 3] Stratificat

In [None]:
# Model III, second feature set (feature_columns_2): run the full-feature
# cross-validation routine over the same repeated Model III splits,
# with the same training-size cap and downsampling seed as the contextualised run.
model_III_full_reduced = model_full_cross_validation(
    classifier=RF,
    data=training_df[feature_columns_2],  # feature matrix for this run
    target=training_df[target_column],
    splits=repeated_splits_III,  # precomputed (train, test) splits for Model III
    model_name="Model_III_full_reduced",
    downsample_train_to=26_400,  # cap on the per-fold training-set size
    downsample_random_state=42,  # fixed seed so the downsampling is repeatable
    verbose=True
)

[DEBUG] model_late_cross_validation v2 — downsample=26400

=== Model_III_late_reduced - Fold 1/50 ===
  Original train size = 33027, Test size = 8217
  Effective train size after downsampling = 26400 (cap: 26400)


  [Model_III_late_reduced - Fold 1] Stratification Check:
    Overall SL rate: 0.0234
    Train SL rate:   0.0223 (diff: 0.0011)
    Test SL rate:    0.0269 (diff: 0.0035)
    Train/Test diff: 0.0046
Stratification OK
  ROC AUC = 0.8870, Elapsed time = 53.39 seconds

=== Model_III_late_reduced - Fold 2/50 ===
  Original train size = 32987, Test size = 8257
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_III_late_reduced - Fold 2] Stratification Check:
    Overall SL rate: 0.0232
    Train SL rate:   0.0234 (diff: 0.0002)
    Test SL rate:    0.0224 (diff: 0.0008)
    Train/Test diff: 0.0010
Stratification OK
  ROC AUC = 0.9003, Elapsed time = 102.67 seconds

=== Model_III_late_reduced - Fold 3/50 ===
  Original train size = 32975, Test size = 8269
  Effective train size after downsampling = 26400 (cap: 26400)
  [Model_III_late_reduced - Fold 3] Stratification Check:
    Overall SL rate: 0.0232
    Train SL rate:   0.0234 (diff: 0.0003)
    Test SL rate:    0.022

In [None]:
# Sanity check on the downsampling cap: print the smallest and largest
# effective training-set size recorded across folds for each Model III run.
# Both numbers on each line should equal the requested cap (26_400).
print(min(model_III_contextualised_reduced['effective_train_sizes']),
      max(model_III_contextualised_reduced['effective_train_sizes']))
print(min(model_III_full_reduced['effective_train_sizes']),
      max(model_III_full_reduced['effective_train_sizes']))

26400 26400
26400 26400


In [None]:
# Persist the Model III cross-validation results (one pickle per feature set)
# to the model_III output paths built in the configuration cell.
# Existing files at those paths are overwritten ('wb').
with open(file_RF_model_III_contextualised_reduced, 'wb') as f:
    pickle.dump(model_III_contextualised_reduced, f)

with open(file_RF_model_III_full_reduced, 'wb') as f:
    pickle.dump(model_III_full_reduced, f)

## Model IV
#### Different paralog pairs, different cell lines

In [None]:
# --- Custom split functions ---
def create_disjoint_splits(df, n_splits=5, num_test_cell_lines=2, test_gene_fraction=0.25, random_state=None):
    """Build folds whose train and test sets share neither cell lines nor gene pairs.

    The unique cell lines and the unique gene pairs are each shuffled and cut
    into ``n_splits`` chunks. For fold ``k`` the test set holds the rows whose
    cell line AND gene pair both fall in chunk ``k``; the training set holds
    the rows whose cell line AND gene pair both fall in the remaining chunks.
    Rows that combine a test-chunk value with a train-chunk value are used by
    neither side of that fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'cell_line'`` and ``'genepair'``.
    n_splits : int, default 5
        Number of folds. Must be at least 2 and no larger than the number of
        unique cell lines or unique gene pairs.
    num_test_cell_lines, test_gene_fraction
        Accepted for backward compatibility but not used: fold sizes are
        determined by ``n_splits`` alone.
    random_state : int or None, default None
        Seed for the shuffles. A given seed yields the same folds as seeding
        the global NumPy RNG with it would, but the global RNG is left
        untouched. With ``None`` the global NumPy RNG is used as-is.

    Returns
    -------
    list of (train_index, test_index)
        One tuple per fold. Both elements are index *labels* of ``df``
        (suitable for ``df.loc``), not positional indices.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 2 or larger than the number of
        unique cell lines / gene pairs.
    AssertionError
        If a fold's train and test rows share a cell line or a gene pair.
    """
    if n_splits < 2:
        raise ValueError(f"n_splits must be at least 2, got {n_splits}")

    # Copy the unique values into plain, writable NumPy arrays so the in-place
    # shuffles never touch df and also work for pandas extension dtypes.
    unique_cell_lines = np.array(df['cell_line'].unique())
    unique_gene_pairs = np.array(df['genepair'].unique())

    if n_splits > len(unique_cell_lines) or n_splits > len(unique_gene_pairs):
        raise ValueError(
            f"n_splits={n_splits} exceeds the number of unique cell lines "
            f"({len(unique_cell_lines)}) or gene pairs ({len(unique_gene_pairs)}); "
            f"some folds would have an empty test set"
        )

    # A local RandomState reproduces the permutations that np.random.seed(seed)
    # followed by np.random.shuffle would give, without reseeding the global RNG
    # that other cells may rely on. With no seed, fall back to the global RNG.
    rng = np.random.RandomState(random_state) if random_state is not None else np.random

    # Shuffle and split cell lines and gene pairs into mutually exclusive groups
    # (cell lines first, then gene pairs: the order fixes the random stream).
    rng.shuffle(unique_cell_lines)
    cell_line_splits = np.array_split(unique_cell_lines, n_splits)

    rng.shuffle(unique_gene_pairs)
    gene_pair_splits = np.array_split(unique_gene_pairs, n_splits)

    cell_col = df['cell_line']
    pair_col = df['genepair']

    splits = []
    for fold in range(n_splits):
        test_cell_lines = cell_line_splits[fold]
        test_gene_pairs = gene_pair_splits[fold]

        train_cell_lines = np.concatenate([cell_line_splits[i] for i in range(n_splits) if i != fold])
        train_gene_pairs = np.concatenate([gene_pair_splits[i] for i in range(n_splits) if i != fold])

        # A row belongs to a side only if BOTH its cell line and its gene pair do.
        test_mask = (cell_col.isin(test_cell_lines) & pair_col.isin(test_gene_pairs)).to_numpy()
        train_mask = (cell_col.isin(train_cell_lines) & pair_col.isin(train_gene_pairs)).to_numpy()
        test_index = df.index[test_mask]
        train_index = df.index[train_mask]

        splits.append((train_index, test_index))

        # Look the groups up through the returned labels (exactly what callers
        # will do), once, and reuse them for the report and the leakage checks.
        train_pairs = df.loc[train_index, 'genepair']
        test_pairs = df.loc[test_index, 'genepair']
        train_cells = df.loc[train_index, 'cell_line']
        test_cells = df.loc[test_index, 'cell_line']

        n_overlap = np.isin(test_pairs.unique(), train_pairs.unique()).sum()

        print(f'[Fold {fold+1}] '
              f'# pairs (train): {train_pairs.nunique()} | '
              f'# pairs (test): {test_pairs.nunique()} | '
              f'# overlapping: {n_overlap} | '
              f'# cells (train): {train_cells.nunique()} | '
              f'# cells (test): {test_cells.nunique()}')

        # Leakage guard: no group may appear on both sides of a fold.
        assert len(set(train_cells.unique()) & set(test_cells.unique())) == 0, "Cell line leakage detected!"
        assert len(set(train_pairs.unique()) & set(test_pairs.unique())) == 0, "Gene pair leakage detected!"

    return splits

def repeated_custom_cv(df, n_splits=5, n_repeats=3, num_test_cell_lines=2, test_gene_fraction=0.25, random_state=None):
    """Repeat the disjoint cell-line / gene-pair split several times.

    ``create_disjoint_splits`` is called ``n_repeats`` times and the folds of
    every repeat are concatenated into one flat list, so the result holds
    ``n_repeats * n_splits`` ``(train_index, test_index)`` tuples.

    When ``random_state`` is given, repeat ``r`` (0-based) is seeded with
    ``random_state + r``; when it is ``None`` every repeat is unseeded.
    The remaining arguments are forwarded unchanged.
    """
    collected = []

    for rep_idx in range(n_repeats):
        # Offset the seed per repeat so each repeat gets a different partition.
        seed = None if random_state is None else random_state + rep_idx
        collected += create_disjoint_splits(
            df,
            n_splits=n_splits,
            num_test_cell_lines=num_test_cell_lines,
            test_gene_fraction=test_gene_fraction,
            random_state=seed,
        )

    return collected


In [None]:
# Model IV splits: 5 folds x 10 repeats = 50 (train_index, test_index) tuples.
# repeated_custom_cv seeds repeat r with random_state + r, i.e. seeds 42..51.
splits_IV = repeated_custom_cv(df=training_df, n_splits=5, n_repeats=10, random_state=42)

# Spot-check the Model IV splits for leakage on both grouping variables.
print("\n=== MODEL IV: COMBINED VALIDATION ===")
for i, (train_index, test_index) in enumerate(splits_IV[:5]):  # first 5 tuples only = the folds of the first repeat
    # The split indices are labels of training_df, hence .loc rather than .iloc.
    train_cells = training_df.loc[train_index, 'cell_line'].unique()
    test_cells = training_df.loc[test_index, 'cell_line'].unique()
    train_pairs = training_df.loc[train_index, 'genepair'].unique()
    test_pairs = training_df.loc[test_index, 'genepair'].unique()
    
    print(f"\n--- Fold {i+1} ---")
    # validate_group_splits is defined in an earlier cell; it is called once per grouping variable.
    validate_group_splits(train_cells, test_cells, i+1, "Model IV", "cell lines")
    validate_group_splits(train_pairs, test_pairs, i+1, "Model IV", "gene pairs")

In [None]:
# Save the Model IV splits next to the other Model IV outputs.
# The path is built with get_data_path (relative to BASE_DIR) like every other
# output of this notebook, instead of an absolute path that only exists on one
# machine and exposes a personal account name.
file_splits_IV = get_data_path(['output', 'models', 'model_IV'], 'splits_IV.pkl')
with open(file_splits_IV, 'wb') as f:
    pickle.dump(splits_IV, f)

In [None]:
# Model IV: evaluate both feature sets on the same disjoint cell-line / gene-pair splits.
# NOTE(review): unlike the Model III cells, no downsample_train_to / downsample_random_state /
# verbose arguments are passed here, and the inputs are data_1 / data_2 / target rather than
# training_df[feature_columns_*] / training_df[target_column]. These names must already exist
# in the kernel — confirm they are defined in an earlier cell.
model_IV_contextualised = model_contextualised_cross_validation(RF, data_1, target, splits_IV, model_name="Model IV contextualised")
model_IV_full = model_full_cross_validation(RF, data_2, target, splits_IV, model_name="Model IV Full")

In [None]:
# Persist the Model IV cross-validation results (one pickle per feature set)
# to the model_IV output paths built in the configuration cell.
# Existing files at those paths are overwritten ('wb').
with open(file_RF_model_IV_contextualised, 'wb') as f:
    pickle.dump(model_IV_contextualised, f)

with open(file_RF_model_IV_full, 'wb') as f:
    pickle.dump(model_IV_full, f)

## Plot ROC and PR Curves

In [None]:
def plot_model_performance(df, data, target, model_contextualised, model_full, file_path):
    """Draw ROC (left) and precision-recall (right) curves and save the figure.

    Four curves are drawn on each panel, in this order: the contextualised
    model, the full model, and two single-feature baselines taken from
    ``data`` ('prediction_score' and 'min_sequence_identity'). Legend entries
    are the bare numeric scores (2 decimals for the models, 3 for the
    baselines), plus a dashed reference line on each panel.

    Parameters
    ----------
    df : pandas.DataFrame
        Only ``df['SL_new']`` is read; its mean is the PR no-skill baseline.
    data : pandas.DataFrame
        Must contain the baseline columns 'prediction_score' and
        'min_sequence_identity'. Rows where a baseline is NaN are dropped
        for that baseline only.
    target : array-like aligned with ``data``
        Binary labels used for the baseline curves.
    model_contextualised, model_full : mapping
        Cross-validation results with the keys 'mean_fpr', 'mean_tpr',
        'mean_auc', 'precision', 'recall' and 'mean_aps'.
        ``model_contextualised`` must additionally provide 'pred_auc',
        'seq_auc', 'pred_ap' and 'seq_ap': the baseline legend values are
        read from there, while the baseline curves themselves are computed
        here from ``data`` and ``target``.
    file_path : str or path-like
        Destination of the saved figure (400 dpi). Missing parent
        directories are created.

    Returns
    -------
    None
        The figure is written to ``file_path`` and then closed, including
        when plotting or saving raises.
    """
    sns.set_context('paper')
    f, ax = plt.subplots(1, 2, figsize=(9.5, 4.5))  # ax[0]: ROC, ax[1]: PR

    try:
        # --- Cross-validated models: contextualised first, then full ---
        model_styles = [
            (model_contextualised, {'color': '#009E73'}),
            (model_full, {'color': '#CC79A7', 'alpha': 0.8}),
        ]
        for results, style in model_styles:
            RocCurveDisplay(
                fpr=results['mean_fpr'],
                tpr=results['mean_tpr'],
                roc_auc=results['mean_auc']
            ).plot(ax=ax[0], linewidth=1.75, label=f"{results['mean_auc']:.2f}", **style)

            PrecisionRecallDisplay(
                precision=results['precision'],
                recall=results['recall'],
                average_precision=results['mean_aps']
            ).plot(ax=ax[1], linewidth=1.75, label=f"{results['mean_aps']:.2f}", **style)

        # --- Additional classifier-like metrics (single-feature baselines) ---
        metrics = ['prediction_score', 'min_sequence_identity']
        colors = ['#777777', '#E69F00']
        auc_labels = ['pred_auc', 'seq_auc']
        ap_labels = ['pred_ap', 'seq_ap']

        for metric, color, auc_label, ap_label in zip(metrics, colors, auc_labels, ap_labels):
            # 'min_ranked_A1A2' would be negated if it were ever listed above;
            # neither current metric is, so both are used as-is.
            score = data[metric] if metric != 'min_ranked_A1A2' else -data[metric]
            mask = ~score.isna()
            y_true = target[mask]
            y_score = score[mask]

            if len(np.unique(y_true)) < 2:
                print(f"Skipping {metric}: not enough label variation after NaN removal.")
                continue

            # ROC curve: shape from the data, legend value from the stored results.
            fpr, tpr, _ = roc_curve(y_true, y_score)
            roc_auc = model_contextualised[auc_label]
            roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
            roc_display.plot(ax=ax[0], color=color, linewidth=1.75, label=f"{roc_auc:.3f}")

            # PR curve: same convention.
            precision, recall, _ = precision_recall_curve(y_true, y_score)
            pr_auc = model_contextualised[ap_label]
            pr_display = PrecisionRecallDisplay(precision=precision, recall=recall, average_precision=pr_auc)
            pr_display.plot(ax=ax[1], color=color, linewidth=1.75, label=f"{pr_auc:.3f}")

        # --- Reference lines (drawn last so they are the last legend entries) ---
        ax[0].plot([0, 1], [0, 1], color="lightgrey", linestyle="--", label="0.50")  # ROC chance line
        no_skill = round(sum(df['SL_new']) / len(df['SL_new']), 3)  # positive rate = PR baseline
        ax[1].plot([0, 1], [no_skill, no_skill], linestyle='--', color='lightgrey', label=f"{no_skill:.2f}")

        # --- Styling shared by both panels ---
        for panel in ax:
            panel.set_xlim([-0.025, 1.025])
            panel.set_ylim([-0.025, 1.025])
            panel.spines.top.set(visible=False)
            panel.spines.right.set(visible=False)
            panel.tick_params(axis='both', which='major', labelsize=11)

        ax[0].set_xlabel('False Positive Rate', fontsize=12)
        ax[0].set_ylabel('True Positive Rate', fontsize=12)
        ax[0].legend(loc='lower right', fontsize=12, bbox_to_anchor=(1.02, 0))

        ax[1].set_xlabel('Recall', fontsize=12)
        ax[1].set_ylabel('Precision', fontsize=12)
        ax[1].legend(loc='upper right', fontsize=12)

        f.tight_layout(h_pad=0.7)

        # Make sure the destination folder exists before writing the file.
        if isinstance(file_path, (str, os.PathLike)):
            out_dir = os.path.dirname(file_path)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)
        f.savefig(file_path, bbox_inches='tight', dpi=400)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(f)


In [None]:
# Save the ROC / PR comparison figure for Model I (contextualised vs full feature set)
# as figures/supp3_a_model_I.png under BASE_DIR.
figure_path_model1 = get_data_path(['figures'], 'supp3_a_model_I.png')
plot_model_performance(training_df, data_1, target, model_I_contextualised_reduced, model_I_full_reduced, figure_path_model1)

In [None]:
# Save the ROC / PR comparison figure for Model II (contextualised vs full feature set)
# as figures/supp3_a_model_II.png under BASE_DIR.
figure_path_model2 = get_data_path(['figures'], 'supp3_a_model_II.png')
plot_model_performance(training_df, data_1, target, model_II_contextualised_reduced, model_II_full_reduced, figure_path_model2)

In [None]:
# Save the ROC / PR comparison figure for Model III (contextualised vs full feature set)
# as figures/supp3_a_model_III.png under BASE_DIR.
figure_path_model3 = get_data_path(['figures'], 'supp3_a_model_III.png')
plot_model_performance(training_df, data_1, target, model_III_contextualised_reduced, model_III_full_reduced, figure_path_model3)

In [None]:
def generate_validation_report(model_results, model_name):
    """Print a plain-text summary of one model's cross-validation results.

    The report always contains a header with ``model_name`` and a performance
    section built from the 'mean_auc', 'std_auc', 'mean_aps', 'std_ap' and
    'effective_folds' entries of ``model_results``. A "Skipped folds" line is
    added when a non-empty 'skipped_folds' entry exists. When a non-empty
    'stratification_results' entry exists (a list of per-fold dicts with
    'train_diff', 'test_diff' and 'warnings'), the mean and standard
    deviation of the deviations and the total warning count are printed too.

    Nothing is returned; all output goes to stdout.
    """
    rule = '=' * 50
    print(f"\n{rule}")
    print(f"VALIDATION REPORT: {model_name}")
    print(rule)

    # Headline metrics
    print("Performance:")
    print("  Mean ROC AUC: {:.4f} ± {:.4f}".format(model_results['mean_auc'], model_results['std_auc']))
    print("  Mean PR AUC:  {:.4f} ± {:.4f}".format(model_results['mean_aps'], model_results['std_ap']))
    print("  Effective folds: {}".format(model_results['effective_folds']))

    # Optional: folds that were skipped during cross-validation
    skipped = model_results['skipped_folds'] if 'skipped_folds' in model_results else None
    if skipped:
        print(f"  Skipped folds: {skipped}")

    # Optional: stratification summary; leave early when there is nothing to report
    if 'stratification_results' not in model_results:
        return
    per_fold = model_results['stratification_results']
    if not per_fold:
        return

    train_dev = np.array([entry['train_diff'] for entry in per_fold])
    test_dev = np.array([entry['test_diff'] for entry in per_fold])
    n_warnings = 0
    for entry in per_fold:
        n_warnings += len(entry['warnings'])

    print("\nStratification Quality:")
    print("  Mean train deviation: {:.4f} ± {:.4f}".format(np.mean(train_dev), np.std(train_dev)))
    print("  Mean test deviation:  {:.4f} ± {:.4f}".format(np.mean(test_dev), np.std(test_dev)))
    print("  Total warnings: {}".format(n_warnings))

    verdict = ("Some folds had stratification issues" if n_warnings > 0
               else "All folds passed stratification checks")
    print(verdict)

# Reload the Model IV results from disk (they are the only results re-read here).
# NOTE: pickle.load executes arbitrary code from the file — only load pickles
# that this notebook itself produced.
with open(file_RF_model_IV_contextualised, 'rb') as file:
    model_IV_contextualised = pickle.load(file)

with open(file_RF_model_IV_full, 'rb') as file:
    model_IV_full = pickle.load(file)

# Generate reports for all models: (results, display name) pairs.
# NOTE(review): the Model I-III result objects are taken from kernel memory and
# are not reloaded in this cell — confirm the cells that create or load them
# have been run first.
all_models = [
    (model_I_contextualised_reduced, "Model I contextualised"),
    (model_I_full_reduced, "Model I Full"),
    (model_II_contextualised_reduced, "Model II contextualised"),
    (model_II_full_reduced, "Model II Full"),
    (model_III_contextualised_reduced, "Model III contextualised"),
    (model_III_full_reduced, "Model III Full"),
    (model_IV_contextualised, "Model IV contextualised"),
    (model_IV_full, "Model IV Full")
]

# Print one validation report per model, in the order listed above.
for model_result, model_name in all_models:
    generate_validation_report(model_result, model_name)


VALIDATION REPORT: Model I Early
Performance:
  Mean ROC AUC: 0.9161 ± 0.0095
  Mean PR AUC:  0.3429 ± 0.0349
  Effective folds: 50

Stratification Quality:
  Mean train deviation: 0.0000 ± 0.0000
  Mean test deviation:  0.0001 ± 0.0000
All folds passed stratification checks

VALIDATION REPORT: Model I Late
Performance:
  Mean ROC AUC: 0.9275 ± 0.0088
  Mean PR AUC:  0.3924 ± 0.0351
  Effective folds: 50

Stratification Quality:
  Mean train deviation: 0.0000 ± 0.0000
  Mean test deviation:  0.0001 ± 0.0000
All folds passed stratification checks

VALIDATION REPORT: Model II Early
Performance:
  Mean ROC AUC: 0.8974 ± 0.0339
  Mean PR AUC:  0.2751 ± 0.0598
  Effective folds: 50

Stratification Quality:
  Mean train deviation: 0.0021 ± 0.0013
  Mean test deviation:  0.0067 ± 0.0043
All folds passed stratification checks

VALIDATION REPORT: Model II Late
Performance:
  Mean ROC AUC: 0.9138 ± 0.0311
  Mean PR AUC:  0.3312 ± 0.0696
  Effective folds: 50

Stratification Quality:
  Mean trai