In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import gc
import time
import subprocess
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
import polars as pl
from typing import Dict, List, Tuple, Optional, Union
from pathlib import Path
from tqdm import tqdm

In [2]:
import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit.Avalon.pyAvalonTools import GetAvalonFP

In [3]:
import xgboost as xgb
import lightgbm as lgb

  from optuna import progress_bar as pbar_module


In [4]:
import tensorflow as tf
from tensorflow.keras import layers, regularizers, optimizers, losses, metrics
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

2025-07-06 18:17:27.437861: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751793447.459396   70416 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751793447.466381   70416 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751793447.485814   70416 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751793447.485842   70416 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751793447.485844   70416 computation_placer.cc:177] computation placer alr

In [5]:
import tensorflow as tf
from tensorflow.keras import layers, regularizers, optimizers, losses, metrics

In [6]:
# Reset Keras' global state before configuring the devices.
tf.keras.backend.clear_session()
# `gpus` is an empty list when TensorFlow sees no GPU, in which case nothing below runs.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth on every visible GPU.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # NOTE(review): presumably raised when the GPUs were already initialised
        # (e.g. the cell is re-run in a live kernel) - confirm. Only reported, not re-raised.
        print(e)

In [7]:
from pathlib import Path
from extra_code.molecular_loader_with_fp import (
    # Main functions
    load_split_data,
    extract_xy_from_data,
    build_fingerprints_for_splits,
    create_all_fp_combinations,
    get_train_test_data,
    create_summary_dataframe,
    run_pipeline,
    
    # Helper functions
    generate_fp_combos,
    get_fp_combos,
    print_configuration,
)

In [8]:
from pathlib import Path
# Output locations used by this notebook. Both directories (and any missing
# parents) are created up front; re-running the cell is harmless.
target_path = Path("result/3_solubility_fp_comp")
out_root = Path("./result/fingerprint")

target_path.mkdir(parents=True, exist_ok=True)
out_root.mkdir(parents=True, exist_ok=True)

In [9]:
# Dataset name -> two-letter dataset code.
abbr_map = {
    "delaney-processed": "de",
    "Lovric2020_logS0": "lo",
    "ws496_logS": "ws",
    "huusk": "hu",
}

# Split strategy name -> two-letter split code.
suffix_map = {
    "random": "rm",
    "scaffold": "sc",
    "chemical_space_coverage": "cs",
    "cluster": "cl",
    "physchem": "pc",
    "activity_cliff": "ac",
    "solubility_aware": "sa",
    "time_series": "ti",
    "ensemble": "en",
}

In [10]:

# Seed shared by every stochastic step in this notebook.
RANDOM_STATE = 42

# Two-letter dataset and split codes handed to the data pipeline.
DATASETS = ["ws", "de", "lo", "hu"]
SPLITS = ["rm", "sc", "cs", "cl", "pc", "ac", "sa", "ti", "en"]

# Base fingerprints; the combinations are derived from this list.
FINGERPRINT_TYPES = ["morgan", "maccs", "avalon"]

# Multi-fingerprint combinations, written as "+"-joined names.
PLOT_COMBOS = {
    "morgan+maccs",
    "morgan+avalon",
    "maccs+avalon",
    "morgan+maccs+avalon",
}

In [11]:
# Print the loader module's default configuration for reference.
# NOTE(review): called without arguments, so this presumably reports the module's
# own defaults rather than the constants defined in this notebook - confirm they match.
print_configuration()

=== Default Configuration ===

Fingerprint Types: ['morgan', 'maccs', 'avalon']

Datasets: ['ws', 'de', 'lo', 'hu']
Dataset Mapping: {'delaney-processed': 'de', 'Lovric2020_logS0': 'lo', 'ws496_logS': 'ws', 'huusk': 'hu'}

Splits: ['rm', 'sc', 'cs', 'cl', 'pc', 'ac', 'sa', 'ti', 'en']
Split Mapping: {'rm': 'random', 'sc': 'scaffold', 'cs': 'chemical_space_coverage', 'cl': 'cluster', 'pc': 'physchem', 'ac': 'activity_cliff', 'sa': 'solubility_aware', 'ti': 'time_series', 'en': 'ensemble'}

Fingerprint Sizes: {'morgan': 2048, 'maccs': 167, 'avalon': 512}

SMILES Columns: ['SMILES', 'smiles', 'Smiles', 'SMILE', 'smile', 'molecule']

Target Columns: ['target', 'Target', 'logS', 'LogS', 'logs'] ...

Fingerprint Combinations (7):
  morgan
  maccs
  avalon
  morgan+maccs
  morgan+avalon
  maccs+avalon
  morgan+maccs+avalon


In [12]:
# Build the fingerprint combinations from the base types; each entry is an
# iterable of fingerprint names that is joined with "+" for display.
FP_COMBOS = get_fp_combos(FINGERPRINT_TYPES)
print(f"\nGenerated {len(FP_COMBOS)} fingerprint combinations:")
# Numbered listing, starting at 1. `i` and `combo` remain defined as notebook globals afterwards.
for i, combo in enumerate(FP_COMBOS, 1):
    print(f"{i}. {'+'.join(combo)}")


Generated 7 fingerprint combinations:
1. morgan
2. maccs
3. avalon
4. morgan+maccs
5. morgan+avalon
6. maccs+avalon
7. morgan+maccs+avalon


In [13]:
# Run the full data pipeline (load splits, extract SMILES/targets, build
# fingerprints, build fingerprint combinations) and keep all five results.
# `y_map` and `all_combinations` are the ones consumed by the cells below.
# NOTE(review): n_jobs=16 is hard-coded; presumably a worker count - confirm it suits the host.
# NOTE(review): force_rebuild=False / use_cache=True presumably reuse cached fingerprints - confirm.
data_dict, x_map, y_map, fp_map, all_combinations = run_pipeline(
    datasets=DATASETS,
    splits=SPLITS,    
    fingerprint_types=FINGERPRINT_TYPES,  # ["morgan", "maccs", "avalon"]
    fp_combos=FP_COMBOS,
    n_jobs=16,
    force_rebuild=False,
    use_cache=True,
    create_combinations=True
)

=== Molecular Data Processing Pipeline ===

Configuration:
  Datasets: ['ws', 'de', 'lo', 'hu']
  Splits: ['rm', 'sc', 'cs', 'cl', 'pc', 'ac', 'sa', 'ti', 'en']
  Fingerprint types: ['morgan', 'maccs', 'avalon']
  Fingerprint sizes: {'morgan': 2048, 'maccs': 167, 'avalon': 512}

Step 1: Loading data...
Loaded 9 splits

Step 2: Extracting SMILES and targets...
Extracted 72 dataset-phase combinations

Step 3: Generating fingerprints...


Generating fingerprints: 100%|██████████| 72/72 [00:00<00:00, 210.54it/s]



Step 4: Creating fingerprint combinations...
Creating combination: morgan
Creating combination: maccs
Creating combination: avalon
Creating combination: morgan+maccs
Creating combination: morgan+avalon
Creating combination: maccs+avalon
Creating combination: morgan+maccs+avalon

Created 7 combinations

Feature dimensions by combination:
  avalon: 512 features
  maccs: 167 features
  maccs+avalon: 679 features
  morgan: 2048 features
  morgan+avalon: 2560 features
  morgan+maccs: 2215 features
  morgan+maccs+avalon: 2727 features

=== Pipeline Complete ===


In [14]:
# Tabulate the generated feature sets; the cells below read the columns
# 'phase', 'dataset', 'n_molecules', 'combination' and 'n_features' from this frame.
summary_df = create_summary_dataframe(all_combinations)
print(f"\nPipeline completed successfully!")
print(f"Summary shape: {summary_df.shape}")


Pipeline completed successfully!
Summary shape: (504, 8)


In [15]:
# Show data structure: the top-level keys of the split map and of the combination map.
print("Data structure:")
print(f"Splits loaded: {list(x_map.keys())}")
print(f"Fingerprint combinations created: {list(all_combinations.keys())}")

# Show dataset sizes
# Only the first 'train' row of each dataset group is reported.
# NOTE(review): if train sizes differ between splits, this shows just one of them.
print("\nDataset sizes (train):")
train_summary = summary_df[summary_df['phase'] == 'train'].groupby('dataset')['n_molecules'].first()
for dataset, size in train_summary.items():
    print(f"  {dataset}: {size} molecules")

# Show feature dimensions
# Likewise the first 'train' row per combination.
print("\nFeature dimensions by combination:")
feature_dims = summary_df[summary_df['phase'] == 'train'].groupby('combination')['n_features'].first()
for combo, dims in feature_dims.items():
    print(f"  {combo}: {dims} features")

Data structure:
Splits loaded: ['rm', 'sc', 'cs', 'cl', 'pc', 'ac', 'sa', 'ti', 'en']
Fingerprint combinations created: ['morgan', 'maccs', 'avalon', 'morgan+maccs', 'morgan+avalon', 'maccs+avalon', 'morgan+maccs+avalon']

Dataset sizes (train):
  de: 902 molecules
  hu: 1032 molecules
  lo: 663 molecules
  ws: 396 molecules

Feature dimensions by combination:
  avalon: 512 features
  maccs: 167 features
  maccs+avalon: 679 features
  morgan: 2048 features
  morgan+avalon: 2560 features
  morgan+maccs: 2215 features
  morgan+maccs+avalon: 2727 features


In [16]:

# Example: Get data for specific combination
# These three names are notebook globals and stay defined for later cells.
combination = 'morgan+maccs'
split = 'rm'
dataset = 'de'

try:
    # Look up the train/test features and targets for one (combination, split, dataset).
    X_train, X_test, y_train, y_test = get_train_test_data(
        all_combinations, y_map, combination, split, dataset
    )
    
    print(f"Data retrieved for {combination} fingerprints:")
    print(f"  Dataset: {dataset}")
    print(f"  Split: {split}")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_test: {X_test.shape}")
    print(f"  y_train: {len(y_train)} values, range [{min(y_train):.2f}, {max(y_train):.2f}]")
    print(f"  y_test: {len(y_test)} values, range [{min(y_test):.2f}, {max(y_test):.2f}]")
    
except KeyError as e:
    # An unknown key was requested: report it and list the valid combinations,
    # plus the split keys found under the first combination.
    print(f"Error: {e}")
    print("\nAvailable combinations:", list(all_combinations.keys()))
    print("Available splits:", list(all_combinations[list(all_combinations.keys())[0]].keys()))


Data retrieved for morgan+maccs fingerprints:
  Dataset: de
  Split: rm
  X_train: (902, 2215)
  X_test: (226, 2215)
  y_train: 902 values, range [-11.60, 1.58]
  y_test: 226 values, range [-8.33, 1.57]


In [17]:
# Peek at the nested layout of y_map and fp_map: split code -> "<dataset>_<phase>" keys.
# NOTE: the loop variable `split` overwrites the global `split` set in the example cell above.
print("y_map structure:")
for split in list(y_map.keys())[:2]:  # only the first 2 splits
    print(f"  {split}: {list(y_map[split].keys())}")

print("\nfp_map structure:")
for split in list(fp_map.keys())[:2]:  # only the first 2 splits
    print(f"  {split}: {list(fp_map[split].keys())}")

y_map structure:
  rm: ['de_train', 'hu_train', 'lo_train', 'ws_train', 'hu_test', 'ws_test', 'de_test', 'lo_test']
  sc: ['ws_train', 'de_train', 'hu_train', 'lo_train', 'lo_test', 'ws_test', 'de_test', 'hu_test']

fp_map structure:
  rm: ['de_train', 'hu_train', 'lo_train', 'ws_train', 'hu_test', 'ws_test', 'de_test', 'lo_test']
  sc: ['ws_train', 'de_train', 'hu_train', 'lo_train', 'lo_test', 'ws_test', 'de_test', 'hu_test']


In [18]:
from tensorflow.keras import mixed_precision
# Set the global Keras dtype policy to mixed_float16 for models built after this point.
# new_model() pins its output layer to float32 explicitly.
mixed_precision.set_global_policy('mixed_float16')

In [19]:
# Training hyper-parameters (notebook globals).
BATCHSIZE = 32
EPOCHS = 100
# Learning rate; read as a global by new_model() when it compiles the optimizer.
lr = 0.001

In [20]:
# def new_model(input_dim):
#     model = tf.keras.Sequential([
#         layers.Input(shape=(input_dim,)),
#         layers.Dense(
#             units=1024,
#             activation='relu',
#             kernel_initializer='glorot_uniform',
#             kernel_regularizer=regularizers.l2(1e-5)),
#         layers.BatchNormalization(),
#         layers.Dropout(0.2),
#         layers.Dense(
#             units=496,
#             kernel_initializer='glorot_uniform',
#             kernel_regularizer=regularizers.l2(1e-5),
#             activation='relu'),
#         layers.BatchNormalization(),
#         layers.Dropout(0.2),
#         layers.Dense(units=1, dtype='float32')
#     ])
#     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), 
#                   loss=tf.keras.losses.MeanSquaredError(),
#                   metrics=[
#                       metrics.RootMeanSquaredError(name='rmse'),
#                       metrics.MeanAbsoluteError(name='mae')
#                   ])
#     return model

In [21]:
# def run_model_benchmark(X_train, y_train, X_test, y_test,
#                         split_type: str,
#                         combo: tuple,
#                         target_path: Path):
#     """
#     combo    : ('morgan', 'maccs') …
#     filename : <ds>_<split>_fp_<tag>.csv/png
#     """
#     # if len(combo) == 3:
#     #     tag = "all_comp"
#     # elif len(combo) == 2:
#     #     tag = "_a_".join(combo)
#     # else:
#     #     tag = combo[0]

#     # base_name = f"{split_type}_fp_{tag}" 

#     X_tr, X_te = map(np.asarray, (X_train, X_test))
#     y_tr, y_te = map(np.asarray, (y_train, y_test))

#     models = {
#         # "Ridge":        Ridge(),
#         # "SVR":          SVR(),
#         # "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1),
#         # "XGBoost":      xgb.XGBRegressor(random_state=RANDOM_STATE, n_jobs=-1),
#         # "LightGBM":     lgb.LGBMRegressor(random_state=RANDOM_STATE, n_jobs=-1, verbose=-1),
#         "DNN":          "Tensorflow model",
#     }

#     results = {}
#     for name, mdl in models.items():
#         if name == "DNN":
#             # GPU memory usage
#             import tensorflow as tf
#             gpus = tf.config.experimental.list_physical_devices('GPU')
#             if gpus:
#                 try:
#                     for gpu in gpus:
#                         tf.config.experimental.set_memory_growth(gpu, True)
#                 except RuntimeError as e:
#                     print(e)
            
#             ds_full = tf.data.Dataset.from_tensor_slices((X_tr, y_tr))
#             ds_full = ds_full.shuffle(len(X_tr), seed=RANDOM_STATE)
#             val_sz  = int(len(X_tr) * 0.2)
#             val_ds  = ds_full.take(val_sz).batch(BATCHSIZE).prefetch(tf.data.AUTOTUNE)
#             tr_ds   = ds_full.skip(val_sz).batch(BATCHSIZE).prefetch(tf.data.AUTOTUNE)

#             # clear session
#             tf.keras.backend.clear_session()
            
#             model = new_model(X_tr.shape[1])
#             cb_es = tf.keras.callbacks.EarlyStopping(patience=15, restore_best_weights=True)
#             cb_rl = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3)
            
#             print(f"        Training DNN model...")
#             hist  = model.fit(tr_ds, epochs=EPOCHS, validation_data=val_ds,
#                             callbacks=[cb_es, cb_rl], verbose=0)
            
#             print(f"        Predicting...")
#             preds = model.predict(X_te, verbose=0).flatten()
            
#             # strict memory managing
#             del hist
#             del model
#             del ds_full
#             del val_ds
#             del tr_ds
#             tf.keras.backend.clear_session()
#             gc.collect()
            
#             # clean gpu memory
#             if gpus:
#                 tf.config.experimental.reset_memory_stats('GPU:0')
#         else:
#             mdl.fit(X_tr, y_tr)
#             preds = mdl.predict(X_te)

#         results[name] = dict(
#             RMSE=np.sqrt(mean_squared_error(y_te, preds)),
#             MAE =mean_absolute_error(y_te, preds),
#             R2  =r2_score(y_te, preds),
#             predictions=preds
#         )

#     return results

In [22]:
def new_model(input_shape):
    """Build and compile the fully connected regression network.

    Architecture: input -> [Dense(1024) -> BatchNorm -> Dropout(0.2)]
    -> [Dense(496) -> BatchNorm -> Dropout(0.2)] -> Dense(1).
    Hidden Dense layers use ReLU, Glorot-uniform initialisation and an L2
    kernel penalty of 1e-5. The output layer is created with dtype float32.

    Args:
        input_shape: number of input features (an int; the input layer is
            built with shape ``(input_shape,)``).

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, MSE loss,
        'rmse' and 'mae' metrics). The learning rate is read from the
        notebook-global ``lr``.
    """
    def _hidden_block(width):
        # One Dense -> BatchNorm -> Dropout stage of the given width.
        return [
            layers.Dense(
                units=width,
                activation='relu',
                kernel_initializer='glorot_uniform',
                kernel_regularizer=regularizers.l2(1e-5)),
            layers.BatchNormalization(),
            layers.Dropout(0.2),
        ]

    stack = [layers.Input(shape=(input_shape,))]
    for width in (1024, 496):
        stack.extend(_hidden_block(width))
    stack.append(layers.Dense(units=1, dtype='float32'))

    model = tf.keras.Sequential(stack)

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    loss_fn = tf.keras.losses.MeanSquaredError()
    tracked = [
        metrics.RootMeanSquaredError(name='rmse'),
        metrics.MeanAbsoluteError(name='mae'),
    ]
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=tracked)
    return model

def save_model(x_data, verbose: bool = False) -> Path:
    """Write a freshly built (untrained) model to ``save_model/full_model.keras``.

    The network is sized from the second dimension of ``x_data``. The Keras
    session is cleared before building and again after saving, and an
    existing file at the destination is overwritten.

    Args:
        x_data: 2-D array-like exposing ``.shape``; ``shape[1]`` is the
            feature count.
        verbose: when True, print where the model was saved.

    Returns:
        Path of the saved ``.keras`` file.
    """
    out_dir = Path("save_model")
    destination = out_dir / "full_model.keras"
    out_dir.mkdir(parents=True, exist_ok=True)

    n_features = x_data.shape[1]

    tf.keras.backend.clear_session()
    fresh_model = new_model(n_features)
    fresh_model.save(destination, overwrite=True)
    if verbose:
        print(f"[save_model] Model saved at {destination} with input_dim={n_features}")

    # Release the model before resetting Keras' global state again.
    del fresh_model
    tf.keras.backend.clear_session()
    return destination

In [23]:
def _cv_stats(scores):
    """Return ``(mean, std)`` of the finite values in ``scores``; ``(nan, nan)`` if there are none."""
    finite = np.asarray(scores, dtype=float)
    finite = finite[np.isfinite(finite)]
    if finite.size == 0:
        return np.nan, np.nan
    return np.mean(finite), np.std(finite)


def _parse_metric_lines(stdout_lines):
    """Extract metrics from lines starting with ``R2:``, ``RMSE:`` or ``MAE:``.

    Metrics that never appear stay NaN; if a metric appears more than once,
    the last occurrence wins.
    """
    parsed = {'R2': np.nan, 'RMSE': np.nan, 'MAE': np.nan}
    for line in stdout_lines:
        for key in parsed:
            prefix = f"{key}:"
            if line.startswith(prefix):
                parsed[key] = float(line.split(prefix)[1].strip())
                break
    return parsed


def _run_dnn_subprocess(X_fit, y_fit, X_eval, y_eval, BATCHSIZE, EPOCHS, lr):
    """Run one DNN fit/evaluate cycle in a child process and return its parsed metrics.

    Writes the four exchange files into the working directory, saves a fresh
    model via ``save_model``, launches ``./extra_code/learning_process.py``
    and parses its stdout. Raises ``RuntimeError`` if the child exits with a
    non-zero return code.
    """
    np.save('X_train.npy', X_fit)
    np.save('y_train.npy', y_fit)
    np.save('X_test.npy', X_eval)
    np.save('y_test.npy', y_eval)

    # Drop predictions left over from an earlier run or fold so a failed
    # child can never cause stale predictions to be picked up afterwards.
    stale_predictions = Path("predictions.npy")
    if stale_predictions.exists():
        stale_predictions.unlink()

    save_model(X_fit)

    # Use the kernel's own interpreter so the child runs in the same
    # environment as the notebook; fall back to 'python3' if it is unknown.
    result = subprocess.run([sys.executable or 'python3', './extra_code/learning_process.py',
                             str(BATCHSIZE), str(EPOCHS), str(lr),
                             'X_train.npy', 'y_train.npy',
                             'X_test.npy', 'y_test.npy'],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            text=True,
                            encoding='utf-8')

    print(f"        [DNN] Return code: {result.returncode}")
    if result.stdout:
        print(f"        [DNN] Full stdout:")
        print(result.stdout)
    else:
        print(f"        [DNN] No stdout output")

    stdout_lines = result.stdout.strip().splitlines()
    print(f"        [DNN] Number of stdout lines: {len(stdout_lines)}")

    if result.stderr:
        print(f"        [DNN] stderr: {result.stderr}")

    if result.returncode != 0:
        raise RuntimeError(
            f"learning_process.py exited with return code {result.returncode}")

    parsed = _parse_metric_lines(stdout_lines)
    if np.isnan(parsed['R2']):
        print("        [DNN] Warning: no 'R2:' line found in subprocess output")
    return parsed


def _load_dnn_predictions(expected_len):
    """Load ``predictions.npy`` if present and consistent with the test set, else return None."""
    predictions_file = Path("predictions.npy")
    if not predictions_file.exists():
        return None
    try:
        predictions = np.load(predictions_file)
    except (OSError, ValueError) as e:
        print(f"        [DNN] Could not read {predictions_file}: {e}")
        return None
    if len(predictions) != expected_len:
        print(f"        [DNN] Ignoring {predictions_file}: "
              f"{len(predictions)} predictions for {expected_len} test samples")
        return None
    print(f"        [DNN] Loaded predictions from {predictions_file}")
    return predictions


def run_model_benchmark(X_train, y_train, X_test, y_test,
                          split_type: str,
                          combo: tuple,
                          target_path: Path,
                          n_folds: int = 5,
                          BATCHSIZE: int = 32,
                          EPOCHS: int = 100,
                          lr: float = 0.001,
                          RANDOM_STATE: int = 42):
    """K-fold cross-validate the DNN on the training set, then score it on the test set.

    Every DNN fit runs in a child process (``./extra_code/learning_process.py``).
    Data is exchanged through ``X_train.npy``, ``y_train.npy``, ``X_test.npy``
    and ``y_test.npy`` in the working directory, and metrics are read from
    stdout lines starting with ``R2:``, ``RMSE:`` or ``MAE:``.
    NOTE(review): the child script is not visible here; it presumably loads
    ``save_model/full_model.keras`` and writes ``predictions.npy`` - confirm.

    Args:
        X_train, y_train: training features/targets (anything ``np.asarray`` accepts).
        X_test, y_test: held-out test features/targets.
        split_type: split code; currently unused, kept for interface compatibility.
        combo: fingerprint combination, e.g. ``('morgan', 'maccs')``; currently unused.
        target_path: output directory; currently unused.
        n_folds: number of CV folds.
        BATCHSIZE, EPOCHS, lr: forwarded to the child process as CLI arguments.
        RANDOM_STATE: seed for the shuffled ``KFold``.

    Returns:
        dict mapping model name to a dict with the keys ``RMSE``, ``MAE``,
        ``R2``, ``CV_train_mean``, ``CV_train_std``, ``CV_val_mean``,
        ``CV_val_std`` and ``predictions``. The first three are test-set metrics.

    A run that fails (non-zero exit, exception, or missing metric line) is
    recorded as NaN rather than 0.0, so it cannot pass for a real score.
    The CV mean/std are computed over the successful folds only.
    ``predictions`` is None when no usable prediction file was produced.
    """
    X_tr, X_te = map(np.asarray, (X_train, X_test))
    y_tr, y_te = map(np.asarray, (y_train, y_test))

    models = {
        "DNN": "Tensorflow model (subprocess)",
    }

    # Per-model score storage for the CV folds and the final test run.
    cv_results = {name: {'train_scores': [], 'val_scores': [], 'test_score': None}
                  for name in models}

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)

    print(f"Starting {n_folds}-fold cross-validation...")

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X_tr)):
        print(f"\n--- Fold {fold_idx + 1}/{n_folds} ---")

        X_fold_train, X_fold_val = X_tr[train_idx], X_tr[val_idx]
        y_fold_train, y_fold_val = y_tr[train_idx], y_tr[val_idx]

        for name, mdl in models.items():
            print(f"    Training {name}...")

            if name == "DNN":
                try:
                    # The validation fold plays the role of the "test" files for the child.
                    fold_metrics = _run_dnn_subprocess(
                        X_fold_train, y_fold_train, X_fold_val, y_fold_val,
                        BATCHSIZE, EPOCHS, lr)
                    val_r2 = fold_metrics['R2']
                except Exception as e:
                    print(f"        [DNN] Error: {e}")
                    val_r2 = np.nan
                # NOTE(review): only the evaluation R2 is parsed from the child's stdout,
                # so the "train" score is a copy of the validation score.
                train_r2 = val_r2
                gc.collect()
            else:
                # Traditional ML models: fit a fresh copy so folds do not share state.
                mdl_clone = mdl.__class__(**mdl.get_params())
                mdl_clone.fit(X_fold_train, y_fold_train)
                train_r2 = r2_score(y_fold_train, mdl_clone.predict(X_fold_train))
                val_r2 = r2_score(y_fold_val, mdl_clone.predict(X_fold_val))

            cv_results[name]['train_scores'].append(train_r2)
            cv_results[name]['val_scores'].append(val_r2)

    # Final evaluation: fit on the full training set, score on the test set.
    print("\n--- Final evaluation on test set ---")
    final_results = {}

    for name, mdl in models.items():
        print(f"    Evaluating {name} on test set...")

        predictions = None
        if name == "DNN":
            try:
                test_metrics = _run_dnn_subprocess(
                    X_tr, y_tr, X_te, y_te, BATCHSIZE, EPOCHS, lr)
            except Exception as e:
                print(f"        [DNN] Error in final evaluation: {e}")
                test_metrics = {'R2': np.nan, 'RMSE': np.nan, 'MAE': np.nan}
            else:
                predictions = _load_dnn_predictions(len(y_te))
            gc.collect()
        else:
            mdl.fit(X_tr, y_tr)
            predictions = mdl.predict(X_te)
            test_metrics = {
                'R2': r2_score(y_te, predictions),
                'RMSE': np.sqrt(mean_squared_error(y_te, predictions)),
                'MAE': mean_absolute_error(y_te, predictions),
            }

        cv_results[name]['test_score'] = test_metrics['R2']
        train_mean, train_std = _cv_stats(cv_results[name]['train_scores'])
        val_mean, val_std = _cv_stats(cv_results[name]['val_scores'])
        final_results[name] = {
            'RMSE': test_metrics['RMSE'],
            'MAE': test_metrics['MAE'],
            'R2': test_metrics['R2'],
            'CV_train_mean': train_mean,
            'CV_train_std': train_std,
            'CV_val_mean': val_mean,
            'CV_val_std': val_std,
            'predictions': predictions,
        }

    print("\n" + "="*60)
    print("CROSS-VALIDATION SUMMARY")
    print("="*60)
    for name, results in final_results.items():
        print(f"\n{name}:")
        print(f"  CV Train R2: {results['CV_train_mean']:.4f} ± {results['CV_train_std']:.4f}")
        print(f"  CV Val R2:   {results['CV_val_mean']:.4f} ± {results['CV_val_std']:.4f}")
        print(f"  Test R2:     {results['R2']:.4f}")
        print(f"  Test RMSE:   {results['RMSE']:.4f}")
        print(f"  Test MAE:    {results['MAE']:.4f}")
        n_failed = int(np.sum(~np.isfinite(
            np.asarray(cv_results[name]['val_scores'], dtype=float))))
        if n_failed:
            print(f"  Failed folds: {n_failed}/{len(cv_results[name]['val_scores'])} (excluded from CV statistics)")

    return final_results

In [24]:
# def vis_res(df, ds, split_type):
#     plt.rcParams.update({
#         "grid.linestyle":"--", "grid.alpha":0.7,
#         "axes.edgecolor":"#333333", "axes.linewidth":1.1,
#         "font.size":13
#     })
#     cmap = plt.get_cmap("tab20")
#     colors  = [cmap(i) for i in range(cmap.N)]
#     markers = ["o","s","D","P","X","^","v","<",">","h","8","p","*","H"]
#     from matplotlib.font_manager import FontProperties
#     mono = FontProperties(family="monospace", size=10)

#     fig, ax = plt.subplots(figsize=(6,6), constrained_layout=True)
#     fig.suptitle(f"{ds.upper()} – {split_type.upper()} – Multi-FP", fontsize=18, fontweight="bold")

#     y_true = df["y_truth"].values
#     ax.plot([y_true.min(), y_true.max()],[y_true.min(), y_true.max()],
#             "--", color="#666666", linewidth=1.2, zorder=1)

#     other_cols = [c for c in df.columns if c != "y_truth"]
#     max_len = max(len(c) for c in other_cols)

#     for j, col in enumerate(other_cols):
#         ax.scatter(y_true, df[col].values,
#                    s=55,
#                    marker=markers[j % len(markers)],
#                    color=colors[j % len(colors)],
#                    edgecolors="k",
#                    linewidths=0.6,
#                    alpha=0.85,
#                    label=f"{col:<{max_len}}  R\u00b2={r2_score(y_true, df[col]):>6.3f}")

#     ax.set_xlabel("True"); ax.set_ylabel("Predicted")
#     ax.legend(loc="lower right", prop=mono, fontsize=9)
#     ax.grid(True)

#     plot_dir = target_path / "plots"
#     plot_dir.mkdir(parents=True, exist_ok=True)
#     fname = plot_dir / f"{ds}_{split_type}_multi_fp_compare.png"
#     plt.savefig(fname, dpi=300, bbox_inches="tight"); plt.close()
#     print(f"📈  {fname.name} saved")

In [25]:
# def vis_res(df, ds, split_type, target_path):
#     """Visualization function with CV metrics display"""
#     plt.rcParams.update({
#         "grid.linestyle":"--", "grid.alpha":0.7,
#         "axes.edgecolor":"#333333", "axes.linewidth":1.1,
#         "font.size":13
#     })
#     cmap = plt.get_cmap("tab20")
#     colors  = [cmap(i) for i in range(cmap.N)]
#     markers = ["o","s","D","P","X","^","v","<",">","h","8","p","*","H"]
#     from matplotlib.font_manager import FontProperties
#     mono = FontProperties(family="monospace", size=10)

#     fig, ax = plt.subplots(figsize=(6,6), constrained_layout=True)
#     fig.suptitle(f"{ds.upper()} – {split_type.upper()} – Multi-FP", fontsize=18, fontweight="bold")

#     y_true = df["y_truth"].values
#     ax.plot([y_true.min(), y_true.max()],[y_true.min(), y_true.max()],
#             "--", color="#666666", linewidth=1.2, zorder=1)

#     other_cols = [c for c in df.columns if c != "y_truth"]
#     max_len = max(len(c) for c in other_cols)

#     for j, col in enumerate(other_cols):
#         ax.scatter(y_true, df[col].values,
#                    s=55,
#                    marker=markers[j % len(markers)],
#                    color=colors[j % len(colors)],
#                    edgecolors="k",
#                    linewidths=0.6,
#                    alpha=0.85,
#                    label=f"{col:<{max_len}}  R²={r2_score(y_true, df[col]):>6.3f}")

#     ax.set_xlabel("True"); ax.set_ylabel("Predicted")
#     ax.legend(loc="lower right", prop=mono, fontsize=9)
#     ax.grid(True)

#     # Create folder structure: target_path / "plots" / ds / split_type
#     plot_dir = target_path / "plots" / ds / split_type
#     plot_dir.mkdir(parents=True, exist_ok=True)
    
#     # Save with descriptive filename
#     fname = plot_dir / f"{ds}_{split_type}_multi_fp_compare.png"
#     plt.savefig(fname, dpi=300, bbox_inches="tight")
#     plt.close()
    
#     # Print relative path for clarity
#     print(f"📈  Saved to: plots/{ds}/{split_type}/{fname.name}")

In [None]:
def vis_res(df, ds, split_type, target_path, cv_results=None):
    """Scatter true vs. predicted values for every prediction column and save the figure.

    The plot is written to
    ``target_path / "plots" / ds / split_type / f"{ds}_{split_type}_multi_fp_compare.png"``.
    Note that ``plt.rcParams`` is updated globally, so the styling persists
    for later plots in the same kernel.

    Args:
        df: DataFrame with a ``y_truth`` column; every other column is
            treated as one set of predictions for the same rows.
        ds: dataset name (used in the title and the output path).
        split_type: split type (used in the title and the output path).
        target_path: base ``Path`` under which the ``plots`` tree is created.
        cv_results: optional dict ``{column_name: {'cv_mean': x, 'cv_std': y, ...}}``;
            when given, CV R² is added to the legend labels and a summary
            box is drawn for the columns that have an entry.

    A frame with no prediction columns still produces a figure containing
    only the identity line. The figure is always closed, even if plotting
    or saving fails.
    """
    from matplotlib.font_manager import FontProperties

    plt.rcParams.update({
        "grid.linestyle":"--", "grid.alpha":0.7,
        "axes.edgecolor":"#333333", "axes.linewidth":1.1,
        "font.size":13
    })
    cmap = plt.get_cmap("tab20")
    colors  = [cmap(i) for i in range(cmap.N)]
    markers = ["o","s","D","P","X","^","v","<",">","h","8","p","*","H"]
    mono = FontProperties(family="monospace", size=10)

    y_true = df["y_truth"].values
    other_cols = [c for c in df.columns if c != "y_truth"]
    # default=0 keeps this from raising when there are no prediction columns.
    max_len = max((len(c) for c in other_cols), default=0)

    fig, ax = plt.subplots(figsize=(8,6), constrained_layout=True)  # Wider for more legend info
    try:
        fig.suptitle(f"{ds.upper()} – {split_type.upper()} – Multi-FP", fontsize=18, fontweight="bold")

        # Identity line: perfect predictions fall on it.
        ax.plot([y_true.min(), y_true.max()],[y_true.min(), y_true.max()],
                "--", color="#666666", linewidth=1.2, zorder=1)

        for j, col in enumerate(other_cols):
            test_r2 = r2_score(y_true, df[col])

            # Create label with CV info if available
            if cv_results and col in cv_results:
                cv_mean = cv_results[col].get('cv_mean', 0)
                cv_std = cv_results[col].get('cv_std', 0)
                label = f"{col:<{max_len}}  Test R²={test_r2:>6.3f}  CV R²={cv_mean:.3f}±{cv_std:.3f}"
            else:
                label = f"{col:<{max_len}}  R²={test_r2:>6.3f}"

            ax.scatter(y_true, df[col].values,
                       s=55,
                       marker=markers[j % len(markers)],
                       color=colors[j % len(colors)],
                       edgecolors="k",
                       linewidths=0.6,
                       alpha=0.85,
                       label=label)

        ax.set_xlabel("True"); ax.set_ylabel("Predicted")
        if other_cols:
            # Without labelled artists a legend would only emit a warning.
            ax.legend(loc="lower right", prop=mono, fontsize=9, framealpha=0.95)
        ax.grid(True)

        # Add text box with CV summary for the columns that have CV results.
        if cv_results:
            summary_rows = [
                f"{col}: {cv_results[col].get('cv_mean', 0):.3f}±{cv_results[col].get('cv_std', 0):.3f}\n"
                for col in other_cols if col in cv_results
            ]
            if summary_rows:
                textstr = "Cross-Validation Summary\n"
                textstr += "-" * 25 + "\n"
                textstr += "".join(summary_rows)

                props = dict(boxstyle='round', facecolor='wheat', alpha=0.8)
                ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=9,
                        verticalalignment='top', bbox=props, fontfamily='monospace')

        # Create folder structure: target_path / "plots" / ds / split_type
        plot_dir = target_path / "plots" / ds / split_type
        plot_dir.mkdir(parents=True, exist_ok=True)

        # Save with descriptive filename
        fname = plot_dir / f"{ds}_{split_type}_multi_fp_compare.png"
        fig.savefig(fname, dpi=300, bbox_inches="tight")
    finally:
        # Close this specific figure so repeated calls in a loop do not accumulate figures.
        plt.close(fig)

    # Print relative path for clarity
    print(f"📈  Saved to: plots/{ds}/{split_type}/{fname.name}")

In [26]:
def cleanup_temp_files():
    """Delete leftover scratch ``.npy`` files from the current working directory.

    Looks for a fixed set of file names (train/test arrays and the saved
    predictions), removes each one that exists, and prints one line per
    removed file. Names that are not present are ignored silently.
    """
    # `filter` is lazy, so every existence check still happens immediately
    # before the matching removal, exactly as in a check-then-remove loop.
    leftovers = filter(
        os.path.exists,
        ('X_train.npy', 'y_train.npy', 'X_test.npy', 'y_test.npy', 'predictions.npy'),
    )
    for stale in leftovers:
        os.remove(stale)
        print(f"  Cleaned up: {stale}")

# Result accumulators, filled by the main processing loop:
#   all_metrics - one dict per (dataset, split, fingerprint combo, model)
#   cv_summary  - one dict per (dataset, split, fingerprint combo)
all_metrics, cv_summary = [], []

In [None]:
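# NOTE: Superseded by the next cell, which runs the same loop and additionally
# passes the collected CV results to vis_res. Kept commented out for reference only.
# # Main processing loop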
# for ds in DATASETS:
#     print(f"\n{'='*50}")
#     print(f"Processing dataset: {ds}")
    
#     for sp in SPLITS:
#         print(f"\n  Processing split: {sp}")
#         train_key = f"{ds}_train"
#         test_key = f"{ds}_test"
        
#         try:
#             # Clean up any leftover files
#             cleanup_temp_files()
            
#             # Get y data
#             y_train = y_map[sp][train_key]
#             y_test = y_map[sp][test_key]
#             df_panel = pd.DataFrame({"y_truth": y_test})
            
#             for i, combo in enumerate(FP_COMBOS):
#                 combo_name = "+".join(combo)
#                 print(f"    Processing combo {i+1}/{len(FP_COMBOS)}: {combo_name}")
                
#                 # Get X data from all_combinations
#                 X_train = all_combinations[combo_name][sp][train_key]['combined']
#                 X_test = all_combinations[combo_name][sp][test_key]['combined']
                
#                 print(f"      Data shapes - Train: {X_train.shape}, Test: {X_test.shape}")
                
#                 # Run CV benchmark
#                 res = run_model_benchmark(
#                     X_train, y_train, X_test, y_test,
#                     split_type=sp,
#                     combo=combo,
#                     target_path=target_path / ds / sp,
#                     n_folds=5,
#                     BATCHSIZE=BATCHSIZE,
#                     EPOCHS=EPOCHS,
#                     lr=lr,
#                     RANDOM_STATE=RANDOM_STATE
#                 )
                
#                 # Record detailed metrics including CV results
#                 for mdl, m in res.items():
#                     # Basic metrics
#                     metric_entry = {
#                         "dataset": ds, 
#                         "split": sp,
#                         "fp_combo": combo_name, 
#                         "model": mdl,
#                         "RMSE": m["RMSE"], 
#                         "MAE": m["MAE"], 
#                         "R2": m["R2"],
#                         # CV metrics
#                         "CV_train_R2_mean": m.get("CV_train_mean", np.nan),
#                         "CV_train_R2_std": m.get("CV_train_std", np.nan),
#                         "CV_val_R2_mean": m.get("CV_val_mean", np.nan),
#                         "CV_val_R2_std": m.get("CV_val_std", np.nan)
#                     }
#                     all_metrics.append(metric_entry)
                
#                 # CV summary for analysis
#                 cv_summary.append({
#                     "dataset": ds,
#                     "split": sp,
#                     "fp_combo": combo_name,
#                     "best_model": max(res.items(), key=lambda x: x[1]["R2"])[0],
#                     "best_test_R2": max(res.items(), key=lambda x: x[1]["R2"])[1]["R2"],
#                     "best_cv_val_R2": max(res.items(), key=lambda x: x[1]["CV_val_mean"])[1]["CV_val_mean"]
#                 })
                
#                 # Only plot for selected combinations
#                 if combo_name in PLOT_COMBOS:
#                     # Get best model based on test R2
#                     best_model_name, best_model_res = max(res.items(), key=lambda x: x[1]["R2"])
                    
#                     # Check if predictions are available
#                     if "predictions" in best_model_res and best_model_res["predictions"] is not None:
#                         df_panel[combo_name] = best_model_res["predictions"]
#                         print(f"      Added predictions for {best_model_name}")
#                     else:
#                         # If no predictions, try to load from file (for DNN)
#                         if best_model_name == "DNN":
#                             predictions_file = Path("predictions.npy")
#                             if predictions_file.exists():
#                                 predictions = np.load(predictions_file)
#                                 # Verify shape matches
#                                 if len(predictions) == len(y_test):
#                                     df_panel[combo_name] = predictions
#                                     print(f"      Loaded DNN predictions from {predictions_file}")
#                                 else:
#                                     print(f"      Warning: Predictions shape mismatch. Expected {len(y_test)}, got {len(predictions)}")
#                                     df_panel[combo_name] = np.zeros_like(y_test)
#                             else:
#                                 print(f"      Warning: predictions.npy not found, using zeros for visualization")
#                                 df_panel[combo_name] = np.zeros_like(y_test)
#                         else:
#                             print(f"      Warning: No predictions available for {best_model_name}, using zeros")
#                             df_panel[combo_name] = np.zeros_like(y_test)
            
#             # Create visualization if we have valid predictions
#             print(f"    Creating visualization...")
#             # Check if we have any non-zero predictions
#             non_zero_cols = [col for col in df_panel.columns if col != "y_truth" and not np.all(df_panel[col] == 0)]
#             if non_zero_cols:
#                 vis_res(df_panel, ds, sp, target_path)
#             else:
#                 print(f"    ⚠️  Skipping visualization - all predictions are zeros")
            
#             # Clean up after each combination
#             cleanup_temp_files()
            
#         except KeyError as e:
#             print(f"    ⚠️  Skipping {ds}/{sp}: Key error - {e}")
#             continue
#         except Exception as e:
#             print(f"    ❌ Error in {ds}/{sp}: {e}")
#             import traceback
#             traceback.print_exc()
#             continue



Processing dataset: ws

  Processing split: rm
  Cleaned up: X_train.npy
  Cleaned up: y_train.npy
  Cleaned up: X_test.npy
  Cleaned up: y_test.npy
  Cleaned up: predictions.npy
    Processing combo 1/7: morgan
      Data shapes - Train: (396, 2048), Test: (100, 2048)
Starting 5-fold cross-validation...

--- Fold 1/5 ---
    Training DNN...


I0000 00:00:1751793452.308313   70416 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3586 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


        [DNN] Return code: 0
        [DNN] Full stdout:
INFO: Starting training with args: ['32', '100', '0.001', 'X_train.npy', 'y_train.npy', 'X_test.npy', 'y_test.npy']
INFO: Data loaded - X_tr: (316, 2048), y_tr: (316,), X_val: (80, 2048), y_val: (80,)
INFO: Starting prediction on 80 samples
INFO: Prediction complete - shape: (80,)
R2: 0.634728
RMSE: 1.223780
MAE: 0.945902
MSE: 1.497636
INFO: Predictions saved to predictions.npy
INFO: Verified predictions saved, shape: (80,)

        [DNN] Number of stdout lines: 10
        [DNN] stderr: 2025-07-06 18:17:34.367776: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751793454.388661   70496 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751793454.396038   70496 cuda_blas.cc:1407] Unable to 

In [None]:
# Main processing loop with CV results collection
#
# For every dataset/split pair: benchmark each fingerprint combination with
# run_model_benchmark, append per-model metrics to `all_metrics` and a
# per-combo summary to `cv_summary`, then draw one comparison plot of the
# best model's test predictions for the combos listed in PLOT_COMBOS.
#
# Relies on names defined in earlier cells: DATASETS, SPLITS, FP_COMBOS,
# PLOT_COMBOS, y_map, all_combinations, run_model_benchmark, vis_res,
# cleanup_temp_files, target_path, BATCHSIZE, EPOCHS, lr, RANDOM_STATE,
# all_metrics and cv_summary.
#
# NOTE: all_metrics / cv_summary are created in an earlier cell, so re-running
# this cell without re-running that one appends duplicate records.


def _cv_val_score(model_item):
    """Sort key for ``res.items()``: the model's CV validation mean.

    Returns ``-inf`` when the value is missing or NaN so that ``max`` neither
    raises ``KeyError`` nor becomes order-dependent because of NaN comparisons.
    """
    score = model_item[1].get("CV_val_mean")
    if score is None or pd.isna(score):
        return -np.inf
    return score


def _resolve_test_predictions(model_name, model_res, y_test):
    """Return test-set predictions for the best model, or ``None`` if unavailable.

    Uses ``model_res["predictions"]`` when present and not ``None``. For the
    "DNN" model only, falls back to ``predictions.npy`` in the working
    directory (presumably written by the DNN training step - confirm), and
    accepts it only when its length matches ``y_test``. Prints the same status
    line for each outcome as the inline branches it replaces.
    """
    preds = model_res.get("predictions")
    if preds is not None:
        print(f"      Added predictions for {model_name}")
        return preds

    if model_name != "DNN":
        print(f"      Warning: No predictions available for {model_name}, using zeros")
        return None

    predictions_file = Path("predictions.npy")
    if not predictions_file.exists():
        print(f"      Warning: predictions.npy not found, using zeros for visualization")
        return None

    preds = np.load(predictions_file)
    # Reject a file whose length does not match the test set.
    if len(preds) != len(y_test):
        print(f"      Warning: Predictions shape mismatch. Expected {len(y_test)}, got {len(preds)}")
        return None

    print(f"      Loaded DNN predictions from {predictions_file}")
    return preds


for ds in DATASETS:
    print(f"\n{'='*50}")
    print(f"Processing dataset: {ds}")

    for sp in SPLITS:
        print(f"\n  Processing split: {sp}")
        train_key = f"{ds}_train"
        test_key = f"{ds}_test"

        try:
            # Clean up any leftover files
            cleanup_temp_files()

            # Get y data
            y_train = y_map[sp][train_key]
            y_test = y_map[sp][test_key]
            df_panel = pd.DataFrame({"y_truth": y_test})

            # CV results of the best model per plotted combo, for vis_res
            cv_results_for_vis = {}

            for i, combo in enumerate(FP_COMBOS):
                combo_name = "+".join(combo)
                print(f"    Processing combo {i+1}/{len(FP_COMBOS)}: {combo_name}")

                # Get X data from all_combinations
                X_train = all_combinations[combo_name][sp][train_key]['combined']
                X_test = all_combinations[combo_name][sp][test_key]['combined']

                print(f"      Data shapes - Train: {X_train.shape}, Test: {X_test.shape}")

                # Run CV benchmark
                res = run_model_benchmark(
                    X_train, y_train, X_test, y_test,
                    split_type=sp,
                    combo=combo,
                    target_path=target_path / ds / sp,
                    n_folds=5,
                    BATCHSIZE=BATCHSIZE,
                    EPOCHS=EPOCHS,
                    lr=lr,
                    RANDOM_STATE=RANDOM_STATE
                )

                # Record detailed metrics including CV results
                for mdl, m in res.items():
                    all_metrics.append({
                        "dataset": ds,
                        "split": sp,
                        "fp_combo": combo_name,
                        "model": mdl,
                        "RMSE": m["RMSE"],
                        "MAE": m["MAE"],
                        "R2": m["R2"],
                        # CV metrics (NaN when the benchmark did not report them)
                        "CV_train_R2_mean": m.get("CV_train_mean", np.nan),
                        "CV_train_R2_std": m.get("CV_train_std", np.nan),
                        "CV_val_R2_mean": m.get("CV_val_mean", np.nan),
                        "CV_val_R2_std": m.get("CV_val_std", np.nan)
                    })

                # Pick the winners once and reuse them below.
                best_model_name, best_model_res = max(res.items(), key=lambda item: item[1]["R2"])
                _, best_cv_res = max(res.items(), key=_cv_val_score)

                # CV summary for analysis. A missing "CV_val_mean" yields NaN
                # here instead of a KeyError, which the handler below would
                # misreport as a missing dataset/split and skip the whole split.
                cv_summary.append({
                    "dataset": ds,
                    "split": sp,
                    "fp_combo": combo_name,
                    "best_model": best_model_name,
                    "best_test_R2": best_model_res["R2"],
                    "best_cv_val_R2": best_cv_res.get("CV_val_mean", np.nan)
                })

                # Only plot for selected combinations
                if combo_name not in PLOT_COMBOS:
                    continue

                # Store CV results for visualization
                cv_results_for_vis[combo_name] = {
                    'cv_mean': best_model_res.get("CV_val_mean", 0),
                    'cv_std': best_model_res.get("CV_val_std", 0),
                    'test_r2': best_model_res["R2"]
                }

                predictions = _resolve_test_predictions(best_model_name, best_model_res, y_test)
                if predictions is None:
                    # All-zero placeholder; filtered out again before plotting.
                    predictions = np.zeros_like(y_test)
                df_panel[combo_name] = predictions

            # Create visualization if we have valid predictions
            print(f"    Creating visualization...")
            # Check if we have any non-zero predictions
            non_zero_cols = [col for col in df_panel.columns if col != "y_truth" and not np.all(df_panel[col] == 0)]
            if non_zero_cols:
                # Plot only columns holding real predictions, so all-zero
                # placeholders are not drawn (and scored) as model output.
                vis_res(df_panel[["y_truth"] + non_zero_cols], ds, sp, target_path,
                        cv_results=cv_results_for_vis)
            else:
                print(f"    ⚠️  Skipping visualization - all predictions are zeros")

            # Clean up after each dataset/split
            cleanup_temp_files()

        except KeyError as e:
            print(f"    ⚠️  Skipping {ds}/{sp}: Key error - {e}")
            continue
        except Exception as e:
            print(f"    ❌ Error in {ds}/{sp}: {e}")
            import traceback
            traceback.print_exc()
            continue

In [None]:
# Save results
#
# Writes `all_metrics` (and `cv_summary`, when non-empty) to timestamped CSV
# files under `target_path`, then prints a series of text summaries: record
# counts, the best model per dataset/split/fingerprint combo, the top models
# by test R2, a train-vs-validation overfitting check, and the models with the
# smallest gap between CV validation R2 and test R2.
#
# Relies on `all_metrics`, `cv_summary` and `target_path` from earlier cells.
print("\n" + "="*50)
print("SAVING RESULTS")
print("="*50)

from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Create DataFrame once
metrics_df = pd.DataFrame(all_metrics)

# A frame built from an empty list has no columns at all, so every column
# access below has to be guarded by one of these two flags.
has_metrics = not metrics_df.empty
has_cv = 'CV_val_R2_mean' in metrics_df.columns

# Save with timestamp filename
filename = f"fp_combo_all_metrics_{timestamp}.csv"
metrics_df.to_csv(target_path / filename, index=False)
print(f"💾 Detailed metrics saved to: {filename}")

# Save CV summary with timestamp
if cv_summary:
    cv_summary_filename = f"fp_combo_cv_summary_{timestamp}.csv"
    cv_summary_df = pd.DataFrame(cv_summary)
    cv_summary_df.to_csv(target_path / cv_summary_filename, index=False)
    print(f"💾 CV summary saved to: {cv_summary_filename}")

# Basic summary
if has_metrics:
    print(f"\n✅ All tasks completed successfully")
    print(f"📊 Saved {len(metrics_df)} metric records")
    print(f"📁 Location: {target_path}")
    print(f"\nSummary:")
    print(f"  - Datasets: {metrics_df['dataset'].nunique()}")
    print(f"  - Splits: {metrics_df['split'].nunique()}")
    print(f"  - FP Combinations: {metrics_df['fp_combo'].nunique()}")
    print(f"  - Models: {metrics_df['model'].nunique()}")
else:
    # Previously this path crashed with KeyError on metrics_df['dataset'].
    print(f"\n⚠️  No metric records were collected - nothing to summarise")
    print(f"📁 Location: {target_path}")

# CV-specific summary
if has_cv:
    n_folds = 5  # must match the n_folds passed to run_model_benchmark
    # Count the dataset/split/combo groups that actually produced records;
    # multiplying the unique counts overcounts when some groups were skipped.
    n_cv_groups = len(metrics_df[['dataset', 'split', 'fp_combo']].drop_duplicates())
    print(f"\nCross-Validation Summary:")
    print(f"  - Number of folds: {n_folds}")
    print(f"  - Total CV runs: {n_cv_groups * n_folds}")

# Print best models summary
print("\n" + "="*50)
print("BEST MODELS SUMMARY")
print("="*50)

if has_metrics:
    # Rows without a test R2 cannot win; dropping them first keeps idxmax
    # well-defined for groups that would otherwise be all-NaN.
    scored_df = metrics_df.dropna(subset=['R2'])
    best_idx_by_group = scored_df.groupby(['dataset', 'split', 'fp_combo'])['R2'].idxmax()
    for (best_ds, best_sp, best_fp), idx in best_idx_by_group.items():
        best_row = metrics_df.loc[idx]
        print(f"\n{best_ds} / {best_sp} / {best_fp}:")
        print(f"  Best Model: {best_row['model']}")
        print(f"  Test R2: {best_row['R2']:.4f}")
        if 'CV_val_R2_mean' in best_row and not pd.isna(best_row['CV_val_R2_mean']):
            print(f"  CV Val R2: {best_row['CV_val_R2_mean']:.4f} ± {best_row['CV_val_R2_std']:.4f}")

# Best performing models overall
print(f"\nTop 5 Best Performing Models (by Test R2):")
if has_metrics:
    top_cols = ['dataset', 'split', 'fp_combo', 'model', 'R2']
    if has_cv:
        top_cols.append('CV_val_R2_mean')
    # nlargest returns fewer rows when fewer than 5 exist, so short runs
    # still get a listing instead of an empty section.
    top_models = metrics_df.nlargest(5, 'R2')[top_cols]

    for _, row in top_models.iterrows():
        output = f"  {row['dataset']}/{row['split']}/{row['fp_combo']} - {row['model']}: R2={row['R2']:.4f}"
        if 'CV_val_R2_mean' in row and not pd.isna(row['CV_val_R2_mean']):
            output += f" (CV Val: {row['CV_val_R2_mean']:.4f})"
        print(output)

# Additional analysis: Overfitting detection
OVERFIT_GAP_THRESHOLD = 0.1  # CV train R2 minus CV val R2 above this is flagged

if 'CV_train_R2_mean' in metrics_df.columns and has_cv:
    print("\n" + "="*50)
    print("OVERFITTING ANALYSIS")
    print("="*50)

    overfitting_found = False
    for _, row in metrics_df.iterrows():
        train_cv = row['CV_train_R2_mean']
        val_cv = row['CV_val_R2_mean']
        if pd.isna(train_cv) or pd.isna(val_cv):
            continue

        overfit_score = train_cv - val_cv
        generalization_gap = val_cv - row['R2']

        if overfit_score > OVERFIT_GAP_THRESHOLD:
            overfitting_found = True
            print(f"\n⚠️  Potential overfitting detected:")
            print(f"   {row['dataset']} / {row['split']} / {row['fp_combo']} / {row['model']}")
            print(f"   Train-Val gap: {overfit_score:.4f}")
            print(f"   Val-Test gap: {generalization_gap:.4f}")

    if not overfitting_found:
        print("\n✅ No significant overfitting detected")

# Check for models with good generalization
if has_cv:
    print(f"\nModels with Best Generalization (small CV-Test gap):")
    # NOTE: this adds a column to metrics_df after the CSV above was written,
    # so the saved file does not contain 'generalization_gap'.
    metrics_df['generalization_gap'] = (metrics_df['CV_val_R2_mean'] - metrics_df['R2']).abs()

    # Filter out NaN values
    valid_gen = metrics_df.dropna(subset=['generalization_gap'])

    if not valid_gen.empty:
        good_gen = valid_gen.nsmallest(5, 'generalization_gap')[
            ['dataset', 'split', 'fp_combo', 'model', 'R2', 'CV_val_R2_mean', 'generalization_gap']
        ]
        for _, row in good_gen.iterrows():
            print(f"  {row['dataset']}/{row['split']}/{row['fp_combo']} - {row['model']}: "
                  f"Gap={row['generalization_gap']:.4f} (Test R2={row['R2']:.4f}, CV Val={row['CV_val_R2_mean']:.4f})")

print("\n✅ Pipeline completed!")