# xgb for AM-I, AM-II, AM-III

In [1]:
"""
XGBoost + Optuna 5-fold CV (R¬≤ mean) + iPhone Style Plotting
Sequential processing for AM-I, AM-II, AM-III datasets
"""
import os
import pandas as pd
import numpy as np
import optuna
import joblib
import xgboost as xgb
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from joblib import Parallel, delayed
from sklearn.model_selection import learning_curve
# =========================================================
# 1. Configuration
# =========================================================
DATA_FOLDER = './train_test_split'  # Modified: data files are in train_test_split folder
OUTPUT_ROOT = os.path.join('./', 'xgb-models')
os.makedirs(OUTPUT_ROOT, exist_ok=True)

FEATURE_COLS = ['MolWt', 'logP', 'TPSA', 'H_bond_donors', 'H_bond_acceptors']
FP_COLS   = [f'col{i}'   for i in range(823)]
MG_COLS   = [f'fp_{i}'   for i in range(1024)]
ALL_FEATURES = FEATURE_COLS + FP_COLS + MG_COLS
TARGET_COL   = 'UV_RT-s'

# iPhone Style Color Palette
IPHONE_COLORS = {
    'scatter': '#007AFF',
    'line':    '#AEAEB2',
    'text':    '#000000'
}

# =========================================================
# 2. Utility Functions
# =========================================================
def evaluate(y_true, y_pred):
    """Calculate evaluation metrics."""
    return {
        'R2':  r2_score(y_true, y_pred),
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred))
    }

def iphone_style_ax(ax):
    """Apply iPhone-style aesthetics to matplotlib axes."""
    ax.tick_params(axis='both', direction='out', length=6, width=2, labelsize=16)
    for spine in ['top', 'right', 'bottom', 'left']:
        ax.spines[spine].set_visible(True)
    ax.grid(False)

def plot_scatter(y_true, y_pred, save_path):
    """Create scatter plot of true vs predicted values."""
    plt.figure(figsize=(6, 6))
    ax = plt.gca()
    iphone_style_ax(ax)
    plt.scatter(y_true, y_pred, alpha=0.8, s=70, color=IPHONE_COLORS['scatter'])
    lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
    plt.plot(lims, lims, linestyle='--', color=IPHONE_COLORS['line'], linewidth=3)

    r2  = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    plt.text(0.05, 0.95, f"R¬≤ = {r2:.3g}\nMAE = {mae:.3g}",
             transform=ax.transAxes, va='top', fontsize=16, color=IPHONE_COLORS['text'])

    plt.xlabel("True Retention Time (s)", fontsize=17)
    plt.ylabel("Predicted Retention Time (s)", fontsize=17)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300)
    plt.close()

def plot_residuals(y_true, y_pred, save_path):
    """Create residual plot."""
    residuals = y_pred - y_true
    plt.figure(figsize=(6, 6))
    ax = plt.gca()
    iphone_style_ax(ax)
    plt.scatter(y_pred, residuals, alpha=0.8, s=70, color=IPHONE_COLORS['scatter'])
    plt.axhline(y=0, linestyle='--', color=IPHONE_COLORS['line'], linewidth=3)

    r2  = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    plt.text(0.05, 0.95, f"R¬≤ = {r2:.3g}\nMAE = {mae:.3g}",
             transform=ax.transAxes, va='top', fontsize=16, color=IPHONE_COLORS['text'])

    plt.xlabel("Predicted Retention Time (s)", fontsize=17)
    plt.ylabel("Residuals (Predicted - True)", fontsize=17)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300)
    plt.close()

def plot_learning_curve(train_sizes, train_scores, val_scores, save_path):
    """Plot learning curve."""
    plt.figure(figsize=(6, 6))
    ax = plt.gca()
    iphone_style_ax(ax)

    plt.plot(train_sizes, np.mean(train_scores, axis=1), 'o-',
             color=IPHONE_COLORS['scatter'], linewidth=3, label='Train R¬≤')
    plt.plot(train_sizes, np.mean(val_scores, axis=1), 'o-',
             color=IPHONE_COLORS['line'], linewidth=3, label='Val R¬≤')

    plt.xlabel('Training examples', fontsize=17)
    plt.ylabel('R¬≤', fontsize=17)
    plt.title('Learning Curve (XGBoost)', fontsize=17)
    plt.legend(fontsize=14)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300)
    plt.close()

def check_data_files(dataset_name):
    """Check if data files exist and their structure."""
    train_file = os.path.join(DATA_FOLDER, f"{dataset_name}_train.csv")
    test_file = os.path.join(DATA_FOLDER, f"{dataset_name}_test.csv")
    
    print(f"\nüîç Checking data files for {dataset_name}:")
    print(f"   Train file: {train_file}")
    print(f"   Test file: {test_file}")
    
    if not os.path.exists(train_file):
        print(f"   ‚ùå Train file not found!")
        return False
    if not os.path.exists(test_file):
        print(f"   ‚ùå Test file not found!")
        return False
    
    # Check file structure
    try:
        train_df = pd.read_csv(train_file, nrows=1)
        test_df = pd.read_csv(test_file, nrows=1)
        
        print(f"   ‚úÖ Files found. Checking structure...")
        print(f"   Train shape: {pd.read_csv(train_file).shape}")
        print(f"   Test shape: {pd.read_csv(test_file).shape}")
        
        # Check target column
        if TARGET_COL not in train_df.columns:
            print(f"   ‚ùå Target column '{TARGET_COL}' not found in train data!")
            print(f"   Available columns: {list(train_df.columns[:5])}...")
            return False
        
        # Check some feature columns
        missing_features = [col for col in ALL_FEATURES[:10] if col not in train_df.columns]
        if missing_features:
            print(f"   ‚ö†Ô∏è  Some features missing: {missing_features[:5]}...")
        
        return True
    except Exception as e:
        print(f"   ‚ùå Error reading files: {str(e)}")
        return False

# =========================================================
# 3. Main Processing Function
# =========================================================
def process_dataset(dataset_name):
    """Process a single dataset with XGBoost and Optuna optimization."""
    train_file = os.path.join(DATA_FOLDER, f"{dataset_name}_train.csv")
    test_file  = os.path.join(DATA_FOLDER, f"{dataset_name}_test.csv")
    
    # First check if files exist
    if not check_data_files(dataset_name):
        print(f"[SKIP] Cannot process {dataset_name} due to missing or invalid data files")
        return None

    print(f"\nüöÄ Processing dataset: {dataset_name}")
    output_dir = os.path.join(OUTPUT_ROOT, dataset_name)
    os.makedirs(output_dir, exist_ok=True)

    # ---------- Load Data ----------
    print(f"   üì• Loading data from {DATA_FOLDER}...")
    train_df = pd.read_csv(train_file)
    test_df  = pd.read_csv(test_file)
    
    # Display data info
    print(f"   üìä Data loaded:")
    print(f"     Train set: {len(train_df)} samples, {len(train_df.columns)} columns")
    print(f"     Test set: {len(test_df)} samples, {len(test_df.columns)} columns")
    print(f"     Target range (train): {train_df[TARGET_COL].min():.2f} - {train_df[TARGET_COL].max():.2f}")
    print(f"     Target range (test): {test_df[TARGET_COL].min():.2f} - {test_df[TARGET_COL].max():.2f}")

    X_train = train_df[ALL_FEATURES]
    y_train = train_df[TARGET_COL]
    X_test  = test_df[ALL_FEATURES]
    y_test  = test_df[TARGET_COL]

    dtrain_full = xgb.DMatrix(X_train, label=y_train)
    dtest       = xgb.DMatrix(X_test)

    # ---------- Optuna Hyperparameter Optimization ----------
    print(f"   üîç Optimizing hyperparameters for {dataset_name}...")
    def objective(trial):
        params = {
            'max_depth':        trial.suggest_int('max_depth', 3, 10),
            'learning_rate':    trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'subsample':        trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'objective':        'reg:squarederror',
            'tree_method':      'hist',
            'eval_metric':      'rmse',
            'seed':             42
        }
        cv = xgb.cv(params, dtrain_full,
                    num_boost_round=1000,
                    nfold=5,
                    early_stopping_rounds=50,
                    metrics='rmse',
                    seed=42,
                    verbose_eval=False)
        # Calculate R¬≤ from RMSE: R¬≤ = 1 - RMSE¬≤ / Var(y)
        rmse = cv['test-rmse-mean'].iloc[-1]
        var_y = np.var(y_train)
        r2 = 1 - (rmse ** 2) / var_y
        return r2

    study = optuna.create_study(direction='maximize',
                                sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective, n_trials=30, n_jobs=26)

    best_params = study.best_params
    final_params = {
        'max_depth':        best_params['max_depth'],
        'learning_rate':    best_params['learning_rate'],
        'subsample':        best_params['subsample'],
        'colsample_bytree': best_params['colsample_bytree'],
        'min_child_weight': best_params['min_child_weight'],
        'objective':        'reg:squarederror',
        'tree_method':      'hist',
        'eval_metric':      'rmse',
        'seed':             42
    }
    
    # Save Optuna trial results
    optuna_log_df = pd.DataFrame([(t.number, t.value, t.params) for t in study.trials],
                                 columns=['trial', 'val_r2_mean', 'params'])
    optuna_log_df.to_csv(os.path.join(output_dir, f"{dataset_name}_optuna_log.csv"), index=False)
    print(f"   ‚úÖ Optuna optimization completed. Best R¬≤: {study.best_value:.4f}")

    # ---------- Train Final XGBoost Model ----------
    print(f"   üèãÔ∏è Training final model for {dataset_name}...")
    evals_result = {}
    final_model = xgb.train(
        final_params,
        dtrain_full,
        num_boost_round=1000,
        evals=[(dtrain_full, 'train')],
        evals_result=evals_result,
        verbose_eval=False
    )
    joblib.dump(final_model, os.path.join(output_dir, f"{dataset_name}_final_model.pkl"))

    # ---------- Predictions ----------
    print(f"   üìä Making predictions for {dataset_name}...")
    y_pred       = final_model.predict(dtest)
    y_train_pred = final_model.predict(dtrain_full)

    train_metrics = evaluate(y_train, y_train_pred)
    test_metrics  = evaluate(y_test,  y_pred)

    # Save predictions
    pd.DataFrame({'true': y_test, 'predicted': y_pred})\
      .to_csv(os.path.join(output_dir, f"{dataset_name}_test_predictions.csv"), index=False)

    # Save evaluation metrics
    pd.DataFrame([train_metrics, test_metrics], index=['train', 'test'])\
      .to_csv(os.path.join(output_dir, f"{dataset_name}_evaluation_summary.csv"))

    # ---------- Generate Plots ----------
    print(f"   üé® Generating plots for {dataset_name}...")
    plot_scatter(y_test, y_pred,
                 os.path.join(output_dir, f"{dataset_name}_scatter.png"))
    plot_residuals(y_test, y_pred,
                   os.path.join(output_dir, f"{dataset_name}_residuals.png"))

    # Learning Curve
    train_sizes, train_scores, val_scores = learning_curve(
        xgb.XGBRegressor(**{k: v for k, v in final_params.items() if k != 'objective'}),
        X_train, y_train,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring='r2', n_jobs=26,
        train_sizes=np.linspace(0.1, 1.0, 5))
    plot_learning_curve(train_sizes, train_scores, val_scores,
                        os.path.join(output_dir, f"{dataset_name}_learning_curve.png"))

    # ---------- Feature Importance ----------
    print(f"   üìà Calculating feature importance for {dataset_name}...")
    importance = final_model.get_score(importance_type='gain')
    importance_df = pd.DataFrame({
        'feature': list(importance.keys()),
        'importance': list(importance.values())
    }).sort_values('importance', ascending=False)
    importance_df.to_csv(os.path.join(output_dir, f"{dataset_name}_feature_importance.csv"), index=False)

    # Plot top 20 features
    top_n = 20
    if len(importance_df) > top_n:
        plot_df = importance_df.head(top_n)
    else:
        plot_df = importance_df
    
    plt.figure(figsize=(10, 8))
    ax = plt.gca()
    iphone_style_ax(ax)
    colors = plt.cm.Blues(np.linspace(0.3, 1, len(plot_df)))
    ax.barh(range(len(plot_df)), plot_df['importance'], color=colors)
    ax.set_yticks(range(len(plot_df)))
    ax.set_yticklabels(plot_df['feature'], fontsize=10)
    ax.set_xlabel('Feature Importance (Gain)', fontsize=14)
    ax.set_title(f'Top {len(plot_df)} Feature Importance - {dataset_name}', fontsize=16)
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{dataset_name}_feature_importance.png"), dpi=300)
    plt.close()

    print(f"‚úÖ Completed: {dataset_name}")
    print(f"   Test R¬≤: {test_metrics['R2']:.4f}, MAE: {test_metrics['MAE']:.4f}, RMSE: {test_metrics['RMSE']:.4f}")
    print(f"   Results saved to: {output_dir}\n")
    
    # Return metrics for summary
    return {
        'dataset': dataset_name,
        'best_r2': study.best_value,
        'test_r2': test_metrics['R2'],
        'test_mae': test_metrics['MAE'],
        'test_rmse': test_metrics['RMSE'],
        'train_r2': train_metrics['R2'],
        'n_train': len(train_df),
        'n_test': len(test_df)
    }

# =========================================================
# 4. Sequential Processing for AM Datasets
# =========================================================
if __name__ == "__main__":
    # Define target datasets for sequential processing
    TARGET_DATASETS = ['AM-I-filtered_with_labels_k4', 'AM-II-filtered_with_labels_k3', 'AM-III-filtered_with_labels_k4']
    
    print("=" * 60)
    print("XGBoost Sequential Processing for AM Datasets")
    print("=" * 60)
    print(f"üìÅ Data folder: {os.path.abspath(DATA_FOLDER)}")
    print(f"üìÅ Output folder: {os.path.abspath(OUTPUT_ROOT)}")
    
    # Check if data folder exists
    if not os.path.exists(DATA_FOLDER):
        print(f"\n‚ùå ERROR: Data folder '{DATA_FOLDER}' does not exist!")
        print(f"Please create the folder and place your data files there.")
        print(f"Expected files: AM-I_train.csv, AM-I_test.csv, etc.")
        exit(1)
    
    # List files in data folder
    print(f"\nüìã Files in data folder:")
    data_files = os.listdir(DATA_FOLDER)
    csv_files = [f for f in data_files if f.endswith('.csv')]
    for file in sorted(csv_files):
        file_path = os.path.join(DATA_FOLDER, file)
        file_size = os.path.getsize(file_path) / (1024*1024)  # MB
        print(f"   {file} ({file_size:.1f} MB)")
    
    # Sequential processing (no parallelization)
    all_metrics = []
    for dataset in TARGET_DATASETS:
        print(f"\n{'='*40}")
        try:
            metrics = process_dataset(dataset)
            if metrics:
                all_metrics.append(metrics)
        except Exception as e:
            print(f"‚ùå Error processing {dataset}: {str(e)}")
            import traceback
            traceback.print_exc()
    
    # Print summary table
    if all_metrics:
        print("\n" + "=" * 60)
        print("SUMMARY OF RESULTS")
        print("=" * 60)
        summary_df = pd.DataFrame(all_metrics)
        # Reorder columns for better readability
        summary_df = summary_df[['dataset', 'n_train', 'n_test', 'best_r2', 'train_r2', 'test_r2', 'test_mae', 'test_rmse']]
        print(summary_df.to_string(index=False, float_format=lambda x: f"{x:.4f}"))
        
        # Save summary
        summary_df.to_csv(os.path.join(OUTPUT_ROOT, 'summary_results.csv'), index=False)
        print(f"\nüìã Summary saved to: {os.path.join(OUTPUT_ROOT, 'summary_results.csv')}")
        
        # Create a simple visualization of results comparison
        plt.figure(figsize=(10, 6))
        ax = plt.gca()
        iphone_style_ax(ax)
        
        x_pos = np.arange(len(all_metrics))
        width = 0.35
        
        train_r2 = [m['train_r2'] for m in all_metrics]
        test_r2 = [m['test_r2'] for m in all_metrics]
        
        ax.bar(x_pos - width/2, train_r2, width, label='Train R¬≤', color=IPHONE_COLORS['scatter'])
        ax.bar(x_pos + width/2, test_r2, width, label='Test R¬≤', color=IPHONE_COLORS['line'])
        
        ax.set_xlabel('Dataset', fontsize=14)
        ax.set_ylabel('R¬≤ Score', fontsize=14)
        ax.set_title('Model Performance Comparison', fontsize=16)
        ax.set_xticks(x_pos)
        ax.set_xticklabels([m['dataset'] for m in all_metrics])
        ax.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUT_ROOT, 'performance_comparison.png'), dpi=300)
        plt.close()
        print(f"üìä Comparison plot saved to: {os.path.join(OUTPUT_ROOT, 'performance_comparison.png')}")
    
    print("\n‚ú® All processing completed! ‚ú®")

  from .autonotebook import tqdm as notebook_tqdm


XGBoost Sequential Processing for AM Datasets
üìÅ Data folder: /home/xuxianyan/uplc/uplc-260116/train_test_split
üìÅ Output folder: /home/xuxianyan/uplc/uplc-260116/xgb-models

üìã Files in data folder:
   AM-I-filtered_with_labels_k4_test.csv (4.9 MB)
   AM-I-filtered_with_labels_k4_train.csv (43.6 MB)
   AM-II-filtered_with_labels_k3_test.csv (1.3 MB)
   AM-II-filtered_with_labels_k3_train.csv (11.8 MB)
   AM-III-filtered_with_labels_k4_test.csv (0.9 MB)
   AM-III-filtered_with_labels_k4_train.csv (7.8 MB)


üîç Checking data files for AM-I-filtered_with_labels_k4:
   Train file: ./train_test_split/AM-I-filtered_with_labels_k4_train.csv
   Test file: ./train_test_split/AM-I-filtered_with_labels_k4_test.csv
   ‚úÖ Files found. Checking structure...
   Train shape: (6118, 1857)
   Test shape: (681, 1857)

üöÄ Processing dataset: AM-I-filtered_with_labels_k4
   üì• Loading data from ./train_test_split...
   üìä Data loaded:
     Train set: 6118 samples, 1857 columns
     Test set

[I 2026-01-19 17:59:40,509] A new study created in memory with name: no-name-cbacaf11-305a-4ae9-b637-df143ac23a64


   üîç Optimizing hyperparameters for AM-I-filtered_with_labels_k4...


[I 2026-01-19 18:48:23,683] Trial 0 finished with value: 0.865777075657072 and parameters: {'max_depth': 8, 'learning_rate': 0.2501250251882702, 'subsample': 0.9387194583773132, 'colsample_bytree': 0.7464807021374643, 'min_child_weight': 1}. Best is trial 0 with value: 0.865777075657072.
[I 2026-01-19 18:51:22,647] Trial 14 finished with value: 0.8800862141696306 and parameters: {'max_depth': 9, 'learning_rate': 0.13664501539596258, 'subsample': 0.6764948666394592, 'colsample_bytree': 0.8606829830248302, 'min_child_weight': 5}. Best is trial 14 with value: 0.8800862141696306.
[I 2026-01-19 18:58:49,232] Trial 16 finished with value: 0.887855892703365 and parameters: {'max_depth': 7, 'learning_rate': 0.16439288401916352, 'subsample': 0.913790590824245, 'colsample_bytree': 0.8197820036760587, 'min_child_weight': 8}. Best is trial 16 with value: 0.887855892703365.
[I 2026-01-19 19:10:43,380] Trial 11 finished with value: 0.8744509370649503 and parameters: {'max_depth': 3, 'learning_rate':

   ‚úÖ Optuna optimization completed. Best R¬≤: 0.8974
   üèãÔ∏è Training final model for AM-I-filtered_with_labels_k4...
   üìä Making predictions for AM-I-filtered_with_labels_k4...
   üé® Generating plots for AM-I-filtered_with_labels_k4...
   üìà Calculating feature importance for AM-I-filtered_with_labels_k4...
‚úÖ Completed: AM-I-filtered_with_labels_k4
   Test R¬≤: 0.9101, MAE: 2.9681, RMSE: 3.8937
   Results saved to: ./xgb-models/AM-I-filtered_with_labels_k4



üîç Checking data files for AM-II-filtered_with_labels_k3:
   Train file: ./train_test_split/AM-II-filtered_with_labels_k3_train.csv
   Test file: ./train_test_split/AM-II-filtered_with_labels_k3_test.csv
   ‚úÖ Files found. Checking structure...
   Train shape: (1650, 1857)
   Test shape: (186, 1857)

üöÄ Processing dataset: AM-II-filtered_with_labels_k3
   üì• Loading data from ./train_test_split...
   üìä Data loaded:
     Train set: 1650 samples, 1857 columns
     Test set: 186 samples, 1857 columns
     Tar

[I 2026-01-19 19:30:05,064] A new study created in memory with name: no-name-a1e1d5cb-dc01-476d-b9e6-ed6502d813ce


   üîç Optimizing hyperparameters for AM-II-filtered_with_labels_k3...


[I 2026-01-19 19:54:08,248] Trial 2 finished with value: 0.8252444199166731 and parameters: {'max_depth': 6, 'learning_rate': 0.2844082399756833, 'subsample': 0.7478007350747022, 'colsample_bytree': 0.6238916024151742, 'min_child_weight': 6}. Best is trial 2 with value: 0.8252444199166731.
[I 2026-01-19 19:56:06,227] Trial 24 finished with value: 0.8195291225749841 and parameters: {'max_depth': 8, 'learning_rate': 0.20208034607826383, 'subsample': 0.7702147918124977, 'colsample_bytree': 0.7226459507136536, 'min_child_weight': 2}. Best is trial 2 with value: 0.8252444199166731.
[I 2026-01-19 20:01:40,414] Trial 19 finished with value: 0.8302393812976133 and parameters: {'max_depth': 9, 'learning_rate': 0.19811563776841498, 'subsample': 0.611698245535932, 'colsample_bytree': 0.9856117256271762, 'min_child_weight': 3}. Best is trial 19 with value: 0.8302393812976133.
[I 2026-01-19 20:05:57,072] Trial 4 finished with value: 0.8489726765102503 and parameters: {'max_depth': 4, 'learning_rate

   ‚úÖ Optuna optimization completed. Best R¬≤: 0.8656
   üèãÔ∏è Training final model for AM-II-filtered_with_labels_k3...
   üìä Making predictions for AM-II-filtered_with_labels_k3...
   üé® Generating plots for AM-II-filtered_with_labels_k3...
   üìà Calculating feature importance for AM-II-filtered_with_labels_k3...
‚úÖ Completed: AM-II-filtered_with_labels_k3
   Test R¬≤: 0.8759, MAE: 2.4358, RMSE: 3.2216
   Results saved to: ./xgb-models/AM-II-filtered_with_labels_k3



üîç Checking data files for AM-III-filtered_with_labels_k4:
   Train file: ./train_test_split/AM-III-filtered_with_labels_k4_train.csv
   Test file: ./train_test_split/AM-III-filtered_with_labels_k4_test.csv
   ‚úÖ Files found. Checking structure...
   Train shape: (1089, 1857)
   Test shape: (123, 1857)

üöÄ Processing dataset: AM-III-filtered_with_labels_k4
   üì• Loading data from ./train_test_split...
   üìä Data loaded:
     Train set: 1089 samples, 1857 columns
     Test set: 123 samples, 1857 column

[I 2026-01-19 20:52:39,094] A new study created in memory with name: no-name-515c7fa7-d2cd-45aa-b22c-6bc1806da482


   üîç Optimizing hyperparameters for AM-III-filtered_with_labels_k4...


[I 2026-01-19 21:11:24,647] Trial 9 finished with value: 0.8216915614314008 and parameters: {'max_depth': 5, 'learning_rate': 0.2727249808080775, 'subsample': 0.9543166694603076, 'colsample_bytree': 0.9669208656075335, 'min_child_weight': 4}. Best is trial 9 with value: 0.8216915614314008.
[I 2026-01-19 21:36:35,732] Trial 10 finished with value: 0.8418245356760308 and parameters: {'max_depth': 6, 'learning_rate': 0.09090139928025558, 'subsample': 0.6573879180990411, 'colsample_bytree': 0.9608457970384582, 'min_child_weight': 6}. Best is trial 10 with value: 0.8418245356760308.
[I 2026-01-19 21:53:22,134] Trial 20 finished with value: 0.8203811366395986 and parameters: {'max_depth': 10, 'learning_rate': 0.10123508130841984, 'subsample': 0.9985230564098673, 'colsample_bytree': 0.8789375461759092, 'min_child_weight': 10}. Best is trial 10 with value: 0.8418245356760308.
[I 2026-01-19 22:01:49,323] Trial 12 finished with value: 0.7835266667393814 and parameters: {'max_depth': 3, 'learning

   ‚úÖ Optuna optimization completed. Best R¬≤: 0.8448
   üèãÔ∏è Training final model for AM-III-filtered_with_labels_k4...
   üìä Making predictions for AM-III-filtered_with_labels_k4...
   üé® Generating plots for AM-III-filtered_with_labels_k4...
   üìà Calculating feature importance for AM-III-filtered_with_labels_k4...
‚úÖ Completed: AM-III-filtered_with_labels_k4
   Test R¬≤: 0.8900, MAE: 3.3834, RMSE: 4.8383
   Results saved to: ./xgb-models/AM-III-filtered_with_labels_k4


SUMMARY OF RESULTS
                       dataset  n_train  n_test  best_r2  train_r2  test_r2  test_mae  test_rmse
  AM-I-filtered_with_labels_k4     6118     681   0.8974    0.9810   0.9101    2.9681     3.8937
 AM-II-filtered_with_labels_k3     1650     186   0.8656    0.9968   0.8759    2.4358     3.2216
AM-III-filtered_with_labels_k4     1089     123   0.8448    0.9907   0.8900    3.3834     4.8383

üìã Summary saved to: ./xgb-models/summary_results.csv
üìä Comparison plot saved to: ./xgb-models/pe