# LightGBM models for AM-II and AM-III obtained by transfer learning from AM-I

In [None]:
import os
import json
import re
import joblib
import logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgb
from lightgbm import early_stopping
import optuna
from optuna.pruners import MedianPruner
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Request 10 LightGBM threads through an environment variable.
# NOTE(review): confirm that LightGBM actually reads LIGHTGBM_NUM_THREADS (and
# that setting it after `import lightgbm` is early enough); the estimator's
# `n_jobs` parameter is the usual way to limit threads -- TODO verify.
os.environ['LIGHTGBM_NUM_THREADS'] = '10'

# -------------------- Global Settings --------------------
# Single seed shared by NumPy, the K-fold splitter and the LightGBM estimators.
SEED = 42
np.random.seed(SEED)

# Directory holding the "<tag>_train.csv" / "<tag>_test.csv" splits
DATA_DIR = "./train_test_split"

# Directory containing the pretrained (AM-I) model artefacts
PRETRAIN_MODEL_DIR = "./lgb-models"

# Fine-tuned models, plots, metrics and the log file are written to ./lgb-TL,
# which is created at import time if it does not exist yet.
OUTPUT_FOLDER = './'
MODEL_DIR = os.path.join(OUTPUT_FOLDER, 'lgb-TL')
os.makedirs(MODEL_DIR, exist_ok=True)

# -------------------- Logging Configuration --------------------
# The log file lives inside ./lgb-TL and is opened in append mode, so repeated
# runs accumulate in the same file.
log_file_path = os.path.join(MODEL_DIR, "lgb_transfer_from_AM-I.log")

logging.basicConfig(
    filename=log_file_path,
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    filemode="a"
)

# Dataset tags to fine-tune on; each tag is the file-name prefix of its
# train/test CSV pair inside DATA_DIR.
TARGET_DATASETS = [
    "AM-III-filtered_with_labels_k4",
    "AM-II-filtered_with_labels_k3",
]

# Tag of the source (pretraining) dataset.
PRETRAIN_TAG = "AM-I-filtered_with_labels_k4"
# Keep original tag without replacing special characters
PRETRAIN_SAFE_TAG = PRETRAIN_TAG

# Pretrained model paths (model, feature list, imputer, and the metrics JSON
# from which "best_params" is read when available)
PRETRAIN_MODEL_PATH = os.path.join(PRETRAIN_MODEL_DIR, f"{PRETRAIN_SAFE_TAG}_model.joblib")
PRETRAIN_FEATURE_PATH = os.path.join(PRETRAIN_MODEL_DIR, f"{PRETRAIN_SAFE_TAG}_feature_list.pkl")
PRETRAIN_IMPUTER_PATH = os.path.join(PRETRAIN_MODEL_DIR, f"{PRETRAIN_SAFE_TAG}_imputer.pkl")
PRETRAIN_METRICS_PATH = os.path.join(PRETRAIN_MODEL_DIR, f"{PRETRAIN_SAFE_TAG}_metrics.json")

# Hex colour palette shared by the diagnostic plots.
IPHONE_COLORS = {
    "scatter": "#007AFF",
    "line": "#FF3B30",
    "residual": "#34C759",
    "text": "#1C1C1E"
}

def safe_tag(tag: str) -> str:
    """Identity mapping for dataset tags.

    Call sites use this to derive file-system names from a tag. The tag is
    handed back untouched, special characters included.
    """
    unchanged = tag
    return unchanged


# -------------------- Plotting Functions --------------------
def _style_axes(ax):
    """Apply the shared look: outward ticks, black frame, no grid, white face."""
    ax.tick_params(axis='both', direction='out', length=6, width=1.2, color='black')
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_color('black')
    ax.grid(False)
    ax.set_facecolor('white')


def plot_scatter_and_residuals(y_true, y_pred, save_folder, base_name):
    """Save a parity plot and a residual plot for one set of predictions.

    Two PNG files are written into ``save_folder`` (created if missing):
    ``<base_name>_scatter.png`` (true vs. predicted, with the identity line
    and an R2/MAE annotation) and ``<base_name>_residuals.png`` (predicted vs.
    ``predicted - true``).

    Plotting is best-effort: any exception is logged and swallowed so that a
    plotting problem never aborts the surrounding training run.

    Args:
        y_true: Observed target values (any 1-D array-like).
        y_pred: Predicted target values, same length as ``y_true``.
        save_folder: Output directory for the PNG files.
        base_name: File-name prefix; also used as the tag in log messages.

    Returns:
        None.
    """
    try:
        os.makedirs(save_folder, exist_ok=True)

        # Flatten to plain float arrays so lists, Series and column vectors all
        # behave the same and the residual subtraction is purely positional.
        y_true = np.asarray(y_true, dtype=float).ravel()
        y_pred = np.asarray(y_pred, dtype=float).ravel()

        # Scatter (parity) plot
        fig, ax = plt.subplots(figsize=(6, 6))
        try:
            _style_axes(ax)
            ax.scatter(y_true, y_pred, alpha=0.6, color=IPHONE_COLORS['scatter'], edgecolor='k', s=50)
            xymin, xymax = float(np.min(y_true)), float(np.max(y_true))
            ax.plot([xymin, xymax], [xymin, xymax], linestyle='--', color=IPHONE_COLORS['line'], linewidth=1)
            r2 = r2_score(y_true, y_pred)
            mae = mean_absolute_error(y_true, y_pred)
            ax.set_xlabel("True Retention Time (s)")
            ax.set_ylabel("Predicted Retention Time (s)")
            # "\u00b2" is the superscript two; written as an escape so the label
            # cannot be corrupted by a wrong source-file encoding.
            ax.text(0.05, 0.95, f"R\u00b2 = {r2:.3g}\nMAE = {mae:.3g}", transform=ax.transAxes,
                    va='top', ha='left', fontsize=10, color=IPHONE_COLORS['text'])
            # The tiny epsilon keeps the limits distinct when all targets are equal.
            pad = 0.1 * (xymax - xymin + 1e-9)
            ax.set_ylim([xymin - pad, xymax + pad])
            fig.tight_layout()
            fig.savefig(os.path.join(save_folder, f"{base_name}_scatter.png"), dpi=600)
        finally:
            # Always release the figure, even if drawing or saving failed.
            plt.close(fig)

        # Residual plot
        residuals = y_pred - y_true
        fig, ax = plt.subplots(figsize=(6, 6))
        try:
            _style_axes(ax)
            ax.scatter(y_pred, residuals, alpha=0.6, color=IPHONE_COLORS['scatter'], edgecolor='k', s=50)
            ax.axhline(y=0, linestyle='--', color=IPHONE_COLORS['line'], linewidth=1)
            ax.set_xlabel("Predicted Retention Time (s)")
            ax.set_ylabel("Residuals (Predicted - True)")
            rmin, rmax = float(np.min(residuals)), float(np.max(residuals))
            pad = 0.1 * (rmax - rmin + 1e-9)
            ax.set_ylim([rmin - pad, rmax + pad])
            fig.tight_layout()
            fig.savefig(os.path.join(save_folder, f"{base_name}_residuals.png"), dpi=600)
        finally:
            plt.close(fig)
    except Exception as e:
        logging.error(f"[{base_name}] Plotting failed: {str(e)}", exc_info=True)


def plot_optuna_rmse(study, save_path):
    """Plot the running-best objective value of an Optuna study.

    Only trials in state ``COMPLETE`` contribute. The x-axis is the position
    within that list of completed trials, and the y-axis is the cumulative
    minimum of their values. Nothing is written when the study has no
    completed trial.

    Plotting is best-effort: any exception is logged and swallowed.

    Args:
        study: Object exposing ``trials``, each with ``state`` and ``value``.
        save_path: Destination path of the PNG file.

    Returns:
        None.
    """
    try:
        completed_values = [
            t.value for t in study.trials
            if t.state == optuna.trial.TrialState.COMPLETE
        ]
        if not completed_values:
            return
        cumulative_best = np.minimum.accumulate(completed_values)

        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            ax.plot(cumulative_best, marker='o', linestyle='-', color='blue')
            ax.set_xlabel('Trial')
            ax.set_ylabel('Best RMSE so far')
            ax.set_title('Optuna RMSE Convergence')
            ax.grid(True)
            fig.tight_layout()
            fig.savefig(save_path, dpi=600)
        finally:
            # Release the figure even when drawing or saving raised.
            plt.close(fig)
    except Exception as e:
        logging.error(f"Failed to plot Optuna RMSE curve: {str(e)}", exc_info=True)


# -------------------- Pretrained Asset Loading --------------------
def load_pretrained_assets():
    """Load the pretrained model, its feature list, imputer and tuned params.

    The model, feature list and imputer files are mandatory. The metrics JSON
    is optional: when it exists and contains a ``"best_params"`` mapping, that
    mapping is returned; otherwise an empty dict is returned. A metrics file
    that cannot be read or parsed is reported in the log and ignored.

    Returns:
        tuple: ``(model, feature_cols, imputer, pretrain_params)``.

    Raises:
        FileNotFoundError: If any of the three mandatory files is missing; the
            message lists the missing paths.
    """
    required = (PRETRAIN_MODEL_PATH, PRETRAIN_FEATURE_PATH, PRETRAIN_IMPUTER_PATH)
    missing = [path for path in required if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError(f"Pretrained assets not found: {', '.join(missing)}")

    # NOTE: joblib.load unpickles arbitrary objects -- only point these paths
    # at artefacts produced by a trusted pretraining run.
    model = joblib.load(PRETRAIN_MODEL_PATH)
    feature_cols = joblib.load(PRETRAIN_FEATURE_PATH)
    imputer = joblib.load(PRETRAIN_IMPUTER_PATH)

    pretrain_params = {}
    if os.path.isfile(PRETRAIN_METRICS_PATH):
        try:
            with open(PRETRAIN_METRICS_PATH, "r", encoding="utf-8") as f:
                loaded = json.load(f).get("best_params", {})
        except (OSError, ValueError, AttributeError) as e:
            # Unreadable file, invalid JSON, or a top-level value that is not
            # a mapping: keep going with the model's own parameters.
            logging.warning(f"Ignoring pretrained metrics file {PRETRAIN_METRICS_PATH}: {e}")
        else:
            if isinstance(loaded, dict):
                pretrain_params = loaded
    return model, feature_cols, imputer, pretrain_params


# -------------------- Data Checking --------------------
def check_and_extract(df: pd.DataFrame, feature_cols: list, target_col: str):
    """Validate the frame's columns and pull out the feature matrix and target.

    Args:
        df: Frame expected to contain every feature column and the target.
        feature_cols: Feature column names, in the order the model expects.
        target_col: Name of the target column.

    Returns:
        tuple: ``(X, y)`` taken from ``.values`` of the feature columns and of
        the target column respectively.

    Raises:
        KeyError: If any feature column or the target column is absent.
    """
    available = df.columns

    absent = []
    for col in feature_cols:
        if col not in available:
            absent.append(col)
    if absent:
        raise KeyError(f"Data missing the following feature columns: {absent}")

    if target_col not in available:
        raise KeyError(f"Missing target column '{target_col}'")

    features = df[feature_cols].values
    target = df[target_col].values
    return features, target


# -------------------- Fine-tuning with Optuna --------------------
def finetune_with_optuna(tag_raw, pretrain_model, feature_cols, imputer, pretrain_params,
                         target_col="UV_RT-s", n_trials=40, n_splits=5,
                         new_trees=50):
    """Fine-tune the pretrained LightGBM model on one target dataset.

    Workflow:
      1. Read ``<tag_raw>_train.csv`` / ``<tag_raw>_test.csv`` from DATA_DIR,
         select ``feature_cols`` and apply the pretrained ``imputer``.
      2. Run an Optuna search (K-fold CV RMSE, each fold continuing training
         from ``pretrain_model`` via ``init_model``).
      3. Refit on the full training split twice with the best parameters:
         once continuing from ``pretrain_model`` ("finetuned") and once from
         scratch ("scratch", the control).
      4. Evaluate both on the test split, then write plots, models, feature
         list, imputer and a metrics JSON into ``MODEL_DIR/<tag>``.

    Args:
        tag_raw: Dataset tag, i.e. the prefix of the train/test CSV files.
        pretrain_model: Fitted LGBMRegressor used as ``init_model``.
        feature_cols: Feature column names expected in the CSV files.
        imputer: Fitted transformer exposing ``transform``.
        pretrain_params: Optional mapping of tuned pretraining parameters that
            override ``pretrain_model.get_params()``. It is not modified.
        target_col: Name of the target column.
        n_trials: Number of Optuna trials.
        n_splits: Number of CV folds.
        new_trees: Added to ``pretrain_model.n_estimators`` to form the
            ``n_estimators`` of every model fitted here.

    Returns:
        dict: ``{"tag", "status": "ok", "metrics", "fine_model_path",
        "scratch_model_path"}`` on success, or ``{"tag", "status": "warn",
        "message"}`` when the CSV files are missing.

    Raises:
        KeyError: If a CSV lacks a feature column or the target column.
    """
    tag = safe_tag(tag_raw)  # Returns original tag unchanged
    train_path = os.path.join(DATA_DIR, f"{tag_raw}_train.csv")
    test_path = os.path.join(DATA_DIR, f"{tag_raw}_test.csv")

    # Check the inputs before creating anything on disk, so a missing dataset
    # does not leave an empty output directory behind.
    if not (os.path.isfile(train_path) and os.path.isfile(test_path)):
        msg = f"Cannot find train/test CSV for {tag_raw}"
        logging.warning(msg)
        return {"tag": tag_raw, "status": "warn", "message": msg}

    dataset_dir = os.path.join(MODEL_DIR, f"{tag}")
    os.makedirs(dataset_dir, exist_ok=True)

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    X_all, y_all = check_and_extract(train_df, feature_cols, target_col)
    X_test, y_test = check_and_extract(test_df, feature_cols, target_col)

    X_all = imputer.transform(X_all)
    X_test = imputer.transform(X_test)

    # -------------------- Safe Parameter Handling --------------------
    # n_estimators and random_state are always passed explicitly below, so
    # they are filtered out here. Filtering builds new dicts instead of
    # popping, because the caller reuses the same pretrain_params object for
    # every dataset.
    managed_keys = ("n_estimators", "random_state")
    base_params = {
        key: value
        for key, value in pretrain_model.get_params().items()
        if key not in managed_keys
    }
    if pretrain_params:
        base_params.update({
            key: value
            for key, value in pretrain_params.items()
            if key not in managed_keys
        })

    # NOTE(review): when continuing from init_model, LightGBM appears to treat
    # the boosting-round count as rounds to ADD on top of the initial model,
    # in which case this adds n_estimators + new_trees trees rather than
    # new_trees -- confirm the intended total against the LightGBM docs.
    n_rounds = pretrain_model.n_estimators + new_trees

    # -------------------- Multi-stage Parameter Tuning --------------------
    def objective(trial, stage="coarse"):
        """Return the mean K-fold validation RMSE for one sampled configuration.

        Only the "coarse" stage is invoked by this function; the "fine" stage
        narrows the ranges around the base parameters.
        """
        params = base_params.copy()
        if stage == "coarse":
            params.update({
                "learning_rate": trial.suggest_float("learning_rate", 0.002, 0.1, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 16, 128),
                "max_depth": trial.suggest_int("max_depth", 3, 12),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            })
        else:  # fine
            params.update({
                "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
                "num_leaves": trial.suggest_int("num_leaves", max(16, base_params.get("num_leaves", 31)-16),
                                               min(128, base_params.get("num_leaves", 31)+16)),
                "max_depth": trial.suggest_int("max_depth", max(3, base_params.get("max_depth", 7)-3),
                                              min(12, base_params.get("max_depth", 7)+3)),
            })
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
        rmses = []
        for train_idx, valid_idx in kf.split(X_all):
            X_tr, X_val = X_all[train_idx], X_all[valid_idx]
            y_tr, y_val = y_all[train_idx], y_all[valid_idx]

            model = lgb.LGBMRegressor(
                **params,
                n_estimators=n_rounds,
                random_state=SEED
            )
            model.fit(
                X_tr, y_tr,
                init_model=pretrain_model,
                eval_set=[(X_val, y_val)],
                eval_metric="rmse",
                callbacks=[early_stopping(stopping_rounds=50)]
            )
            y_pred = model.predict(X_val)
            rmses.append(np.sqrt(mean_squared_error(y_val, y_pred)))
        return float(np.mean(rmses))

    # -------------------- Optuna Parameter Tuning --------------------
    # Seed the sampler so the search is reproducible, like everything else
    # driven by SEED.
    # NOTE(review): the objective never calls trial.report(), so the
    # MedianPruner presumably never prunes anything -- confirm.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=SEED),
        pruner=MedianPruner(n_startup_trials=5),
    )
    study.optimize(lambda t: objective(t, stage="coarse"), n_trials=n_trials, show_progress_bar=True)
    plot_optuna_rmse(study, os.path.join(dataset_dir, f"{tag}_coarse_optuna.png"))

    best_params = base_params.copy()
    best_params.update(study.best_params)
    fine_model = lgb.LGBMRegressor(
        **best_params,
        n_estimators=n_rounds,
        random_state=SEED
    )
    fine_model.fit(X_all, y_all, init_model=pretrain_model)

    # -------------------- Scratch Training Control Model --------------------
    # Same hyper-parameters, but trained without the pretrained initial model.
    scratch_model = lgb.LGBMRegressor(
        **best_params,
        n_estimators=n_rounds,
        random_state=SEED
    )
    scratch_model.fit(X_all, y_all)

    y_pred_test_fine = fine_model.predict(X_test)
    y_pred_test_scratch = scratch_model.predict(X_test)

    # Cast to built-in floats so the metrics serialise cleanly to JSON.
    rmse_fine = float(np.sqrt(mean_squared_error(y_test, y_pred_test_fine)))
    r2_fine = float(r2_score(y_test, y_pred_test_fine))
    mae_fine = float(mean_absolute_error(y_test, y_pred_test_fine))

    rmse_scratch = float(np.sqrt(mean_squared_error(y_test, y_pred_test_scratch)))
    r2_scratch = float(r2_score(y_test, y_pred_test_scratch))
    mae_scratch = float(mean_absolute_error(y_test, y_pred_test_scratch))

    plot_scatter_and_residuals(y_test, y_pred_test_fine, dataset_dir, f"{tag}_finetuned")
    plot_scatter_and_residuals(y_test, y_pred_test_scratch, dataset_dir, f"{tag}_scratch")

    # -------------------- Save Models and Parameters --------------------
    fine_model_path = os.path.join(dataset_dir, f"{tag}_finetuned_model.joblib")
    scratch_model_path = os.path.join(dataset_dir, f"{tag}_scratch_model.joblib")
    feature_path = os.path.join(dataset_dir, f"{tag}_feature_list.pkl")
    imputer_path = os.path.join(dataset_dir, f"{tag}_imputer.pkl")
    metrics_path = os.path.join(dataset_dir, f"{tag}_metrics.json")

    joblib.dump(fine_model, fine_model_path)
    joblib.dump(scratch_model, scratch_model_path)
    joblib.dump(feature_cols, feature_path)
    joblib.dump(imputer, imputer_path)

    metrics = {
        "finetuned": {
            "rmse": rmse_fine,
            "r2": r2_fine,
            "mae": mae_fine
        },
        "scratch": {
            "rmse": rmse_scratch,
            "r2": r2_scratch,
            "mae": mae_scratch
        },
        "best_params": best_params
    }

    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4)

    logging.info(f"[{tag_raw}] Fine-tuning completed. Finetuned RMSE={rmse_fine:.4f}, Scratch RMSE={rmse_scratch:.4f}")

    return {
        "tag": tag_raw,
        "status": "ok",
        "metrics": metrics,
        "fine_model_path": fine_model_path,
        "scratch_model_path": scratch_model_path
    }


# -------------------- Main Workflow --------------------
if __name__ == "__main__":
    # Without the pretrained assets nothing can be done: log and abort.
    try:
        pretrain_model, feature_cols, imputer, pretrain_params = load_pretrained_assets()
    except FileNotFoundError as e:
        logging.error(str(e))
        raise

    results = []
    for target_dataset in TARGET_DATASETS:
        # Each dataset is independent and expensive to process, so a failure
        # on one is logged (with traceback) and recorded instead of aborting
        # the run and discarding the results already obtained.
        try:
            res = finetune_with_optuna(target_dataset, pretrain_model, feature_cols, imputer, pretrain_params,
                                       n_trials=40, n_splits=5, new_trees=50)
        except Exception as e:
            logging.exception(f"[{target_dataset}] Fine-tuning failed")
            res = {"tag": target_dataset, "status": "error", "message": str(e)}
        results.append(res)

    # Summary of every dataset's outcome, including skipped or failed ones.
    results_path = os.path.join(MODEL_DIR, "finetune_results.json")
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    logging.info("All dataset fine-tuning completed.")
    not_ok = [r["tag"] for r in results if r.get("status") != "ok"]
    if not_ok:
        logging.warning(f"Datasets without a successful fine-tuning run: {not_ok}")