In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, explained_variance_score, mean_absolute_error
from scipy.stats import pearsonr
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from joblib import Parallel, delayed
import joblib
import os

# Absolute location of the input CSV.
# NOTE(review): the "/content/drive/My Drive" prefix looks like a Colab-mounted
# Google Drive — confirm the drive is mounted before running this cell.
data_path = "/content/drive/My Drive/Adaptive gradient method/Adaptive gradient method from L_0 to L infnity/Final_pine_data.csv"

# Guard the whole pipeline on the file being present: when it is missing only
# the two diagnostic messages below are printed and nothing in the `else`
# branch (loading, training, reporting) is executed.
if not os.path.exists(data_path):
    print(f"Error: The file '{data_path}' was not found.")
    print("Please make sure the file is correctly placed and the path is accessible.")
else:
    # ----------------------------
    # Load and Preprocess Data
    # ----------------------------
    data = pd.read_csv(data_path)
    Y = data.iloc[:, :7].values  # first 7 columns: the 7 regression targets
    X = data.iloc[:, 7:].values  # remaining columns: SNPs/features

    # Split BEFORE scaling so that the held-out test rows cannot influence the
    # scaler's mean/variance estimates (fitting the scaler on the full matrix
    # leaks test-set statistics into preprocessing). The call and seed are the
    # same as before, so the rows assigned to each side are unchanged.
    X_trainval, X_test, Y_trainval, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training/validation rows only, then apply that
    # same fitted transform to the test rows.
    scaler = StandardScaler()
    X_trainval = scaler.fit_transform(X_trainval)
    X_test = scaler.transform(X_test)

    # Hyperopt search space for the LightGBM regressor:
    #   - hp.choice picks one value from a discrete grid,
    #   - hp.loguniform samples the learning rate on a log scale in [0.01, 0.2],
    #   - hp.uniform samples the remaining parameters uniformly in their range.
    search_space = {
        'num_leaves': hp.choice('num_leaves', np.arange(10, 100, 10, dtype=int)),
        'max_depth': hp.choice('max_depth', np.arange(3, 15, 1, dtype=int)),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
        'n_estimators': hp.choice('n_estimators', np.arange(100, 500, 50, dtype=int)),
        'subsample': hp.uniform('subsample', 0.6, 1.0),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
        'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    }

    # ----------------------------------------------------
    # Function to train and evaluate a single trait model
    # ----------------------------------------------------
    def train_and_eval_for_trait(X_train, Y_train, X_test, Y_test, trait_idx,
                                 val_size=0.2, max_evals=50, seed=42):
        """
        Tune, train and evaluate a LightGBM regressor for a single trait.

        Hyperparameters are chosen with Hyperopt (TPE) against a validation
        subset carved out of the training data. The test set is used exactly
        once, for the final metrics, so the reported scores are not biased by
        early stopping or by hyperparameter selection.

        Args:
            X_train: 2-D feature array used for fitting and validation.
            Y_train: 2-D target array; only column ``trait_idx`` is used.
            X_test: 2-D feature array of the held-out test rows.
            Y_test: 2-D target array of the held-out test rows.
            trait_idx: Column index of the trait to model.
            val_size: Fraction of the training rows held out as the
                validation set for early stopping and the tuning loss.
            max_evals: Number of Hyperopt evaluations.
            seed: Seed for the train/validation split and the TPE sampler.

        Returns:
            dict with keys ``trait_index``, ``r2``, ``mse``, ``rmse``, ``mae``
            and ``correlation`` (Pearson r), all computed on the test set.
        """
        Y_train_trait = Y_train[:, trait_idx]
        Y_test_trait = Y_test[:, trait_idx]

        # Inner split: the validation part drives early stopping and the
        # Hyperopt loss, keeping the test set untouched during tuning.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_train, Y_train_trait, test_size=val_size, random_state=seed
        )

        # Objective function for Hyperopt
        def objective(params):
            # LightGBM only applies `subsample` when bagging is enabled via
            # subsample_freq > 0; default it to 1 so the tuned value has an
            # effect. A search space that sets subsample_freq itself wins.
            model_params = {'subsample_freq': 1, **params}
            model = lgb.LGBMRegressor(
                objective='regression',
                metric='rmse',
                **model_params
            )
            model.fit(
                X_fit, y_fit,
                eval_set=[(X_val, y_val)],
                callbacks=[lgb.early_stopping(100)]
            )

            val_pred = model.predict(X_val)
            loss = mean_squared_error(y_val, val_pred)
            # The fitted model is stored in the trial so the best one can be
            # retrieved afterwards without refitting.
            return {'loss': loss, 'status': STATUS_OK, 'model': model}

        # Run Bayesian Optimization to find the best hyperparameters
        trials = Trials()
        fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials,
            rstate=np.random.default_rng(seed)
        )

        # Model of the trial with the lowest validation loss
        best_model = trials.best_trial['result']['model']

        # Single, final use of the test set
        y_pred = best_model.predict(X_test)

        # Calculate metrics
        r2 = r2_score(Y_test_trait, y_pred)
        mse = mean_squared_error(Y_test_trait, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(Y_test_trait, y_pred)
        corr, _ = pearsonr(Y_test_trait, y_pred)

        return {
            'trait_index': trait_idx,
            'r2': r2,
            'mse': mse,
            'rmse': rmse,
            'mae': mae,
            'correlation': corr
        }

    # ----------------------------------------------------
    # Fit one model per trait, dispatched through joblib
    # ----------------------------------------------------
    print("Training LightGBM models for 7 traits...")

    # One delayed call per target column; n_jobs=-1 lets joblib use all CPUs.
    trait_count = Y_trainval.shape[1]
    trait_jobs = (
        delayed(train_and_eval_for_trait)(X_trainval, Y_trainval, X_test, Y_test, trait)
        for trait in range(trait_count)
    )
    results = Parallel(n_jobs=-1)(trait_jobs)

    # ----------------------------
    # Report the per-trait test metrics
    # ----------------------------
    print("\n--- Final Test Evaluation for all Traits ---")
    for entry in results:
        # Assemble the report for one trait and emit it in a single print;
        # traits are shown 1-based.
        report_lines = [
            f"\nTrait {entry['trait_index'] + 1}:",
            f"  R2: {entry['r2']:.4f}",
            f"  MSE: {entry['mse']:.4f}",
            f"  RMSE: {entry['rmse']:.4f}",
            f"  MAE: {entry['mae']:.4f}",
            f"  Pearson Correlation: {entry['correlation']:.4f}",
        ]
        print("\n".join(report_lines))