In [1]:
import numpy as np 
from pandas import Series, DataFrame
import pandas as pd
import copy
from astropy import table
import matplotlib.pyplot as plt
import time

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

##### Import data

In [3]:
data = pd.read_parquet(r'D:\Estágio IA\newdata\data.parquet')
target = pd.read_parquet(r'D:\Estágio IA\newdata\target.parquet')

###### ML

## MultiOutputRegressor

In [5]:
labels = ['lp_mass_best', 'lp_zBEST', 'lp_SFR_best']
columns_df = ['xgboost', 'lightgbm', 'catboost']

X = data
y = target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

models = [
    XGBRegressor(
        objective='reg:squarederror', 
        n_estimators=100,
        max_depth=8,
        nthread=-1,
    ),
    LGBMRegressor(
        objective='regression', 
        n_jobs=-1,
        n_estimators=100,
        max_depth=8,
        subsample=0.8,
        verbosity=-1
    ),
    CatBoostRegressor(
        loss_function='RMSE',   
        logging_level='Silent',
        n_estimators=100,
        max_depth=8
    )
]

pred_dict = {}

for i, method in enumerate(models):
    model_name = columns_df[i]
    print(f"\nTraining {model_name}...")
    
    start_time = time.time()
    
    regressor = MultiOutputRegressor(method)
    regressor.fit(X_train, y_train)
    pred = regressor.predict(X_train)

    end_time = time.time()
    execution_time = end_time - start_time
    print(f"Execution time for {model_name}: {execution_time:.2f} seconds")

    for j, label in enumerate(labels):
        col_name = model_name + '_' + label
        pred_dict[col_name] = pred[:, j]
        
    y_pred_test = regressor.predict(X_test)
    
pred_df_v2 = pd.DataFrame(pred_dict, index=y_train.index)


Training xgboost...
Execution time for xgboost: 1307.07 seconds

Training lightgbm...
Execution time for lightgbm: 43.31 seconds

Training catboost...
Execution time for catboost: 80.43 seconds


In [6]:
pred_df_v2

Unnamed: 0,xgboost_lp_mass_best,xgboost_lp_zBEST,xgboost_lp_SFR_best,lightgbm_lp_mass_best,lightgbm_lp_zBEST,lightgbm_lp_SFR_best,catboost_lp_mass_best,catboost_lp_zBEST,catboost_lp_SFR_best
222345,1.342709e+08,1.134190,7.234098,2.968920e+08,0.028103,6.028714,-1.836767e+08,3.846582,7.385354
253163,2.746504e+08,-0.132914,7.412137,5.277471e+08,0.020592,6.879609,2.784501e+08,0.271987,6.564315
181424,1.280761e+08,0.107212,7.761805,4.810818e+08,-0.072516,7.955942,1.755886e+08,0.021859,7.723534
273590,1.239478e+09,0.131686,9.012308,8.509591e+08,0.220224,8.789131,1.444623e+09,0.438247,8.405913
596815,8.162358e+08,0.085051,8.657460,8.638945e+08,0.108882,8.708990,9.078432e+08,0.006404,8.270723
...,...,...,...,...,...,...,...,...,...
700696,2.057562e+09,0.071851,8.788814,1.943647e+09,0.118027,8.490816,1.669874e+09,-0.003614,8.172510
419844,7.296207e+08,-66.436684,-68.038658,8.258971e+08,-40.100092,-29.555641,8.913955e+08,-14.678451,-45.012268
97200,3.156642e+09,0.212369,8.782661,1.716548e+09,0.106178,8.676920,2.002398e+09,-0.204143,7.200900
110171,1.122658e+09,-4.674853,7.395592,2.529595e+09,-14.388249,-10.409167,1.339996e+09,-17.760750,-9.730334
