## Imports

In [2]:
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import random
import pickle
import joblib

In [3]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix,\
                             explained_variance_score, mean_squared_error, max_error, mean_absolute_error,\
                             root_mean_squared_error, median_absolute_error
from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.decomposition import PCA, KernelPCA
# pas besoin de feature selection parce que pas beacoup de colonnes

import optuna

In [4]:
import read_mist_models

from utils import Iso_data_handler

def gaussian(dsts):
    kernel_width = .5
    weights = np.exp(-(dsts**2)/kernel_width)
    return weights

# To find certain rows

def isclose_pandas_apply(row, col_name, value, bool_index, rel_tol=1e-6):
    if math.isclose(row[col_name], value, rel_tol=rel_tol):
        bool_index.append(True)
    else:
        bool_index.append(False)

def isclose_pandas(df, col_name, value, rel_tol=1e-6):
    bool_index = []
    df.apply(isclose_pandas_apply, axis=1, args=(col_name, value, bool_index, rel_tol))
    return bool_index

## Data preparation

In [5]:
iso_handler = Iso_data_handler("data/MIST_v1.2_vvcrit0.0_basic_isos/", 
                              ['log10_isochrone_age_yr', 'log_Teff', 'log_g', 'star_mass', 'phase', 'metallicity', 'log_R'])


In [6]:
iso_df = iso_handler.full_iso_data_to_panda()

Reading dataframe from csv file...


In [7]:
display(iso_df)

Unnamed: 0,log10_isochrone_age_yr,log_Teff,log_g,star_mass,phase,metallicity,log_R
0,5.0,3.486221,3.131342,0.100000,-1.0,-0.25,0.153402
1,5.0,3.487362,3.126808,0.102645,-1.0,-0.25,0.160326
2,5.0,3.489243,3.119367,0.107039,-1.0,-0.25,0.171785
3,5.0,3.491102,3.112165,0.111419,-1.0,-0.25,0.183099
4,5.0,3.492937,3.105143,0.115789,-1.0,-0.25,0.194305
...,...,...,...,...,...,...,...
1467117,10.3,4.402490,7.777159,0.532726,6.0,0.50,-1.806255
1467118,10.3,4.387132,7.783242,0.532730,6.0,0.50,-1.809295
1467119,10.3,4.371789,7.789130,0.532735,6.0,0.50,-1.812237
1467120,10.3,4.356480,7.794844,0.532741,6.0,0.50,-1.815091


In [8]:
# Keeping only the relevant star phases
phase_filtered_iso_df = iso_df\
    .where((iso_df.phase == 0) | (iso_df.phase == 2) | (iso_df.phase == 3) | (iso_df.phase == 4) | (iso_df.phase == 5))\
    .dropna().reset_index(drop=True)

In [9]:
display(phase_filtered_iso_df)

Unnamed: 0,log10_isochrone_age_yr,log_Teff,log_g,star_mass,phase,metallicity,log_R
0,5.0,4.494412,4.346972,13.584360,0.0,-0.25,0.610679
1,5.0,4.497517,4.345776,13.765512,0.0,-0.25,0.614753
2,5.0,4.500556,4.344580,13.942887,0.0,-0.25,0.618755
3,5.0,4.504040,4.343050,14.591712,0.0,-0.25,0.624670
4,5.0,4.507576,4.341483,15.426062,0.0,-0.25,0.631187
...,...,...,...,...,...,...,...
1165292,10.3,3.425746,-0.551440,0.602856,5.0,0.50,2.384899
1165293,10.3,3.426469,-0.560350,0.598549,5.0,0.50,2.387797
1165294,10.3,3.427744,-0.566057,0.594116,5.0,0.50,2.389036
1165295,10.3,3.429413,-0.569225,0.589648,5.0,0.50,2.388981


In [10]:
X = phase_filtered_iso_df[['log10_isochrone_age_yr', 'metallicity', 'star_mass', 'log_R']].to_numpy()
y = phase_filtered_iso_df[['log_Teff', 'log_g', 'log_R']].to_numpy()

X_TRAIN, X_IVS, y_TRAIN, y_IVS = train_test_split(X, y, test_size=0.25, random_state=1337)

print(X_TRAIN.shape, X_IVS.shape)
print(y_TRAIN.shape, y_IVS.shape)

(873972, 4) (291325, 4)
(873972, 3) (291325, 3)


In [11]:
print(f"Range in train data for the Teff parameter : {min(y_TRAIN[:, 0])} - {max(y_TRAIN[:, 0])}")
print(f"Median value in train data for the Teff parameter: {np.median(y_TRAIN[:, 0])}")
print(f"Mean value in train data for the Teff parameter: {np.mean(y_TRAIN[:, 0])}")

print(f"Range in test data for the Teff parameter : {min(y_IVS[:, 0])} - {max(y_IVS[:, 0])}")
print(f"Median value in test data for the Teff parameter: {np.median(y_IVS[:, 0])}")
print(f"Mean value in test data for the Teff parameter: {np.mean(y_IVS[:, 0])}")

print()

print(f"Range in train data for the log_g parameter : {min(y_TRAIN[:, 1])} - {max(y_TRAIN[:, 1])}")
print(f"Median value in train data for the log_g parameter: {np.median(y_TRAIN[:, 1])}")
print(f"Mean value in train data for the log_g parameter: {np.mean(y_TRAIN[:, 1])}")

print(f"Range in test data for the log_g parameter : {min(y_IVS[:, 1])} - {max(y_IVS[:, 1])}")
print(f"Median value in test data for the log_g parameter: {np.median(y_IVS[:, 1])}")
print(f"Mean value in test data for the log_g parameter: {np.mean(y_IVS[:, 1])}")

print()

print(f"Range in train data for the log_R parameter : {min(y_TRAIN[:, 2])} - {max(y_TRAIN[:, 2])}")
print(f"Median value in train data for the log_R parameter: {np.median(y_TRAIN[:, 2])}")
print(f"Mean value in train data for the log_R parameter: {np.mean(y_TRAIN[:, 2])}")

print(f"Range in test data for the log_R parameter : {min(y_IVS[:, 2])} - {max(y_IVS[:, 2])}")
print(f"Median value in test data for the log_R parameter: {np.median(y_IVS[:, 2])}")
print(f"Mean value in test data for the log_R parameter: {np.mean(y_IVS[:, 2])}")


Range in train data for the Teff parameter : 3.344668191350628 - 5.587140896561816
Median value in train data for the Teff parameter: 3.6805516874555706
Mean value in train data for the Teff parameter: 3.8474460990706407
Range in test data for the Teff parameter : 3.344674699659814 - 5.551422149694809
Median value in test data for the Teff parameter: 3.681388618692048
Mean value in test data for the Teff parameter: 3.8476834348837503

Range in train data for the log_g parameter : -1.1419195945355185 - 8.612101665912416
Median value in train data for the log_g parameter: 2.066565790889051
Mean value in train data for the log_g parameter: 2.134248500181511
Range in test data for the log_g parameter : -1.14489780699774 - 8.605154669542225
Median value in test data for the log_g parameter: 2.0724371580456222
Mean value in test data for the log_g parameter: 2.1360511132815074

Range in train data for the log_R parameter : -2.085171571669866 - 3.1297545143214007
Median value in train data fo

In [12]:
phase_mass_filtered_iso_df = phase_filtered_iso_df.where(phase_filtered_iso_df.star_mass < 30)\
                                                  .dropna()\
                                                  .reset_index(drop=True)

In [13]:
display(phase_mass_filtered_iso_df)

Unnamed: 0,log10_isochrone_age_yr,log_Teff,log_g,star_mass,phase,metallicity,log_R
0,5.0,4.494412,4.346972,13.584360,0.0,-0.25,0.610679
1,5.0,4.497517,4.345776,13.765512,0.0,-0.25,0.614753
2,5.0,4.500556,4.344580,13.942887,0.0,-0.25,0.618755
3,5.0,4.504040,4.343050,14.591712,0.0,-0.25,0.624670
4,5.0,4.507576,4.341483,15.426062,0.0,-0.25,0.631187
...,...,...,...,...,...,...,...
1105728,10.3,3.425746,-0.551440,0.602856,5.0,0.50,2.384899
1105729,10.3,3.426469,-0.560350,0.598549,5.0,0.50,2.387797
1105730,10.3,3.427744,-0.566057,0.594116,5.0,0.50,2.389036
1105731,10.3,3.429413,-0.569225,0.589648,5.0,0.50,2.388981


In [14]:
X_mass = phase_filtered_iso_df[['log10_isochrone_age_yr', 'metallicity', 'star_mass']].to_numpy()
y_mass = phase_filtered_iso_df[['log_Teff', 'log_g', 'log_R']].to_numpy()

X_TRAIN_mass, X_IVS_mass, y_TRAIN_mass, y_IVS_mass = train_test_split(X_mass, y_mass, test_size=0.25, random_state=1337)

print(X_TRAIN_mass.shape, X_IVS_mass.shape)
print(y_TRAIN_mass.shape, y_IVS_mass.shape)

(873972, 3) (291325, 3)
(873972, 3) (291325, 3)


## PCA data preparation

In [14]:
pca = PCA(n_components=4) # maybe try with less or more components
pca.fit(X_TRAIN)
tve=0
for i, ve in enumerate(pca.explained_variance_ratio_):
    tve+=ve
    print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i, ve, tve) )
print()

PC0 - Variance explained:  0.9909 - Total Variance:  0.9909
PC1 - Variance explained:  0.0044 - Total Variance:  0.9953
PC2 - Variance explained:  0.0025 - Total Variance:  0.9979
PC3 - Variance explained:  0.0021 - Total Variance:  1.0000



In [15]:
X_TRAIN_PCA=pca.transform(X_TRAIN)
X_IVS_PCA=pca.transform(X_IVS)
print(X_TRAIN_PCA.shape)

(873972, 4)


## Model training

In [16]:

# TODO faire que la fonction puisse accepter autant d'output qu'on veut
# TODO mettre les outputs du modèle dans un fichier csv pour utiliser plus tard
# TODO mettre la possibilité de rajouter des paramètres à tester dans le modèle
# TODO rajouter le calcul du temps et le rajouter dans le csv

def Kfold_pipeline(model, x_train_data=X_TRAIN, y_train_data=y_TRAIN, filename="", n_splits=10, shuffle=True):
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=12)
    TRUTH_TEFF=None
    TRUTH_LOG_G=None
    TRUTH_RADIUS=None
    PREDS_TEFF=None
    PREDS_LOG_G=None
    PREDS_RADIUS=None
    counter = 0
    print("split", end=' ')
    for train_index, test_index in kf.split(x_train_data):
        counter += 1
        print(str(counter), end=' ')
        X_train, X_test = x_train_data[train_index], x_train_data[test_index]
        y_train, y_test = y_train_data[train_index], y_train_data[test_index]

        mdl = model()
        mdl.fit(X_train, y_train)
        preds = mdl.predict(X_test)

        if TRUTH_TEFF is None:
            PREDS_TEFF=preds[:, 0]
            TRUTH_TEFF=y_test[:, 0]
        else:
            PREDS_TEFF=np.hstack((PREDS_TEFF, preds[:, 0]))
            TRUTH_TEFF=np.hstack((TRUTH_TEFF, y_test[:, 0]))
        
        if TRUTH_LOG_G is None:
            PREDS_LOG_G=preds[:, 1]
            TRUTH_LOG_G=y_test[:, 1]
        else:
            PREDS_LOG_G=np.hstack((PREDS_LOG_G, preds[:, 1]))
            TRUTH_LOG_G=np.hstack((TRUTH_LOG_G, y_test[:, 1]))

        if TRUTH_RADIUS is None:
            PREDS_RADIUS=preds[:, 2]
            TRUTH_RADIUS=y_test[:, 2]
        else:
            PREDS_RADIUS=np.hstack((PREDS_RADIUS, preds[:, 2]))
            TRUTH_RADIUS=np.hstack((TRUTH_RADIUS, y_test[:, 2]))

    print()
    print("Teff results:")
    print("RVE: ",explained_variance_score(TRUTH_TEFF, PREDS_TEFF))
    print("RMSE: ",root_mean_squared_error(TRUTH_TEFF, PREDS_TEFF))
    corr, pval=pearsonr(TRUTH_TEFF, PREDS_TEFF)
    print("CORR: ",corr)
    print("PVAL: ",pval)
    print("MAX_ER: ",max_error(TRUTH_TEFF, PREDS_TEFF))
    print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_TEFF, PREDS_TEFF))  
    print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_TEFF, PREDS_TEFF))

    print()
    print("log_g results:")
    print("RVE: ",explained_variance_score(TRUTH_LOG_G, PREDS_LOG_G))
    print("RMSE: ",root_mean_squared_error(TRUTH_LOG_G, PREDS_LOG_G))
    corr, pval=pearsonr(TRUTH_LOG_G, PREDS_LOG_G)
    print("CORR: ",corr)
    print("PVAL: ",pval)
    print("MAX_ER: ",max_error(TRUTH_LOG_G, PREDS_LOG_G))
    print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_LOG_G, PREDS_LOG_G))  
    print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_LOG_G, PREDS_LOG_G))

    print()
    print("Radius results")
    print("RVE: ",explained_variance_score(TRUTH_RADIUS, PREDS_RADIUS))
    print("RMSE: ",root_mean_squared_error(TRUTH_RADIUS, PREDS_RADIUS))
    corr, pval=pearsonr(TRUTH_RADIUS, PREDS_RADIUS)
    print("CORR: ",corr)
    print("PVAL: ",pval)
    print("MAX_ER: ",max_error(TRUTH_RADIUS, PREDS_RADIUS))
    print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))  
    print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))

### Linear model

#### Linear regression

In [32]:
print("Base train data :")
Kfold_pipeline(LinearRegression, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(LinearRegression, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(LinearRegression, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.7505690248913951
RMSE:  0.18470291151520093
CORR:  0.866353868207232
PVAL:  0.0
MAX_ER:  1.1548208510435742
MEAN_ABS_ER:  0.12939177829692178
MEDIAN_ABS_ER:  0.09466448752505086

log_g results:
RVE:  0.9895257097668725
RMSE:  0.20215082309657464
CORR:  0.9947490687441936
PVAL:  0.0
MAX_ER:  1.468009235276078
MEAN_ABS_ER:  0.15341554446835742
MEDIAN_ABS_ER:  0.12500796791980606

Radius results
RVE:  1.0
RMSE:  4.013259053614499e-15
CORR:  0.9999999999999998
PVAL:  0.0
MAX_ER:  2.7533531010703882e-14
MEAN_ABS_ER:  2.8386331048872974e-15
MEDIAN_ABS_ER:  1.9984014443252818e-15

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.4977420030246361
RMSE:  0.26209695733675137
CORR:  0.7055083295758279
PVAL:  0.0
MAX_ER:  1.7330458811318352
MEAN_ABS_ER:  0.1983188560099639
MEDIAN_ABS_ER:  0.17545832354355717

log_g results:
RVE:  0.10664254962543018
RMSE:  1.866920583526842
CORR:  0.3265617089100818


#### Ridge

In [33]:
print("Base train data :")
Kfold_pipeline(Ridge, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(Ridge, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(Ridge, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.7505690248852424
RMSE:  0.184702911517479
CORR:  0.8663538681974103
PVAL:  0.0
MAX_ER:  1.1548216769968533
MEAN_ABS_ER:  0.1293917800419144
MEDIAN_ABS_ER:  0.09466453277327691

log_g results:
RVE:  0.9895257097665303
RMSE:  0.20215082309987653
CORR:  0.9947490687438436
PVAL:  0.0
MAX_ER:  1.468008268590653
MEAN_ABS_ER:  0.15341589607827966
MEDIAN_ABS_ER:  0.12500974319784808

Radius results
RVE:  0.9999999999982986
RMSE:  1.3005559461939927e-06
CORR:  0.9999999999999669
PVAL:  0.0
MAX_ER:  4.677766993754773e-06
MEAN_ABS_ER:  1.1006406881480555e-06
MEDIAN_ABS_ER:  1.026443813723077e-06

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.49774200301624816
RMSE:  0.2620969573389399
CORR:  0.7055083295641156
PVAL:  0.0
MAX_ER:  1.7330460079782832
MEAN_ABS_ER:  0.19831886661626733
MEDIAN_ABS_ER:  0.1754583703422934

log_g results:
RVE:  0.10664254962831343
RMSE:  1.86692058352383
CORR:  0.326561

### Decision tree regressor

In [34]:
print("Base train data :")
Kfold_pipeline(DecisionTreeRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(DecisionTreeRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(DecisionTreeRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.9963369587667614
RMSE:  0.022383429772147356
CORR:  0.9981684957287787
PVAL:  0.0
MAX_ER:  1.2852376358233562
MEAN_ABS_ER:  0.0060562596851704694
MEDIAN_ABS_ER:  0.0018348327307042833

log_g results:
RVE:  0.9999779660609207
RMSE:  0.00927188987917026
CORR:  0.9999889831896766
PVAL:  0.0
MAX_ER:  0.8638890521941072
MEAN_ABS_ER:  0.0061400600529025705
MEDIAN_ABS_ER:  0.004010104791665192

Radius results
RVE:  0.9999886696352427
RMSE:  0.0033562554626827613
CORR:  0.9999943348996085
PVAL:  0.0
MAX_ER:  0.12112270844841241
MEAN_ABS_ER:  0.002170530638215426
MEDIAN_ABS_ER:  0.0012869179834849587

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.7697959886168102
RMSE:  0.17744308139891282
CORR:  0.8851053121939393
PVAL:  0.0
MAX_ER:  1.7981556662220504
MEAN_ABS_ER:  0.07431846518487983
MEDIAN_ABS_ER:  0.0024149134864703825

log_g results:
RVE:  0.27277339633466924
RMSE:  1.6844177558714524
COR

### KNeighbours regressor

In [35]:
print("Base train data :")
Kfold_pipeline(KNeighborsRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(KNeighborsRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(KNeighborsRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.9967303181190179
RMSE:  0.021147621504295704
CORR:  0.9983642799868448
PVAL:  0.0
MAX_ER:  1.2816922831887068
MEAN_ABS_ER:  0.003471298455109109
MEDIAN_ABS_ER:  0.0008305218547122184

log_g results:
RVE:  0.9997303393746169
RMSE:  0.032438262217560326
CORR:  0.9998652925900822
PVAL:  0.0
MAX_ER:  1.5455260946025855
MEAN_ABS_ER:  0.01020570687631788
MEDIAN_ABS_ER:  0.00409007042739562

Radius results
RVE:  0.9997297303441615
RMSE:  0.016392994130414913
CORR:  0.9998649139442062
PVAL:  0.0
MAX_ER:  0.7726753237978556
MEAN_ABS_ER:  0.005327280290462802
MEDIAN_ABS_ER:  0.0022363721068851955

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.8806564931006484
RMSE:  0.1278436294846631
CORR:  0.9387039152397779
PVAL:  0.0
MAX_ER:  1.8259602723588482
MEAN_ABS_ER:  0.06023619598134205
MEDIAN_ABS_ER:  0.004888411932765457

log_g results:
RVE:  0.6416988471351245
RMSE:  1.1831498382068169
CORR:  0.80

### Support vector regressor

In [36]:
Kfold_pipeline(SVR)

split 1 

ValueError: y should be a 1d array, got an array of shape (786574, 3) instead.

### Random forest regressor

In [None]:
# très long donc je teste juste sur 1 pour l'instant
# Kfold_pipeline(RandomForestRegressor)
print("Base train data :")
Kfold_pipeline(RandomForestRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(RandomForestRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(RandomForestRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

split 1


KeyboardInterrupt: 

In [17]:
kf = KFold(n_splits=10, shuffle=True, random_state=12)
TRUTH_TEFF=None
TRUTH_LOG_G=None
TRUTH_RADIUS=None
PREDS_TEFF=None
PREDS_LOG_G=None
PREDS_RADIUS=None
counter = 0
print("split", end=' ')
for train_index, test_index in kf.split(X_TRAIN):
    counter += 1
    print(str(counter), end=' ')
    if counter == 2:
        break
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = RandomForestRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)

    if TRUTH_TEFF is None:
        PREDS_TEFF=preds[:, 0]
        TRUTH_TEFF=y_test[:, 0]
    else:
        PREDS_TEFF=np.hstack((PREDS_TEFF, preds[:, 0]))
        TRUTH_TEFF=np.hstack((TRUTH_TEFF, y_test[:, 0]))
    
    if TRUTH_LOG_G is None:
        PREDS_LOG_G=preds[:, 1]
        TRUTH_LOG_G=y_test[:, 1]
    else:
        PREDS_LOG_G=np.hstack((PREDS_LOG_G, preds[:, 1]))
        TRUTH_LOG_G=np.hstack((TRUTH_LOG_G, y_test[:, 1]))

    if TRUTH_RADIUS is None:
        PREDS_RADIUS=preds[:, 2]
        TRUTH_RADIUS=y_test[:, 2]
    else:
        PREDS_RADIUS=np.hstack((PREDS_RADIUS, preds[:, 2]))
        TRUTH_RADIUS=np.hstack((TRUTH_RADIUS, y_test[:, 2]))

print()
print("Teff results:")
print("RVE: ",explained_variance_score(TRUTH_TEFF, PREDS_TEFF))
print("RMSE: ",root_mean_squared_error(TRUTH_TEFF, PREDS_TEFF))
corr, pval=pearsonr(TRUTH_TEFF, PREDS_TEFF)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_TEFF, PREDS_TEFF))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_TEFF, PREDS_TEFF))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_TEFF, PREDS_TEFF))

print()
print("log_g results:")
print("RVE: ",explained_variance_score(TRUTH_LOG_G, PREDS_LOG_G))
print("RMSE: ",root_mean_squared_error(TRUTH_LOG_G, PREDS_LOG_G))
corr, pval=pearsonr(TRUTH_LOG_G, PREDS_LOG_G)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_LOG_G, PREDS_LOG_G))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_LOG_G, PREDS_LOG_G))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_LOG_G, PREDS_LOG_G))

print()
print("Radius results")
print("RVE: ",explained_variance_score(TRUTH_RADIUS, PREDS_RADIUS))
print("RMSE: ",root_mean_squared_error(TRUTH_RADIUS, PREDS_RADIUS))
corr, pval=pearsonr(TRUTH_RADIUS, PREDS_RADIUS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_RADIUS, PREDS_RADIUS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))

split 1 2 
Teff results:
RVE:  0.9974366654590495
RMSE:  0.018708882865537988
CORR:  0.998717686230453
PVAL:  0.0
MAX_ER:  1.2055023924687887
MEAN_ABS_ER:  0.004723230948485085
MEDIAN_ABS_ER:  0.001630549773168788

log_g results:
RVE:  0.9999962666696316
RMSE:  0.0038138378897362825
CORR:  0.9999981333742665
PVAL:  0.0
MAX_ER:  0.19927281550868514
MEAN_ABS_ER:  0.0021318291711662327
MEDIAN_ABS_ER:  0.0012815765233764598

Radius results
RVE:  0.9999991378190917
RMSE:  0.0009245922627300068
CORR:  0.9999995689659968
PVAL:  0.0
MAX_ER:  0.061811721013700804
MEAN_ABS_ER:  0.0005961375876764042
MEDIAN_ABS_ER:  0.00039560649826353433


### Adaboost regressor

In [None]:
Kfold_pipeline(AdaBoostRegressor)

split 1 

ValueError: y should be a 1d array, got an array of shape (786574, 2) instead.

### XGB regressor

In [37]:
print("Base train data :")
Kfold_pipeline(XGBRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(XGBRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(XGBRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.9938256414029675
RMSE:  0.029059924109313726
CORR:  0.9969081244421539
PVAL:  0.0
MAX_ER:  1.1795151117996037
MEAN_ABS_ER:  0.01312410375165058
MEDIAN_ABS_ER:  0.007521086842846136

log_g results:
RVE:  0.999751478233894
RMSE:  0.031138338039316673
CORR:  0.9998757355823482
PVAL:  0.0
MAX_ER:  0.7184015733772533
MEAN_ABS_ER:  0.018706010398244956
MEDIAN_ABS_ER:  0.013142634677956913

Radius results
RVE:  0.9998657929699676
RMSE:  0.011550918246928811
CORR:  0.9999328942386554
PVAL:  0.0
MAX_ER:  0.36922256131425857
MEAN_ABS_ER:  0.004370032304983248
MEDIAN_ABS_ER:  0.0028901474398071447

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Teff results:
RVE:  0.7916642153210531
RMSE:  0.1688031442805466
CORR:  0.8898510651402036
PVAL:  0.0
MAX_ER:  1.4874330276335725
MEAN_ABS_ER:  0.11976801417669664
MEDIAN_ABS_ER:  0.08216887178234722

log_g results:
RVE:  0.5354160707858437
RMSE:  1.346310720959642
CORR:  0.7320

### MLP regressor

In [None]:
print("Base train data :")
Kfold_pipeline(MLPRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(MLPRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(MLPRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 

: 

In [None]:
kf = KFold(n_splits=10, shuffle=True, random_state=12)
TRUTH_MASS=None
TRUTH_RADIUS=None
PREDS_MASS=None
PREDS_RADIUS=None
counter = 0
print("split", end=' ')
for train_index, test_index in kf.split(X_TRAIN):
    counter += 1
    print(str(counter), end=' ')
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = MLPRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)

    if TRUTH_MASS is None:
        PREDS_MASS=preds[:, 0]
        TRUTH_MASS=y_test[:, 0]
    else:
        PREDS_MASS=np.hstack((PREDS_MASS, preds[:, 0]))
        TRUTH_MASS=np.hstack((TRUTH_MASS, y_test[:, 0]))

    if TRUTH_RADIUS is None:
        PREDS_RADIUS=preds[:, 1]
        TRUTH_RADIUS=y_test[:, 1]
    else:
        PREDS_RADIUS=np.hstack((PREDS_RADIUS, preds[:, 1]))
        TRUTH_RADIUS=np.hstack((TRUTH_RADIUS, y_test[:, 1]))
    break

print()
print("Mass results:")
print("RVE: ",explained_variance_score(TRUTH_MASS, PREDS_MASS))
print("RMSE: ",root_mean_squared_error(TRUTH_MASS, PREDS_MASS))
corr, pval=pearsonr(TRUTH_MASS, PREDS_MASS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_MASS, PREDS_MASS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_MASS, PREDS_MASS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_MASS, PREDS_MASS))

print()
print("Radius results")
print("RVE: ",explained_variance_score(TRUTH_RADIUS, PREDS_RADIUS))
print("RMSE: ",root_mean_squared_error(TRUTH_RADIUS, PREDS_RADIUS))
corr, pval=pearsonr(TRUTH_RADIUS, PREDS_RADIUS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_RADIUS, PREDS_RADIUS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))

split 1 
Mass results:
RVE:  0.948873047121406
RMSE:  4.507970784338664
CORR:  0.9743212793829307
PVAL:  0.0
MAX_ER:  126.20152720121959
MEAN_ABS_ER:  1.7504682057433405
MEDIAN_ABS_ER:  0.7501270477934466

Radius results
RVE:  0.996909595709774
RMSE:  0.055344822615252975
CORR:  0.9984587829256812
PVAL:  0.0
MAX_ER:  0.5171337636424453
MEAN_ABS_ER:  0.035280193009108556
MEDIAN_ABS_ER:  0.022433301737730837




## Parameter tuning

In [17]:
# Define the Optuna objective function
def objective(trial):
    # Hyperparameters to tune
    param = {
        'objective': 'reg:squarederror',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }
    
    # Train the XGBoost Classifier
    mdl = XGBRegressor(**param)
    mdl.fit(x_train, y_train, verbose=False)
    
    # Predict on validation set
    y_val_pred = mdl.predict(x_val)
    val_f1 = root_mean_squared_error(y_val, y_val_pred)
    
    return val_f1

x_train, x_val, y_train, y_val = train_test_split(X_TRAIN, y_TRAIN, test_size=0.25, random_state=1337)

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train the final model with the best parameters
mdl = XGBRegressor(
    **best_params
)
mdl.fit(X_TRAIN, y_TRAIN)

# Evaluate on test set
y_IVS_pred = mdl.predict(X_IVS)
TRUTH_TEFF, PREDS_TEFF = y_IVS[:, 0], y_IVS_pred[:, 0]
TRUTH_LOG_G, PREDS_LOG_G = y_IVS[:, 1], y_IVS_pred[:, 1]
TRUTH_RADIUS, PREDS_RADIUS = y_IVS[:, 2], y_IVS_pred[:, 2]

print()
print("Teff results:")
print("RVE: ",explained_variance_score(TRUTH_TEFF, PREDS_TEFF))
print("RMSE: ",root_mean_squared_error(TRUTH_TEFF, PREDS_TEFF))
corr, pval=pearsonr(TRUTH_TEFF, PREDS_TEFF)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_TEFF, PREDS_TEFF))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_TEFF, PREDS_TEFF))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_TEFF, PREDS_TEFF))

print()
print("log_g results:")
print("RVE: ",explained_variance_score(TRUTH_LOG_G, PREDS_LOG_G))
print("RMSE: ",root_mean_squared_error(TRUTH_LOG_G, PREDS_LOG_G))
corr, pval=pearsonr(TRUTH_LOG_G, PREDS_LOG_G)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_LOG_G, PREDS_LOG_G))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_LOG_G, PREDS_LOG_G))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_LOG_G, PREDS_LOG_G))

print()
print("Radius results")
print("RVE: ",explained_variance_score(TRUTH_RADIUS, PREDS_RADIUS))
print("RMSE: ",root_mean_squared_error(TRUTH_RADIUS, PREDS_RADIUS))
corr, pval=pearsonr(TRUTH_RADIUS, PREDS_RADIUS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_RADIUS, PREDS_RADIUS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))

[I 2025-04-14 12:23:52,695] A new study created in memory with name: no-name-9a4e6271-50d5-4863-9bd4-115f1997824a
[I 2025-04-14 12:24:02,838] Trial 0 finished with value: 0.03052093338591357 and parameters: {'max_depth': 4, 'learning_rate': 0.15174219326561025, 'n_estimators': 296, 'subsample': 0.9488565668950056, 'colsample_bytree': 0.8118788537239586}. Best is trial 0 with value: 0.03052093338591357.
[I 2025-04-14 12:24:08,493] Trial 1 finished with value: 0.045013205439555025 and parameters: {'max_depth': 3, 'learning_rate': 0.1264072445952452, 'n_estimators': 179, 'subsample': 0.6235926267992971, 'colsample_bytree': 0.8755250674932091}. Best is trial 0 with value: 0.03052093338591357.
[I 2025-04-14 12:24:18,771] Trial 2 finished with value: 0.025404439183044043 and parameters: {'max_depth': 5, 'learning_rate': 0.21626941500148608, 'n_estimators': 294, 'subsample': 0.5303234290825225, 'colsample_bytree': 0.8223432011673157}. Best is trial 2 with value: 0.025404439183044043.
[I 2025-

Best Hyperparameters: {'max_depth': 10, 'learning_rate': 0.14221957417353837, 'n_estimators': 300, 'subsample': 0.8267801580077724, 'colsample_bytree': 0.8588304017979319}

Teff results:
RVE:  0.9967304222484684
RMSE:  0.021141392635122637
CORR:  0.9983640530679367
PVAL:  0.0
MAX_ER:  0.9988939780952695
MEAN_ABS_ER:  0.007287768402571821
MEDIAN_ABS_ER:  0.0036436087172333487

log_g results:
RVE:  0.9998370053635235
RMSE:  0.025186988687523948
CORR:  0.9999185030862698
PVAL:  0.0
MAX_ER:  0.7504738944807494
MEAN_ABS_ER:  0.011098875394824098
MEDIAN_ABS_ER:  0.007222764739378329

Radius results
RVE:  0.9998122487263185
RMSE:  0.01363908450067097
CORR:  0.9999061223644827
PVAL:  0.0
MAX_ER:  0.37869788005225913
MEAN_ABS_ER:  0.006053665015841827
MEDIAN_ABS_ER:  0.0036936068225239538


In [18]:
mdl = XGBRegressor(max_depth=10, learning_rate=0.14221957417353837, n_estimators=300, subsample=0.8267801580077724, colsample_bytree=0.8588304017979319)
mdl.fit(X, y)

joblib.dump(mdl, 'model/secondary_XGB.pkl')

['model/secondary_XGB.pkl']