## Imports

In [2]:
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import random

In [3]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix,\
                             explained_variance_score, mean_squared_error, max_error, mean_absolute_error,\
                             root_mean_squared_error, median_absolute_error
from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.decomposition import PCA, KernelPCA
# pas besoin de feature selection parce que pas beacoup de colonnes

import optuna

In [4]:
import read_mist_models

from utils import Iso_data_handler

def gaussian(dsts):
    kernel_width = .5
    weights = np.exp(-(dsts**2)/kernel_width)
    return weights

# To find certain rows

def isclose_pandas_apply(row, col_name, value, bool_index, rel_tol=1e-6):
    if math.isclose(row[col_name], value, rel_tol=rel_tol):
        bool_index.append(True)
    else:
        bool_index.append(False)

def isclose_pandas(df, col_name, value, rel_tol=1e-6):
    bool_index = []
    df.apply(isclose_pandas_apply, axis=1, args=(col_name, value, bool_index, rel_tol))
    return bool_index

## Data preparation

In [5]:
iso_handler = Iso_data_handler("data/MIST_v1.2_vvcrit0.0_basic_isos/", 
                              ['log10_isochrone_age_yr', 'log_Teff', 'log_g', 'star_mass', 'phase', 'metallicity', 'log_R'])


In [6]:
iso_df = iso_handler.full_iso_data_to_panda()

Reading dataframe from csv file...


In [7]:
display(iso_df)

Unnamed: 0,log10_isochrone_age_yr,log_Teff,log_g,star_mass,phase,metallicity,log_R
0,5.0,3.486221,3.131342,0.100000,-1.0,-0.25,0.153402
1,5.0,3.487362,3.126808,0.102645,-1.0,-0.25,0.160326
2,5.0,3.489243,3.119367,0.107039,-1.0,-0.25,0.171785
3,5.0,3.491102,3.112165,0.111419,-1.0,-0.25,0.183099
4,5.0,3.492937,3.105143,0.115789,-1.0,-0.25,0.194305
...,...,...,...,...,...,...,...
1467117,10.3,4.402490,7.777159,0.532726,6.0,0.50,-1.806255
1467118,10.3,4.387132,7.783242,0.532730,6.0,0.50,-1.809295
1467119,10.3,4.371789,7.789130,0.532735,6.0,0.50,-1.812237
1467120,10.3,4.356480,7.794844,0.532741,6.0,0.50,-1.815091


In [8]:
# Keeping only the relevant star phases
phase_filtered_iso_df = iso_df\
    .where((iso_df.phase == 0) | (iso_df.phase == 2) | (iso_df.phase == 3) | (iso_df.phase == 4) | (iso_df.phase == 5))\
    .dropna().reset_index(drop=True)

In [9]:
display(phase_filtered_iso_df)

Unnamed: 0,log10_isochrone_age_yr,log_Teff,log_g,star_mass,phase,metallicity,log_R
0,5.0,4.494412,4.346972,13.584360,0.0,-0.25,0.610679
1,5.0,4.497517,4.345776,13.765512,0.0,-0.25,0.614753
2,5.0,4.500556,4.344580,13.942887,0.0,-0.25,0.618755
3,5.0,4.504040,4.343050,14.591712,0.0,-0.25,0.624670
4,5.0,4.507576,4.341483,15.426062,0.0,-0.25,0.631187
...,...,...,...,...,...,...,...
1165292,10.3,3.425746,-0.551440,0.602856,5.0,0.50,2.384899
1165293,10.3,3.426469,-0.560350,0.598549,5.0,0.50,2.387797
1165294,10.3,3.427744,-0.566057,0.594116,5.0,0.50,2.389036
1165295,10.3,3.429413,-0.569225,0.589648,5.0,0.50,2.388981


In [10]:
X = phase_filtered_iso_df.drop(['star_mass', 'phase', 'log_R'], axis=1).to_numpy()
y = phase_filtered_iso_df[['star_mass', 'log_R']].to_numpy()

X_TRAIN, X_IVS, y_TRAIN, y_IVS = train_test_split(X, y, test_size=0.25, random_state=1337)

print(X_TRAIN.shape, X_IVS.shape)
print(y_TRAIN.shape, y_IVS.shape)

(873972, 4) (291325, 4)
(873972, 2) (291325, 2)


In [11]:
print(f"Range in train data for the mass parameter : {min(y_TRAIN[:, 0])} - {max(y_TRAIN[:, 0])}")
print(f"Median value in train data for the mass parameter: {np.median(y_TRAIN[:, 0])}")
print(f"Mean value in train data for the mass parameter: {np.mean(y_TRAIN[:, 0])}")

print(f"Range in test data for the mass parameter : {min(y_IVS[:, 0])} - {max(y_IVS[:, 0])}")
print(f"Median value in test data for the mass parameter: {np.median(y_IVS[:, 0])}")
print(f"Mean value in test data for the mass parameter: {np.mean(y_IVS[:, 0])}")

print()

print(f"Range in train data for the radius parameter : {min(y_TRAIN[:, 1])} - {max(y_TRAIN[:, 1])}")
print(f"Median value in train data for the radius parameter: {np.median(y_TRAIN[:, 1])}")
print(f"Mean value in train data for the radius parameter: {np.mean(y_TRAIN[:, 1])}")

print(f"Range in test data for the radius parameter : {min(y_IVS[:, 1])} - {max(y_IVS[:, 1])}")
print(f"Median value in test data for the radius parameter: {np.median(y_IVS[:, 1])}")
print(f"Mean value in test data for the radius parameter: {np.mean(y_IVS[:, 1])}")


Range in train data for the mass parameter : 0.0999979840073621 - 296.5221171165397
Median value in train data for the mass parameter: 2.0298876569807147
Mean value in train data for the mass parameter: 7.387582424532589
Range in test data for the mass parameter : 0.099998052173157 - 298.5447575808816
Median value in test data for the mass parameter: 2.031987247312461
Mean value in test data for the mass parameter: 7.403392070038953

Range in train data for the radius parameter : -2.085171571669866 - 3.1297545143214007
Median value in train data for the radius parameter: 1.4613315181026039
Mean value in train data for the radius parameter: 1.3545468932779332
Range in test data for the radius parameter : -2.081830985668411 - 3.129269620812593
Median value in test data for the radius parameter: 1.457489869624725
Mean value in test data for the radius parameter: 1.353692007577153


In [12]:
phase_mass_filtered_iso_df = phase_filtered_iso_df.where(phase_filtered_iso_df.star_mass < 30)\
                                                  .dropna()\
                                                  .reset_index(drop=True)

In [13]:
display(phase_mass_filtered_iso_df)

Unnamed: 0,log10_isochrone_age_yr,log_Teff,log_g,star_mass,phase,metallicity,log_R
0,5.0,4.494412,4.346972,13.584360,0.0,-0.25,0.610679
1,5.0,4.497517,4.345776,13.765512,0.0,-0.25,0.614753
2,5.0,4.500556,4.344580,13.942887,0.0,-0.25,0.618755
3,5.0,4.504040,4.343050,14.591712,0.0,-0.25,0.624670
4,5.0,4.507576,4.341483,15.426062,0.0,-0.25,0.631187
...,...,...,...,...,...,...,...
1105728,10.3,3.425746,-0.551440,0.602856,5.0,0.50,2.384899
1105729,10.3,3.426469,-0.560350,0.598549,5.0,0.50,2.387797
1105730,10.3,3.427744,-0.566057,0.594116,5.0,0.50,2.389036
1105731,10.3,3.429413,-0.569225,0.589648,5.0,0.50,2.388981


In [14]:
X_mass = phase_mass_filtered_iso_df.drop(['star_mass', 'phase', 'log_R'], axis=1).to_numpy()
y_mass = phase_mass_filtered_iso_df[['star_mass', 'log_R']].to_numpy()

X_TRAIN_mass, X_IVS_mass, y_TRAIN_mass, y_IVS_mass = train_test_split(X_mass, y_mass, test_size=0.25, random_state=1337)

print(X_TRAIN_mass.shape, X_IVS_mass.shape)
print(y_TRAIN_mass.shape, y_IVS_mass.shape)

(829299, 4) (276434, 4)
(829299, 2) (276434, 2)


## PCA data preparation

In [27]:
pca = PCA(n_components=4) # maybe try with less or more components
pca.fit(X_TRAIN)
tve=0
for i, ve in enumerate(pca.explained_variance_ratio_):
    tve+=ve
    print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i, ve, tve) )
print()

PC0 - Variance explained:  0.5934 - Total Variance:  0.5934
PC1 - Variance explained:  0.2337 - Total Variance:  0.8271
PC2 - Variance explained:  0.1685 - Total Variance:  0.9955
PC3 - Variance explained:  0.0045 - Total Variance:  1.0000



In [28]:
X_TRAIN_PCA=pca.transform(X_TRAIN)
X_IVS_PCA=pca.transform(X_IVS)
print(X_TRAIN_PCA.shape)

(873972, 4)


## Model training

In [17]:

# TODO faire que la fonction puisse accepter autant d'output qu'on veut
# TODO mettre les outputs du modèle dans un fichier csv pour utiliser plus tard
# TODO mettre la possibilité de rajouter des paramètres à tester dans le modèle
# TODO rajouter le calcul du temps et le rajouter dans le csv

def Kfold_pipeline(model, x_train_data=X_TRAIN, y_train_data=y_TRAIN, filename="", n_splits=10, shuffle=True):
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=12)
    TRUTH_MASS=None
    TRUTH_RADIUS=None
    PREDS_MASS=None
    PREDS_RADIUS=None
    counter = 0
    print("split", end=' ')
    for train_index, test_index in kf.split(x_train_data):
        counter += 1
        print(str(counter), end=' ')
        X_train, X_test = x_train_data[train_index], x_train_data[test_index]
        y_train, y_test = y_train_data[train_index], y_train_data[test_index]

        mdl = model()
        mdl.fit(X_train, y_train)
        preds = mdl.predict(X_test)

        if TRUTH_MASS is None:
            PREDS_MASS=preds[:, 0]
            TRUTH_MASS=y_test[:, 0]
        else:
            PREDS_MASS=np.hstack((PREDS_MASS, preds[:, 0]))
            TRUTH_MASS=np.hstack((TRUTH_MASS, y_test[:, 0]))

        if TRUTH_RADIUS is None:
            PREDS_RADIUS=preds[:, 1]
            TRUTH_RADIUS=y_test[:, 1]
        else:
            PREDS_RADIUS=np.hstack((PREDS_RADIUS, preds[:, 1]))
            TRUTH_RADIUS=np.hstack((TRUTH_RADIUS, y_test[:, 1]))

    print()
    print("Mass results:")
    print("RVE: ",explained_variance_score(TRUTH_MASS, PREDS_MASS))
    print("RMSE: ",root_mean_squared_error(TRUTH_MASS, PREDS_MASS))
    corr, pval=pearsonr(TRUTH_MASS, PREDS_MASS)
    print("CORR: ",corr)
    print("PVAL: ",pval)
    print("MAX_ER: ",max_error(TRUTH_MASS, PREDS_MASS))
    print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_MASS, PREDS_MASS))  
    print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_MASS, PREDS_MASS))

    print()
    print("Radius results")
    print("RVE: ",explained_variance_score(TRUTH_RADIUS, PREDS_RADIUS))
    print("RMSE: ",root_mean_squared_error(TRUTH_RADIUS, PREDS_RADIUS))
    corr, pval=pearsonr(TRUTH_RADIUS, PREDS_RADIUS)
    print("CORR: ",corr)
    print("PVAL: ",pval)
    print("MAX_ER: ",max_error(TRUTH_RADIUS, PREDS_RADIUS))
    print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))  
    print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))

### Linear model

#### Linear regression

In [51]:
print("Base train data :")
Kfold_pipeline(LinearRegression, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(LinearRegression, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(LinearRegression, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.30901687258498745
RMSE:  16.8380323132355
CORR:  0.5558928608298248
PVAL:  0.0
MAX_ER:  261.34078636747347
MEAN_ABS_ER:  7.338182392289765
MEDIAN_ABS_ER:  3.8258231429360032

Radius results
RVE:  0.9873685426795851
RMSE:  0.11206128690888313
CORR:  0.9936642001601586
PVAL:  0.0
MAX_ER:  0.6588808734804275
MEAN_ABS_ER:  0.08048427077094718
MEDIAN_ABS_ER:  0.05786574271786016

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.5878738610825109
RMSE:  3.232946777342243
CORR:  0.7667293271441842
PVAL:  0.0
MAX_ER:  21.097567878125897
MEAN_ABS_ER:  1.9929938203460211
MEDIAN_ABS_ER:  1.1574677953336232

Radius results
RVE:  0.9906064628081043
RMSE:  0.09825720028390682
CORR:  0.9952921494758149
PVAL:  0.0
MAX_ER:  0.548136971817978
MEAN_ABS_ER:  0.07538333570188119
MEDIAN_ABS_ER:  0.06126458082042363

PCA train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.25338992107059666
RMSE:  17.5

#### Ridge

In [52]:
print("Base train data :")
Kfold_pipeline(Ridge, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(Ridge, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(Ridge, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.3090168726744542
RMSE:  16.838032312145423
CORR:  0.5558928608350088
PVAL:  0.0
MAX_ER:  261.3410094433783
MEAN_ABS_ER:  7.338176156303839
MEDIAN_ABS_ER:  3.825916383563096

Radius results
RVE:  0.9873685426773078
RMSE:  0.11206128691898454
CORR:  0.9936642001595695
PVAL:  0.0
MAX_ER:  0.6588639674529206
MEAN_ABS_ER:  0.08048433522428677
MEDIAN_ABS_ER:  0.057865452478165436

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.5878738610661661
RMSE:  3.232946777406351
CORR:  0.7667293271201951
PVAL:  0.0
MAX_ER:  21.097526006072854
MEAN_ABS_ER:  1.9929926074149598
MEDIAN_ABS_ER:  1.157463451570829

Radius results
RVE:  0.9906064628035638
RMSE:  0.09825720030765356
CORR:  0.9952921494738379
PVAL:  0.0
MAX_ER:  0.5481215560886592
MEAN_ABS_ER:  0.07538338828372189
MEDIAN_ABS_ER:  0.06126440051464921

PCA train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.25338992107358926
RMSE:  17.5

### Decision tree regressor

In [53]:
print("Base train data :")
Kfold_pipeline(DecisionTreeRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(DecisionTreeRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(DecisionTreeRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.995334971891372
RMSE:  1.3835178387671911
CORR:  0.9976663127576462
PVAL:  0.0
MAX_ER:  174.26359172346247
MEAN_ABS_ER:  0.1310991981424897
MEDIAN_ABS_ER:  0.005558587737977239

Radius results
RVE:  0.9998065615405345
RMSE:  0.013867687233684022
CORR:  0.9999033125376784
PVAL:  0.0
MAX_ER:  1.519566343792953
MEAN_ABS_ER:  0.007206643770715822
MEDIAN_ABS_ER:  0.003578188648193037

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.999094357687175
RMSE:  0.15155303690909386
CORR:  0.9995472657978013
PVAL:  0.0
MAX_ER:  12.538444054534796
MEAN_ABS_ER:  0.03833636327145334
MEDIAN_ABS_ER:  0.005162312018770265

Radius results
RVE:  0.9998219083684112
RMSE:  0.013529336048955646
CORR:  0.9999109866454536
PVAL:  0.0
MAX_ER:  0.5508167849818763
MEAN_ABS_ER:  0.007077638363063763
MEDIAN_ABS_ER:  0.0034907390198903876

PCA train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.978770470543624

### KNeighbours regressor

In [54]:
print("Base train data :")
Kfold_pipeline(KNeighborsRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(KNeighborsRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(KNeighborsRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.9904188733887882
RMSE:  1.9829555118612112
CORR:  0.9952988225628827
PVAL:  0.0
MAX_ER:  127.52316531624926
MEAN_ABS_ER:  0.2050401600882721
MEDIAN_ABS_ER:  0.014054829272664437

Radius results
RVE:  0.9998512272018272
RMSE:  0.012162112776814298
CORR:  0.999925623194784
PVAL:  0.0
MAX_ER:  0.29422562882024195
MEAN_ABS_ER:  0.006446601792812023
MEDIAN_ABS_ER:  0.003103084884320595

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.9987830576815646
RMSE:  0.17569088492991541
CORR:  0.9993955450415333
PVAL:  0.0
MAX_ER:  7.384421706285124
MEAN_ABS_ER:  0.05793771996084051
MEDIAN_ABS_ER:  0.012992508308667494

Radius results
RVE:  0.999859945170533
RMSE:  0.011998667335047529
CORR:  0.9999299834445969
PVAL:  0.0
MAX_ER:  0.2800961543697973
MEAN_ABS_ER:  0.006372829293744965
MEDIAN_ABS_ER:  0.003069220070010436

PCA train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.979438047579148

### Support vector regressor

In [55]:
Kfold_pipeline(SVR)

split 1


ValueError: y should be a 1d array, got an array of shape (786574, 2) instead.

### Random forest regressor

In [None]:
# très long donc je teste juste sur 1 pour l'instant
# Kfold_pipeline(RandomForestRegressor)
print("Base train data :")
Kfold_pipeline(RandomForestRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(RandomForestRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(RandomForestRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

split 1


KeyboardInterrupt: 

In [69]:
kf = KFold(n_splits=10, shuffle=True, random_state=12)
TRUTH_MASS=None
TRUTH_RADIUS=None
PREDS_MASS=None
PREDS_RADIUS=None
counter = 0
print("split", end=' ')
for train_index, test_index in kf.split(X_TRAIN):
    counter += 1
    print(str(counter), end=' ')
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = RandomForestRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)

    if TRUTH_MASS is None:
        PREDS_MASS=preds[:, 0]
        TRUTH_MASS=y_test[:, 0]
    else:
        PREDS_MASS=np.hstack((PREDS_MASS, preds[:, 0]))
        TRUTH_MASS=np.hstack((TRUTH_MASS, y_test[:, 0]))

    if TRUTH_RADIUS is None:
        PREDS_RADIUS=preds[:, 1]
        TRUTH_RADIUS=y_test[:, 1]
    else:
        PREDS_RADIUS=np.hstack((PREDS_RADIUS, preds[:, 1]))
        TRUTH_RADIUS=np.hstack((TRUTH_RADIUS, y_test[:, 1]))
    break

print()
print("Mass results:")
print("RVE: ",explained_variance_score(TRUTH_MASS, PREDS_MASS))
print("RMSE: ",root_mean_squared_error(TRUTH_MASS, PREDS_MASS))
corr, pval=pearsonr(TRUTH_MASS, PREDS_MASS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_MASS, PREDS_MASS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_MASS, PREDS_MASS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_MASS, PREDS_MASS))

print()
print("Radius results")
print("RVE: ",explained_variance_score(TRUTH_RADIUS, PREDS_RADIUS))
print("RMSE: ",root_mean_squared_error(TRUTH_RADIUS, PREDS_RADIUS))
corr, pval=pearsonr(TRUTH_RADIUS, PREDS_RADIUS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_RADIUS, PREDS_RADIUS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))

split 1 
Mass results:
RVE:  0.9979971612522945
RMSE:  0.8897723222979466
CORR:  0.9989990388735349
PVAL:  0.0
MAX_ER:  95.13832901219004
MEAN_ABS_ER:  0.09816245592922257
MEDIAN_ABS_ER:  0.007945579678619863

Radius results
RVE:  0.9999162533559964
RMSE:  0.009111516631795574
CORR:  0.9999581343850581
PVAL:  0.0
MAX_ER:  0.17793452382785047
MEAN_ABS_ER:  0.004513832099154549
MEDIAN_ABS_ER:  0.001878066205547424


### Adaboost regressor

In [None]:
Kfold_pipeline(AdaBoostRegressor)

split 1 

ValueError: y should be a 1d array, got an array of shape (786574, 2) instead.

### XGB regressor

In [56]:
print("Base train data :")
Kfold_pipeline(XGBRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(XGBRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(XGBRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.9928464903002319
RMSE:  1.7132360835078733
CORR:  0.9964180011819096
PVAL:  0.0
MAX_ER:  110.18805938646284
MEAN_ABS_ER:  0.3946311668949472
MEDIAN_ABS_ER:  0.104163468846735

Radius results
RVE:  0.9994233045747828
RMSE:  0.02394430071401917
CORR:  0.9997116187617063
PVAL:  0.0
MAX_ER:  0.5126239784743876
MEAN_ABS_ER:  0.015282643275083123
MEDIAN_ABS_ER:  0.011229646007128169

No massive stars train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.9976024472038579
RMSE:  0.24658551889858976
CORR:  0.998800526603412
PVAL:  0.0
MAX_ER:  6.6038831100335535
MEAN_ABS_ER:  0.12457318504025985
MEDIAN_ABS_ER:  0.06460424986924651

Radius results
RVE:  0.999515025535636
RMSE:  0.02232591373764753
CORR:  0.999757486103044
PVAL:  0.0
MAX_ER:  0.5735360958383472
MEAN_ABS_ER:  0.014035917675008317
MEDIAN_ABS_ER:  0.01025726482890743

PCA train data :
split 1 2 3 4 5 6 7 8 9 10 
Mass results:
RVE:  0.9552884539838402
RMSE: 

### MLP regressor

In [17]:
print("Base train data :")
Kfold_pipeline(MLPRegressor, x_train_data=X_TRAIN, y_train_data=y_TRAIN)
print("\nNo massive stars train data :")
Kfold_pipeline(MLPRegressor, x_train_data=X_TRAIN_mass, y_train_data=y_TRAIN_mass)
print("\nPCA train data :")
Kfold_pipeline(MLPRegressor, x_train_data=X_TRAIN_PCA, y_train_data=y_TRAIN)

Base train data :
split 1 

: 

In [71]:
kf = KFold(n_splits=10, shuffle=True, random_state=12)
TRUTH_MASS=None
TRUTH_RADIUS=None
PREDS_MASS=None
PREDS_RADIUS=None
counter = 0
print("split", end=' ')
for train_index, test_index in kf.split(X_TRAIN):
    counter += 1
    print(str(counter), end=' ')
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = MLPRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)

    if TRUTH_MASS is None:
        PREDS_MASS=preds[:, 0]
        TRUTH_MASS=y_test[:, 0]
    else:
        PREDS_MASS=np.hstack((PREDS_MASS, preds[:, 0]))
        TRUTH_MASS=np.hstack((TRUTH_MASS, y_test[:, 0]))

    if TRUTH_RADIUS is None:
        PREDS_RADIUS=preds[:, 1]
        TRUTH_RADIUS=y_test[:, 1]
    else:
        PREDS_RADIUS=np.hstack((PREDS_RADIUS, preds[:, 1]))
        TRUTH_RADIUS=np.hstack((TRUTH_RADIUS, y_test[:, 1]))
    break

print()
print("Mass results:")
print("RVE: ",explained_variance_score(TRUTH_MASS, PREDS_MASS))
print("RMSE: ",root_mean_squared_error(TRUTH_MASS, PREDS_MASS))
corr, pval=pearsonr(TRUTH_MASS, PREDS_MASS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_MASS, PREDS_MASS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_MASS, PREDS_MASS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_MASS, PREDS_MASS))

print()
print("Radius results")
print("RVE: ",explained_variance_score(TRUTH_RADIUS, PREDS_RADIUS))
print("RMSE: ",root_mean_squared_error(TRUTH_RADIUS, PREDS_RADIUS))
corr, pval=pearsonr(TRUTH_RADIUS, PREDS_RADIUS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_RADIUS, PREDS_RADIUS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))

split 1 
Mass results:
RVE:  0.948873047121406
RMSE:  4.507970784338664
CORR:  0.9743212793829307
PVAL:  0.0
MAX_ER:  126.20152720121959
MEAN_ABS_ER:  1.7504682057433405
MEDIAN_ABS_ER:  0.7501270477934466

Radius results
RVE:  0.996909595709774
RMSE:  0.055344822615252975
CORR:  0.9984587829256812
PVAL:  0.0
MAX_ER:  0.5171337636424453
MEAN_ABS_ER:  0.035280193009108556
MEDIAN_ABS_ER:  0.022433301737730837




## Parameter tuning

In [19]:
print(y_IVS)

[[ 0.34542254 -0.51293473]
 [ 0.69656751  1.70131578]
 [ 1.0497905   2.35969294]
 ...
 [ 2.29980154  2.2158493 ]
 [24.64651516  1.43565778]
 [ 0.73709532 -0.19643211]]


In [24]:
# Define the Optuna objective function
def objective(trial):
    # Hyperparameters to tune
    param = {
        'objective': 'reg:squarederror',
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }
    
    # Train the XGBoost Classifier
    mdl = XGBRegressor(**param)
    mdl.fit(x_train, y_train, verbose=False)
    
    # Predict on validation set
    y_val_pred = mdl.predict(x_val)
    val_f1 = root_mean_squared_error(y_val, y_val_pred)
    
    return val_f1

x_train, x_val, y_train, y_val = train_test_split(X_TRAIN, y_TRAIN, test_size=0.25, random_state=1337)

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

# Best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train the final model with the best parameters
mdl = XGBRegressor(
    **best_params
)
mdl.fit(x_train, y_train)

# Evaluate on test set
y_IVS_pred = mdl.predict(X_IVS)
print(y_IVS_pred)
TRUTH_MASS, PREDS_MASS = y_IVS[:, 0], y_IVS_pred[:, 0]
TRUTH_RADIUS, PREDS_RADIUS = y_IVS[:, 1], y_IVS_pred[:, 1]

print()
print("Mass results:")
print("RVE: ",explained_variance_score(TRUTH_MASS, PREDS_MASS))
print("RMSE: ",root_mean_squared_error(TRUTH_MASS, PREDS_MASS))
corr, pval=pearsonr(TRUTH_MASS, PREDS_MASS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_MASS, PREDS_MASS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_MASS, PREDS_MASS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_MASS, PREDS_MASS))

print()
print("Radius results")
print("RVE: ",explained_variance_score(TRUTH_RADIUS, PREDS_RADIUS))
print("RMSE: ",root_mean_squared_error(TRUTH_RADIUS, PREDS_RADIUS))
corr, pval=pearsonr(TRUTH_RADIUS, PREDS_RADIUS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH_RADIUS, PREDS_RADIUS))
print("MEAN_ABS_ER: ",mean_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))  
print("MEDIAN_ABS_ER: ",median_absolute_error(TRUTH_RADIUS, PREDS_RADIUS))

[I 2025-04-13 16:26:36,426] A new study created in memory with name: no-name-a29d6bb2-9d15-4913-919f-ae1185a94bc9
[I 2025-04-13 16:26:41,148] Trial 0 finished with value: 1.1338571298779923 and parameters: {'max_depth': 7, 'learning_rate': 0.07904495783826478, 'n_estimators': 167, 'subsample': 0.6347164819403, 'colsample_bytree': 0.9483718690235952}. Best is trial 0 with value: 1.1338571298779923.
[I 2025-04-13 16:26:43,701] Trial 1 finished with value: 3.8798413309648865 and parameters: {'max_depth': 3, 'learning_rate': 0.04246000624306239, 'n_estimators': 118, 'subsample': 0.6615044507935806, 'colsample_bytree': 0.5139746246898538}. Best is trial 0 with value: 1.1338571298779923.
[I 2025-04-13 16:26:50,445] Trial 2 finished with value: 0.9495050266428594 and parameters: {'max_depth': 10, 'learning_rate': 0.07083499029475966, 'n_estimators': 177, 'subsample': 0.8978190396679475, 'colsample_bytree': 0.9817131339195029}. Best is trial 2 with value: 0.9495050266428594.
[I 2025-04-13 16:2

Best Hyperparameters: {'max_depth': 9, 'learning_rate': 0.29756705156191526, 'n_estimators': 252, 'subsample': 0.8981926120724242, 'colsample_bytree': 0.8552677784958321}
[[ 0.3759572  -0.51608884]
 [ 0.6935576   1.6983844 ]
 [ 1.0266403   2.3358314 ]
 ...
 [ 2.3202796   2.2048564 ]
 [24.61057     1.4211148 ]
 [ 0.6250673  -0.18477708]]

Mass results:
RVE:  0.9946172685683172
RMSE:  1.4908087683684956
CORR:  0.9973051456195752
PVAL:  0.0
MAX_ER:  108.18199747921551
MEAN_ABS_ER:  0.28275510800128145
MEDIAN_ABS_ER:  0.06869179918591706

Radius results
RVE:  0.9995277329694996
RMSE:  0.021631480500297073
CORR:  0.9997638496787923
PVAL:  0.0
MAX_ER:  0.5458927354442795
MEAN_ABS_ER:  0.012513408563979235
MEDIAN_ABS_ER:  0.008368254893720728
