In [None]:
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


import astropy as ap
from astropy.table import QTable


from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
#from xgboost import XGBRegressor

from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix,\
                             explained_variance_score, mean_squared_error, max_error, mean_absolute_error,\
                             root_mean_squared_error, median_absolute_error
from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.decomposition import PCA, KernelPCA
# pas besoin de feature selection parce que pas beacoup de colonnes

import optuna

import read_mist_models

def gaussian(dsts):
    kernel_width = .5
    weights = np.exp(-(dsts**2)/kernel_width)
    return weights

# To find certain rows

def isclose_pandas_apply(row, col_name, value, bool_index, rel_tol=1e-6):
    if math.isclose(row[col_name], value, rel_tol=rel_tol):
        bool_index.append(True)
    else:
        bool_index.append(False)

def isclose_pandas(df, col_name, value, rel_tol=1e-6):
    bool_index = []
    df.apply(isclose_pandas_apply, axis=1, args=(col_name, value, bool_index, rel_tol))
    return bool_index

  from .autonotebook import tqdm as notebook_tqdm


## Data preparation

In [4]:
def get_iso_data_panda(file):
    iso = read_mist_models.ISO(file)

    age = []
    logTeff = []
    logL = []
    log_g = []
    mass = []
    logR = []
    for iso_ind in range(len(iso.isos)):
        age.extend(iso.isos[iso_ind]['log10_isochrone_age_yr'])
        logTeff.extend(iso.isos[iso_ind]['log_Teff'])
        log_g.extend(iso.isos[iso_ind]['log_g'])
        mass.extend(iso.isos[iso_ind]['star_mass'])
        
        logR.extend(iso.isos[iso_ind]['log_R'])
        logL.extend(iso.isos[iso_ind]['log_L'])

    iso_data_all = QTable([age, logTeff, log_g, mass, logL, logR],
                        names=('age', 'logTeff', 'log_g', "mass", "logL", "logR"),
                        meta={'name': "iso data"})

    # iso_data_all = QTable([age, logTeff, log_g, mass],
    #                     names=('age', 'logTeff', 'log_g', "mass"),
    #                     meta={'name': "iso data"})
    
    # print(iso_data_all.info)
    # print(iso_data_all)
    # display(iso_data_all.to_pandas())
    return iso_data_all.to_pandas()

In [5]:
full_data = get_iso_data_panda("data/MIST_v1.2_vvcrit0.0_basic_isos/MIST_v1.2_feh_p0.00_afe_p0.0_vvcrit0.0_basic.txt")

Reading in: data/MIST_v1.2_vvcrit0.0_basic_isos/MIST_v1.2_feh_p0.00_afe_p0.0_vvcrit0.0_basic.txt


In [6]:
display(full_data)

Unnamed: 0,age,logTeff,log_g,mass,logL,logR
0,5.0,3.468541,3.116651,0.100000,-0.849695,0.160747
1,5.0,3.469176,3.114042,0.101392,-0.839643,0.164503
2,5.0,3.471116,3.106133,0.105660,-0.808944,0.175973
3,5.0,3.473035,3.098417,0.109915,-0.778564,0.187325
4,5.0,3.474944,3.090875,0.114180,-0.748368,0.198605
...,...,...,...,...,...,...
103984,10.3,4.370643,7.779334,0.528715,-1.180793,-1.809005
103985,10.3,4.363336,7.782175,0.528717,-1.212858,-1.810425
103986,10.3,4.356026,7.784962,0.528720,-1.244886,-1.811817
103987,10.3,4.348711,7.787695,0.528724,-1.276875,-1.813182


In [78]:
display(full_data)

Unnamed: 0,age,logTeff,log_g,mass,logL,logR
0,5.0,3.468541,3.116651,0.100000,-0.849695,0.160747
1,5.0,3.469176,3.114042,0.101392,-0.839643,0.164503
2,5.0,3.471116,3.106133,0.105660,-0.808944,0.175973
3,5.0,3.473035,3.098417,0.109915,-0.778564,0.187325
4,5.0,3.474944,3.090875,0.114180,-0.748368,0.198605
...,...,...,...,...,...,...
103984,10.3,4.370643,7.779334,0.528715,-1.180793,-1.809005
103985,10.3,4.363336,7.782175,0.528717,-1.212858,-1.810425
103986,10.3,4.356026,7.784962,0.528720,-1.244886,-1.811817
103987,10.3,4.348711,7.787695,0.528724,-1.276875,-1.813182


In [7]:
no_massive_stars_data = full_data.where(full_data.mass < 5).dropna().reset_index(drop=True)

In [8]:
X = no_massive_stars_data.drop(['mass'], axis=1).to_numpy()
y = no_massive_stars_data['mass'].to_numpy()

X_TRAIN, X_IVS, y_TRAIN, y_IVS = train_test_split(X, y, test_size=0.25, random_state=1337)

print(X_TRAIN.shape, X_IVS.shape)
print(y_TRAIN.shape, y_IVS.shape)

(61773, 5) (20592, 5)
(61773,) (20592,)


**Meilleurs résultats avec plus de colonnes**

Utiliser toutes les colonnes au final?

## Linear model

### Linear regression

In [23]:
kf = KFold(n_splits=5, shuffle=True)
TRUTH=None
PREDS=None
for train_index, test_index in kf.split(X_TRAIN):
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = LinearRegression().fit(X_train, y_train)#Lasso(alpha=0.0001, max_iter=100000).fit(X_train, y_train)#Ridge(alpha=0.0001, max_iter=9999999).fit(X_train, y_train)#LinearRegression().fit(X_train, y_train)
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    if TRUTH is None:
        PREDS=preds
        TRUTH=y_test
    else:
        PREDS=np.hstack((PREDS, preds))
        TRUTH=np.hstack((TRUTH, y_test))

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))  

RVE:  0.8829903775477418
RMSE:  0.40209921133149557
CORR:  0.9396756771037909
PVAL:  0.0
MAX_ER:  1.9680503572451975
MEAN_ER:  0.29824906800002177


### Ridge

In [24]:
kf = KFold(n_splits=5, shuffle=True)
TRUTH=None
PREDS=None
for train_index, test_index in kf.split(X_TRAIN):
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = Ridge(alpha=0.1).fit(X_train, y_train)
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    if TRUTH is None:
        PREDS=preds
        TRUTH=y_test
    else:
        PREDS=np.hstack((PREDS, preds))
        TRUTH=np.hstack((TRUTH, y_test))

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))  

RVE:  0.8830023921067793
RMSE:  0.40207852987016124
CORR:  0.9396820701386172
PVAL:  0.0
MAX_ER:  1.9711124934287116
MEAN_ER:  0.2981935711506235


## Decision tree regressor

In [25]:
RVEs, RMSEs, CORRs,PVALs, MAXERRs, MEANABSERRs = [],[],[],[],[],[]

depthsteps, minsamplesteps = [2,5,10,15,20,50,100], [1,2,5,10,15,20,50]

for i in depthsteps:
    for j in minsamplesteps:
        kf = KFold(n_splits=5, shuffle=True)
        TRUTH=None
        PREDS=None
        for train_index, test_index in kf.split(X_TRAIN):
            X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
            y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

            mdl = DecisionTreeRegressor(max_depth=i, min_samples_leaf=j)
            mdl.fit(X_train, y_train)
            preds = mdl.predict(X_test)
            if TRUTH is None:
                PREDS=preds
                TRUTH=y_test
            else:
                PREDS=np.hstack((PREDS, preds))
                TRUTH=np.hstack((TRUTH, y_test))

        RVEs.append(explained_variance_score(TRUTH, PREDS))
        RMSEs.append(root_mean_squared_error(TRUTH, PREDS))
        corr, pval=pearsonr(TRUTH, PREDS)
        CORRs.append(corr)
        PVALs.append(pval)
        MAXERRs.append(max_error(TRUTH, PREDS))
        MEANABSERRs.append(mean_absolute_error(TRUTH, PREDS)) 

In [26]:
dt_RVEs_np, dt_CORRs_np, dt_MAEs_np, dt_RMSEs_np, dt_MAXERRs_np = np.array(RVEs).reshape(7,7),np.array(CORRs).reshape(7,7),np.array(MEANABSERRs).reshape(7,7), np.array(RMSEs).reshape(7,7), np.array(MAXERRs).reshape(7,7)

df_dt_RVE = pd.DataFrame(dt_RVEs_np, columns = minsamplesteps, index = depthsteps).round(4)
df_dt_CORR = pd.DataFrame(dt_CORRs_np, columns = minsamplesteps, index = depthsteps).round(4)
df_dt_MAE = pd.DataFrame(dt_MAEs_np, columns = minsamplesteps, index = depthsteps).round(4)
df_dt_RMSE = pd.DataFrame(dt_RMSEs_np, columns = minsamplesteps, index = depthsteps).round(4)
df_dt_MAXERR = pd.DataFrame(dt_MAXERRs_np, columns = minsamplesteps, index = depthsteps).round(4)

print("RVE: \n",df_dt_RVE)
print("\nCORR Score: \n",df_dt_CORR)
print("\nMAE: \n",df_dt_MAE)
print("\nRMSE: \n",df_dt_RMSE)
print("\nMAXERR: \n",df_dt_MAXERR)

RVE: 
          1       2       5       10      15      20      50
2    0.5920  0.5932  0.5926  0.5919  0.5925  0.5928  0.5936
5    0.8019  0.8045  0.8048  0.8040  0.8050  0.8031  0.8034
10   0.9775  0.9771  0.9773  0.9770  0.9776  0.9773  0.9761
15   0.9964  0.9958  0.9957  0.9956  0.9948  0.9939  0.9908
20   0.9979  0.9979  0.9976  0.9967  0.9958  0.9951  0.9910
50   0.9983  0.9980  0.9972  0.9969  0.9949  0.9952  0.9915
100  0.9983  0.9982  0.9973  0.9968  0.9956  0.9946  0.9912

CORR Score: 
          1       2       5       10      15      20      50
2    0.7694  0.7702  0.7698  0.7693  0.7697  0.7699  0.7705
5    0.8955  0.8969  0.8971  0.8966  0.8972  0.8962  0.8963
10   0.9887  0.9885  0.9886  0.9884  0.9887  0.9886  0.9880
15   0.9982  0.9979  0.9979  0.9978  0.9974  0.9969  0.9954
20   0.9990  0.9990  0.9988  0.9983  0.9979  0.9975  0.9955
50   0.9991  0.9990  0.9986  0.9985  0.9975  0.9976  0.9957
100  0.9992  0.9991  0.9986  0.9984  0.9978  0.9973  0.9956

MAE: 
          1

## KNeighbours regressor

In [27]:
RVEs, RMSEs, CORRs,PVALs, MAXERRs, MEANABSERRs = [],[],[],[],[],[]

n_neighbours, weight, metric = [2,5,10,15,20,50,100], ["uniform", "distance"], ["minkowski", gaussian]

for j in weight:
    for k in metric:
        for i in n_neighbours:
            kf = KFold(n_splits=5, shuffle=True)
            TRUTH=None
            PREDS=None
            for train_index, test_index in kf.split(X_TRAIN):
                X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
                y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]
    
                mdl = KNeighborsRegressor(n_neighbors=i, weights=j)
                mdl.fit(X_train, y_train)
                preds = mdl.predict(X_test)
                if TRUTH is None:
                    PREDS=preds
                    TRUTH=y_test
                else:
                    PREDS=np.hstack((PREDS, preds))
                    TRUTH=np.hstack((TRUTH, y_test))
    
            RVEs.append(explained_variance_score(TRUTH, PREDS))
            RMSEs.append(root_mean_squared_error(TRUTH, PREDS))
            corr, pval=pearsonr(TRUTH, PREDS)
            CORRs.append(corr)
            PVALs.append(pval)
            MAXERRs.append(max_error(TRUTH, PREDS))
            MEANABSERRs.append(mean_absolute_error(TRUTH, PREDS))

In [28]:
KNN_RVEs_np, KNN_CORRs_np, KNN_MAEs_np, KNN_RMSEs_np, KNN_MAXERRs_np = np.array(RVEs).reshape(2,2,7),np.array(CORRs).reshape(2,2,7),np.array(MEANABSERRs).reshape(2,2,7), np.array(RMSEs).reshape(2,2,7), np.array(MAXERRs).reshape(2,2,7)

df_KNN_RVE_uniform = pd.DataFrame(KNN_RVEs_np[0], columns = n_neighbours, index = metric).round(4)
df_KNN_RVE_distance = pd.DataFrame(KNN_RVEs_np[1], columns = n_neighbours, index = metric).round(4)
df_KNN_CORR_uniform = pd.DataFrame(KNN_CORRs_np[0], columns = n_neighbours, index = metric).round(4)
df_KNN_CORR_distance = pd.DataFrame(KNN_CORRs_np[1], columns = n_neighbours, index = metric).round(4)
df_KNN_MAE_uniform = pd.DataFrame(KNN_MAEs_np[0], columns = n_neighbours, index = metric).round(4)
df_KNN_MAE_distance = pd.DataFrame(KNN_MAEs_np[1], columns = n_neighbours, index = metric).round(4)
df_KNN_RMSE_uniform = pd.DataFrame(KNN_RMSEs_np[0], columns = n_neighbours, index = metric).round(4)
df_KNN_RMSE_distance = pd.DataFrame(KNN_RMSEs_np[1], columns = n_neighbours, index = metric).round(4)
df_KNN_MAXERR_uniform = pd.DataFrame(KNN_MAXERRs_np[0], columns = n_neighbours, index = metric).round(4)
df_KNN_MAXERR_distance = pd.DataFrame(KNN_MAXERRs_np[1], columns = n_neighbours, index = metric).round(4)

display("RVE: uniform",df_KNN_RVE_uniform)
display("distance",df_KNN_RVE_uniform)
display("CORR Score: uniform",df_KNN_CORR_uniform)
display("distance",df_KNN_CORR_distance)
display("MAE: uniform",df_KNN_MAE_uniform)
display("distance",df_KNN_MAE_distance)
display("RMSE: uniform",df_KNN_RMSE_uniform)
display("distance",df_KNN_RMSE_distance)
display("MAXERR: uniform",df_KNN_MAXERR_uniform)
display("distance",df_KNN_MAXERR_distance)

'RVE: uniform'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.9995,0.9993,0.9989,0.9986,0.9984,0.9969,0.9928
<function gaussian at 0x000001F7399136A0>,0.9995,0.9993,0.9989,0.9986,0.9984,0.9969,0.9928


'distance'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.9995,0.9993,0.9989,0.9986,0.9984,0.9969,0.9928
<function gaussian at 0x000001F7399136A0>,0.9995,0.9993,0.9989,0.9986,0.9984,0.9969,0.9928


'CORR Score: uniform'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.9997,0.9996,0.9995,0.9993,0.9992,0.9985,0.9965
<function gaussian at 0x000001F7399136A0>,0.9997,0.9996,0.9995,0.9993,0.9992,0.9985,0.9965


'distance'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.9998,0.9997,0.9996,0.9996,0.9995,0.999,0.9979
<function gaussian at 0x000001F7399136A0>,0.9998,0.9997,0.9996,0.9996,0.9995,0.999,0.9979


'MAE: uniform'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.0128,0.0163,0.0207,0.0237,0.0262,0.0356,0.0495
<function gaussian at 0x000001F7399136A0>,0.0128,0.0162,0.0207,0.0237,0.0262,0.0356,0.0493


'distance'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.0114,0.0133,0.016,0.0181,0.02,0.0275,0.0381
<function gaussian at 0x000001F7399136A0>,0.0113,0.0133,0.016,0.0182,0.02,0.0275,0.038


'RMSE: uniform'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.0275,0.0321,0.0388,0.0434,0.0473,0.0658,0.1
<function gaussian at 0x000001F7399136A0>,0.0272,0.0322,0.0386,0.0433,0.0474,0.0661,0.1


'distance'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.0256,0.0273,0.0314,0.0349,0.0379,0.0523,0.0777
<function gaussian at 0x000001F7399136A0>,0.0255,0.0272,0.0314,0.0349,0.0379,0.0522,0.0773


'MAXERR: uniform'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.464,0.4266,0.4766,0.5685,0.6411,0.8383,1.8164
<function gaussian at 0x000001F7399136A0>,0.3876,0.5385,0.4954,0.4817,0.6416,0.8794,1.7588


'distance'

Unnamed: 0,2,5,10,15,20,50,100
minkowski,0.4152,0.4325,0.4476,0.5944,0.476,0.6984,1.2689
<function gaussian at 0x000001F7399136A0>,0.4137,0.3807,0.4565,0.506,0.5185,0.7614,1.2577


## Support vector regressor

In [29]:
kf = KFold(n_splits=5, shuffle=True)
TRUTH=None
PREDS=None
for train_index, test_index in kf.split(X_TRAIN):
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = SVR()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    if TRUTH is None:
        PREDS=preds
        TRUTH=y_test
    else:
        PREDS=np.hstack((PREDS, preds))
        TRUTH=np.hstack((TRUTH, y_test))

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))  

RVE:  0.9864924102170747
RMSE:  0.13664937196930102
CORR:  0.9934333153778061
PVAL:  0.0
MAX_ER:  0.7687009179480984
MEAN_ER:  0.09269053369062036


In [None]:
RVEs, RMSEs, CORRs,PVALs, MAXERRs, MEANABSERRs = [],[],[],[],[],[]

degree, C, epsilon = [2, 3, 4, 5, 7, 10, 15], [0.1, 0.5, 1.0, 5.0, 10.0, 20.0, 50.0], []

for j in weight:
    for k in metric:
        for i in n_neighbours:
            kf = KFold(n_splits=5, shuffle=True)
            TRUTH=None
            PREDS=None
            for train_index, test_index in kf.split(X_TRAIN):
                X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
                y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]
    
                mdl = SVR(n_neighbors=i, weights=j)
                mdl.fit(X_train, y_train)
                preds = mdl.predict(X_test)
                if TRUTH is None:
                    PREDS=preds
                    TRUTH=y_test
                else:
                    PREDS=np.hstack((PREDS, preds))
                    TRUTH=np.hstack((TRUTH, y_test))
    
            RVEs.append(explained_variance_score(TRUTH, PREDS))
            RMSEs.append(root_mean_squared_error(TRUTH, PREDS))
            corr, pval=pearsonr(TRUTH, PREDS)
            CORRs.append(corr)
            PVALs.append(pval)
            MAXERRs.append(max_error(TRUTH, PREDS))
            MEANABSERRs.append(mean_absolute_error(TRUTH, PREDS))

## Random forest regressor

In [33]:
kf = KFold(n_splits=5, shuffle=True)
TRUTH=None
PREDS=None
for train_index, test_index in kf.split(X_TRAIN):
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = RandomForestRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    if TRUTH is None:
        PREDS=preds
        TRUTH=y_test
    else:
        PREDS=np.hstack((PREDS, preds))
        TRUTH=np.hstack((TRUTH, y_test))

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.9992836234828456
RMSE:  0.03147063848628426
CORR:  0.9996422722540139
PVAL:  0.0
MAX_ER:  2.332025714924534
MEAN_ER:  0.01301905195154534


## Adaboost regressor

In [32]:
kf = KFold(n_splits=5, shuffle=True)
TRUTH=None
PREDS=None
for train_index, test_index in kf.split(X_TRAIN):
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = AdaBoostRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    if TRUTH is None:
        PREDS=preds
        TRUTH=y_test
    else:
        PREDS=np.hstack((PREDS, preds))
        TRUTH=np.hstack((TRUTH, y_test))

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.8183396825827937
RMSE:  0.5010279062531589
CORR:  0.9347459256074782
PVAL:  0.0
MAX_ER:  1.7017462104800534
MEAN_ER:  0.40935254303841123


## XGB regressor

In [17]:
kf = KFold(n_splits=10, shuffle=True)
TRUTH=None
PREDS=None
for train_index, test_index in kf.split(X_TRAIN):
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = XGBRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    if TRUTH is None:
        PREDS=preds
        TRUTH=y_test
    else:
        PREDS=np.hstack((PREDS, preds))
        TRUTH=np.hstack((TRUTH, y_test))

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.9984887517368491
RMSE:  0.045697370929086024
CORR:  0.9992441149386662
PVAL:  0.0
MAX_ER:  0.7315355754019537
MEAN_ER:  0.02950474408239082


## MLP regressor

In [30]:
kf = KFold(n_splits=5, shuffle=True)
TRUTH=None
PREDS=None
for train_index, test_index in kf.split(X_TRAIN):
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = MLPRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    if TRUTH is None:
        PREDS=preds
        TRUTH=y_test
    else:
        PREDS=np.hstack((PREDS, preds))
        TRUTH=np.hstack((TRUTH, y_test))

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.997079508821139
RMSE:  0.0636002324033153
CORR:  0.9985386995251659
PVAL:  0.0
MAX_ER:  0.600462366604329
MEAN_ER:  0.044927784109559894


## PCA

In [11]:
pca = PCA(n_components=5) # maybe try with less or more components
pca.fit(X_TRAIN)
tve=0
for i, ve in enumerate(pca.explained_variance_ratio_):
    tve+=ve
    print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i, ve, tve) )
print()

PC0 - Variance explained:  0.7820 - Total Variance:  0.7820
PC1 - Variance explained:  0.1272 - Total Variance:  0.9091
PC2 - Variance explained:  0.0895 - Total Variance:  0.9986
PC3 - Variance explained:  0.0014 - Total Variance:  1.0000
PC4 - Variance explained:  0.0000 - Total Variance:  1.0000



In [12]:
X_TRAIN_PCA=pca.transform(X_TRAIN)
X_IVS_PCA=pca.transform(X_IVS)
print(X_TRAIN_PCA.shape)

(61773, 5)


In [13]:
kf = KFold(n_splits=5, shuffle=True)
TRUTH=None
PREDS=None
for train_index, test_index in kf.split(X_TRAIN_PCA):
    X_train, X_test = X_TRAIN_PCA[train_index], X_TRAIN_PCA[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = XGBRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    if TRUTH is None:
        PREDS=preds
        TRUTH=y_test
    else:
        PREDS=np.hstack((PREDS, preds))
        TRUTH=np.hstack((TRUTH, y_test))

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.9995991488167091
RMSE:  0.023535068766571212
CORR:  0.9997995689213592
PVAL:  0.0
MAX_ER:  0.3133700634754746
MEAN_ER:  0.015002491067505555


**Meilleurs résultats qu'avec les données de base**

In [19]:
kf = KFold(n_splits=5, shuffle=True)
TRUTH=None
PREDS=None
for train_index, test_index in kf.split(X_TRAIN_PCA):
    X_train, X_test = X_TRAIN_PCA[train_index], X_TRAIN_PCA[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = MLPRegressor()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)
    if TRUTH is None:
        PREDS=preds
        TRUTH=y_test
    else:
        PREDS=np.hstack((PREDS, preds))
        TRUTH=np.hstack((TRUTH, y_test))

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.9995045944103211
RMSE:  0.026279061283715162
CORR:  0.9997552884547736
PVAL:  0.0
MAX_ER:  0.24201030304571258
MEAN_ER:  0.017937512459161573


## Best model (not updated)

In [9]:
mdl_predict_test = MLPRegressor()
mdl_predict_test.fit(X_TRAIN, y_TRAIN)
preds = mdl_predict_test.predict(X_IVS)

PREDS=preds
TRUTH=y_IVS

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.9972267642392262
RMSE:  0.06254083435292317
CORR:  0.9986141983083916
PVAL:  0.0
MAX_ER:  0.5367991736205848
MEAN_ER:  0.042129219526977096


In [11]:
print(mdl_predict_test.n_layers_)

3


In [62]:
print(X_TRAIN)

[[ 8.1         3.53104357  0.11486022  4.08041081  2.50079542]
 [10.05        3.75432248  4.13045779  0.26118944  0.14462692]
 [ 9.75        3.5573065   0.79985701  2.89658627  1.85635729]
 ...
 [ 7.85        3.53958511 -0.7789286   4.47966533  2.68333959]
 [ 9.5         3.70969014  3.63472147  0.74174508  0.47416942]
 [ 6.3         4.08606568  4.02209111  2.22404655  0.46256906]]


In [97]:
display(full_data.loc[(full_data['age'] == 5.0) & (np.array(isclose_pandas(full_data, "logTeff", 3.468541, rel_tol=1e-6)))])

Unnamed: 0,age,logTeff,log_g,mass,logL,logR
0,5.0,3.468541,3.116651,0.1,-0.849695,0.160747


In [88]:
# test = np.array([[5.0, 3.468541, 3.116651, -0.849695, 0.160747]])
# test = np.array([[8.1, 3.53104357, 0.11486022, 4.08041081, 2.50079542]])
# test = np.array([[10.05, 3.75432248, 4.13045779, 0.26118944, 0.14462692]])
# test = np.array([[9.75, 3.5573065, 0.79985701, 2.89658627, 1.85635729]])
test = np.array([[9.5, 3.70969014, 3.63472147, 0.74174508, 0.47416942]])
print(X_TRAIN.shape)
print(test.shape)
mdl_predict_test.predict(test)

(61773, 5)
(1, 5)


array([1.2988277])

In [90]:
best_mdl = MLPRegressor()
best_mdl.fit(X_TRAIN_PCA, y_TRAIN)
preds = best_mdl.predict(X_IVS_PCA)

PREDS=preds
TRUTH=y_IVS

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.999661585626747
RMSE:  0.02267372795585098
CORR:  0.9998310890456998
PVAL:  0.0
MAX_ER:  0.19574639860597554
MEAN_ER:  0.016031921986183598


In [98]:
test = np.array([[5.0, 3.468541, 3.116651, -0.849695, 0.160747]])
# test = np.array([[10.05, 3.75432248, 4.13045779, 0.26118944, 0.14462692]])
# test = np.array([[9.75, 3.5573065, 0.79985701, 2.89658627, 1.85635729]])
# test = np.array([[9.5, 3.70969014, 3.63472147, 0.74174508, 0.47416942]])
test_PCA=pca.transform(test)
print(X_TRAIN.shape)
print(test.shape)
best_mdl.predict(test_PCA)

(61773, 5)
(1, 5)


array([0.08122154])

In [21]:
total = 0
count = 0
for i in range(len(preds)):
    total += 1
    diff = y_IVS[i] - preds[i]
    bound = 0.2
    if (diff > bound) or (diff < -bound):
        count += 1
        print(f"prediction : {preds[i]}, truth : {y_IVS[i]}, diff : {y_IVS[i] - preds[i]}")

print(total)
print(count)

prediction : 4.361152827653326, truth : 4.15882380433359, diff : -0.20232902331973612
prediction : 1.1535886657994256, truth : 0.7503384814453014, diff : -0.4032501843541243
prediction : 3.1766928411030158, truth : 3.4511095669832206, diff : 0.2744167258802048
prediction : 3.2113820810820624, truth : 2.9431660165040614, diff : -0.26821606457800096
prediction : 4.696257978761651, truth : 4.059389323599795, diff : -0.6368686551618561
prediction : 1.6451719553993744, truth : 1.4310512148560115, diff : -0.21412074054336294
prediction : 4.730854569617687, truth : 4.450681542175936, diff : -0.28017302744175154
prediction : 1.6128001061321677, truth : 2.0481603758359133, diff : 0.43536026970374553
prediction : 3.246358565770469, truth : 3.0410420623067433, diff : -0.20531650346372565
prediction : 1.2839414547804986, truth : 0.9820584319456124, diff : -0.3018830228348862
prediction : 4.199233941501053, truth : 0.8657230455161277, diff : -3.333510895984925
prediction : 2.4711315984896474, truth

In [None]:
# faire le tracé des isochrones avec seulement max 5 masse => voir "data_preparation.py"

In [None]:
# calculer depuis les données d'entrainement, le rayon et la luminosité
#  -> si la différence est négligable => ne pas rajouter les colonnes
#  -> sinon => rajouter les colonnes et faire que les modèles output le rayon et la luminosité