## Imports

In [1]:
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import random

In [2]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix,\
                             explained_variance_score, mean_squared_error, max_error, mean_absolute_error,\
                             root_mean_squared_error, median_absolute_error
from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.decomposition import PCA, KernelPCA
# No feature selection needed: the dataset only has a few columns

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import read_mist_models

from utils import Iso_data_handler

def gaussian(dsts, kernel_width=.5):
    """Turn distances into Gaussian-shaped weights.

    Computes ``exp(-dsts**2 / kernel_width)`` element-wise, so a distance of 0
    gets weight 1 and the weight decays as the distance grows.

    Parameters
    ----------
    dsts : array-like of numbers
        Distances; must support ``**`` and unary minus (e.g. a numpy array).
    kernel_width : float, default .5
        Divisor applied to the squared distance. Larger values give a flatter
        kernel. Note it divides ``dsts**2`` directly (there is no extra
        factor of 2 as in the usual ``2 * sigma**2`` parameterisation).

    Returns
    -------
    numpy.ndarray
        Weights with the same shape as ``dsts``.
    """
    return np.exp(-(dsts**2)/kernel_width)

# Helpers to find the rows whose value in a given column is approximately equal to a target value

def isclose_pandas_apply(row, col_name, value, bool_index, rel_tol=1e-6):
    """Row callback: record whether ``row[col_name]`` is approximately ``value``.

    The verdict (``True`` or ``False``) is appended to the caller-supplied
    list ``bool_index``; the function itself returns nothing. Closeness is
    decided by ``math.isclose`` with the given relative tolerance.
    """
    matches = math.isclose(row[col_name], value, rel_tol=rel_tol)
    bool_index.append(matches)

def isclose_pandas(df, col_name, value, rel_tol=1e-6):
    """Return a boolean mask (list) of the rows where ``df[col_name]`` is close to ``value``.

    Uses the same criterion as ``math.isclose(x, value, rel_tol=rel_tol)``
    (with its default ``abs_tol`` of 0), evaluated on the whole column at once:

    * exactly equal values are close (this covers matching infinities);
    * otherwise both values must be finite and satisfy
      ``|x - value| <= rel_tol * max(|x|, |value|)``;
    * NaN is never close to anything.

    The column is processed with numpy rather than a row-wise
    ``DataFrame.apply`` that appends to a list: that approach is very slow on
    large frames and depends on how many times pandas decides to invoke the
    callback (an empty frame, for instance, produced a spurious entry).

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : label of a numeric column of ``df``
    value : float
        Target value to compare against.
    rel_tol : float, default 1e-6
        Relative tolerance.

    Returns
    -------
    list of bool
        One entry per row of ``df``, in row order.
    """
    col = df[col_name].to_numpy(dtype=float)
    # inf - inf yields NaN (and a warning); those entries are handled by the
    # explicit equality / finiteness tests below.
    with np.errstate(invalid="ignore"):
        diff = np.abs(col - value)
        tol = rel_tol * np.maximum(np.abs(col), abs(value))
        within_tol = np.isfinite(col) & np.isfinite(value) & (diff <= tol)
    return ((col == value) | within_tol).tolist()

## Data preparation

In [4]:
# Build the project's isochrone data handler for the MIST v1.2 (vvcrit0.0) directory.
# NOTE(review): the list is presumably the set of columns to load — confirm against utils.Iso_data_handler.
iso_handler = Iso_data_handler("data/MIST_v1.2_vvcrit0.0_basic_isos/", 
                              ['log10_isochrone_age_yr', 'log_Teff', 'log_g', 'star_mass', 'phase', 'metallicity', 'log_R'])


In [5]:
# Load the isochrone data as a pandas DataFrame (the run logged below reads it from a CSV file).
iso_df = iso_handler.full_iso_data_to_panda()

Reading dataframe from csv file...


In [6]:
# Preview the loaded table; the rendering below is truncated to its first and last rows.
display(iso_df)

Unnamed: 0,log10_isochrone_age_yr,log_Teff,log_g,star_mass,phase,metallicity,log_R
0,5.0,3.486221,3.131342,0.100000,-1.0,-0.25,0.153402
1,5.0,3.487362,3.126808,0.102645,-1.0,-0.25,0.160326
2,5.0,3.489243,3.119367,0.107039,-1.0,-0.25,0.171785
3,5.0,3.491102,3.112165,0.111419,-1.0,-0.25,0.183099
4,5.0,3.492937,3.105143,0.115789,-1.0,-0.25,0.194305
...,...,...,...,...,...,...,...
1467117,10.3,4.402490,7.777159,0.532726,6.0,0.50,-1.806255
1467118,10.3,4.387132,7.783242,0.532730,6.0,0.50,-1.809295
1467119,10.3,4.371789,7.789130,0.532735,6.0,0.50,-1.812237
1467120,10.3,4.356480,7.794844,0.532741,6.0,0.50,-1.815091


In [7]:
# Keep only the relevant star phases (phase codes 0, 2, 3, 4 and 5).
# Boolean indexing with `isin` selects the matching rows directly, instead of
# first masking every other row to NaN with `.where(...)`. `dropna()` is kept so
# that rows containing missing values are still discarded, and the index is
# renumbered from 0 afterwards.
phase_filtered_iso_df = (
    iso_df[iso_df.phase.isin([0, 2, 3, 4, 5])]
    .dropna()
    .reset_index(drop=True)
)

In [8]:
# Preview the phase-filtered table (index renumbered from 0 by the filtering step).
display(phase_filtered_iso_df)

Unnamed: 0,log10_isochrone_age_yr,log_Teff,log_g,star_mass,phase,metallicity,log_R
0,5.0,4.494412,4.346972,13.584360,0.0,-0.25,0.610679
1,5.0,4.497517,4.345776,13.765512,0.0,-0.25,0.614753
2,5.0,4.500556,4.344580,13.942887,0.0,-0.25,0.618755
3,5.0,4.504040,4.343050,14.591712,0.0,-0.25,0.624670
4,5.0,4.507576,4.341483,15.426062,0.0,-0.25,0.631187
...,...,...,...,...,...,...,...
1165292,10.3,3.425746,-0.551440,0.602856,5.0,0.50,2.384899
1165293,10.3,3.426469,-0.560350,0.598549,5.0,0.50,2.387797
1165294,10.3,3.427744,-0.566057,0.594116,5.0,0.50,2.389036
1165295,10.3,3.429413,-0.569225,0.589648,5.0,0.50,2.388981


In [9]:
# Targets are the stellar mass and log-radius; the features are every remaining
# column except the phase label.
target_columns = ['star_mass', 'log_R']
X = phase_filtered_iso_df.drop(columns=target_columns + ['phase']).to_numpy()
y = phase_filtered_iso_df[target_columns].to_numpy()

# Hold out 25% of the rows (the "IVS" split); the seed is fixed so the split is reproducible.
X_TRAIN, X_IVS, y_TRAIN, y_IVS = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1337,
)

print(X_TRAIN.shape, X_IVS.shape)
print(y_TRAIN.shape, y_IVS.shape)

(873972, 4) (291325, 4)
(873972, 2) (291325, 2)


In [31]:
# Range / median / mean of each target, for the train split and then the held-out
# ("test") split. Column 0 of the target arrays is star_mass and column 1 is
# log_R (reported as "radius"). The numpy reductions replace Python's builtin
# min()/max(), which iterate element by element over the whole array.
for column, target_name in enumerate(("mass", "radius")):
    if column:
        print()  # blank line between the two targets
    for split_name, targets in (("train", y_TRAIN), ("test", y_IVS)):
        values = targets[:, column]
        print(f"Range in {split_name} data for the {target_name} parameter : {values.min()} - {values.max()}")
        print(f"Median value in {split_name} data for the {target_name} parameter: {np.median(values)}")
        print(f"Mean value in {split_name} data for the {target_name} parameter: {np.mean(values)}")


Range in train data for the mass parameter : 0.0999979840073621 - 296.5221171165397
Median value in train data for the mass parameter: 2.0298876569807147
Mean value in train data for the mass parameter: 7.387582424532589
Range in test data for the mass parameter : 0.099998052173157 - 298.5447575808816
Median value in test data for the mass parameter: 2.031987247312461
Mean value in test data for the mass parameter: 7.403392070038953

Range in train data for the radius parameter : -2.085171571669866 - 3.1297545143214007
Median value in train data for the radius parameter: 1.4613315181026039
Mean value in train data for the radius parameter: 1.3545468932779332
Range in test data for the radius parameter : -2.081830985668411 - 3.129269620812593
Median value in test data for the radius parameter: 1.457489869624725
Mean value in test data for the radius parameter: 1.353692007577153


## PCA data preparation

In [10]:
# Fit a 3-component PCA on the training features only, then report the share of
# variance carried by each component together with the running total.
pca = PCA(n_components=3) # maybe try with less or more components
pca.fit(X_TRAIN)
variance_ratios = pca.explained_variance_ratio_
for i, (ve, tve) in enumerate(zip(variance_ratios, np.cumsum(variance_ratios))):
    print("PC%d - Variance explained: %7.4f - Total Variance: %7.4f" % (i, ve, tve))
print()

PC0 - Variance explained:  0.5934 - Total Variance:  0.5934
PC1 - Variance explained:  0.2337 - Total Variance:  0.8271
PC2 - Variance explained:  0.1685 - Total Variance:  0.9955



In [11]:
# Project both splits with the PCA that was fitted on the training features,
# so the held-out split never influences the projection.
X_TRAIN_PCA=pca.transform(X_TRAIN)
X_IVS_PCA=pca.transform(X_IVS)
print(X_TRAIN_PCA.shape)

(873972, 3)


## Model training

In [20]:

# TODO: make the function accept any number of outputs
# TODO: write the model outputs to a CSV file for later use
# TODO: make it possible to add parameters to test in the model

def _print_regression_metrics(truth, preds):
    """Print the regression metrics comparing 1-D `preds` against 1-D `truth`."""
    print("RVE: ",explained_variance_score(truth, preds))
    print("RMSE: ",root_mean_squared_error(truth, preds))
    corr, pval=pearsonr(truth, preds)
    print("CORR: ",corr)
    print("PVAL: ",pval)
    print("MAX_ER: ",max_error(truth, preds))
    print("MEAN_ABS_ER: ",mean_absolute_error(truth, preds))
    print("MEDIAN_ABS_ER: ",median_absolute_error(truth, preds))


def Kfold_pipeline(model, filename="", n_splits=10, shuffle=True, X=None, y=None):
    """Cross-validate a two-output regressor and print metrics for mass and radius.

    For each fold a fresh estimator is built with ``model()``, fitted on the
    training part and used to predict the held-out part. The out-of-fold
    predictions of all folds are pooled, then scored once per target.

    Parameters
    ----------
    model : callable
        Zero-argument factory returning an unfitted estimator with ``fit`` and
        ``predict`` (e.g. an estimator class, or a ``functools.partial`` /
        lambda when constructor arguments are needed).
    filename : str, default ""
        Currently unused; kept so existing calls keep working (see the CSV TODO).
    n_splits : int, default 10
        Number of folds.
    shuffle : bool, default True
        Whether to shuffle the rows before splitting. When shuffling, the
        split is seeded (12) so results are reproducible.
    X, y : array-like, optional
        Features and targets to cross-validate on. They default to the
        notebook-level ``X_TRAIN`` / ``y_TRAIN``. ``y`` (and the estimator's
        predictions) must be 2-D with the mass in column 0 and the radius in
        column 1.

    Returns
    -------
    None
        Results are only printed.
    """
    X = X_TRAIN if X is None else np.asarray(X)
    y = y_TRAIN if y is None else np.asarray(y)

    # KFold raises a ValueError if random_state is set while shuffle is False,
    # so only pass the seed when it actually has an effect.
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=12 if shuffle else None)

    # Collect per-fold arrays and concatenate once at the end, rather than
    # re-copying the growing result with np.hstack on every fold.
    truth_folds, preds_folds = [], []
    for train_index, test_index in kf.split(X):
        mdl = model()
        mdl.fit(X[train_index], y[train_index])
        preds_folds.append(mdl.predict(X[test_index]))
        truth_folds.append(y[test_index])

    def pooled(folds, column):
        # Contiguous 1-D array of one target column across all folds, in fold order.
        return np.concatenate([fold[:, column] for fold in folds])

    print("Mass results:")
    _print_regression_metrics(pooled(truth_folds, 0), pooled(preds_folds, 0))

    print()
    print("Radius results")
    _print_regression_metrics(pooled(truth_folds, 1), pooled(preds_folds, 1))
    return

### Linear model

#### Linear regression

In [21]:
# Baseline: cross-validate an ordinary least-squares model with the pipeline defaults (10 shuffled folds).
Kfold_pipeline(LinearRegression)

Mass results:
RVE:  0.30901687258498645
RMSE:  16.83803231323551
CORR:  0.5558928608298243
PVAL:  0.0
MAX_ER:  261.3407863674693
MEAN_ABS_ER:  7.338182392289768
MEDIAN_ABS_ER:  3.825823142935673

Radius results
RVE:  0.987368542679585
RMSE:  0.11206128690888334
CORR:  0.9936642001601589
PVAL:  0.0
MAX_ER:  0.6588808734804295
MEAN_ABS_ER:  0.08048427077094714
MEDIAN_ABS_ER:  0.057865742717877144


In [29]:
# Fit the linear baseline on the whole training split and predict the held-out split.
mdl = LinearRegression().fit(X_TRAIN, y_TRAIN)
preds = mdl.predict(X_IVS)

# Column 0 holds the predicted masses; the output below shows that some are
# negative, i.e. the linear model does not keep the mass positive.
print(preds[:, 0])


[-11.18112541  -2.36290652  -1.17783343 ...   5.21130959  25.03330549
  -3.03299623]


In [53]:
# 5-fold cross-validation of a plain LinearRegression on the training split.
# Out-of-fold predictions are pooled over the folds and scored once per target.
kf = KFold(n_splits=5, shuffle=True, random_state=12)
truth_folds, preds_folds = [], []
for train_index, test_index in kf.split(X_TRAIN):
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    mdl = LinearRegression()
    mdl.fit(X_train, y_train)
    preds = mdl.predict(X_test)

    # Keep the per-fold arrays; they are concatenated once after the loop
    # instead of re-copying the growing result with np.hstack on every fold.
    truth_folds.append(y_test)
    preds_folds.append(preds)

# Column 0 is the mass, column 1 the radius.
TRUTH_MASS = np.concatenate([fold[:, 0] for fold in truth_folds])
PREDS_MASS = np.concatenate([fold[:, 0] for fold in preds_folds])
TRUTH_RADIUS = np.concatenate([fold[:, 1] for fold in truth_folds])
PREDS_RADIUS = np.concatenate([fold[:, 1] for fold in preds_folds])

for header, truth, predicted in (
    ("Mass results:", TRUTH_MASS, PREDS_MASS),
    ("Radius results", TRUTH_RADIUS, PREDS_RADIUS),
):
    print(header)
    print("RVE: ",explained_variance_score(truth, predicted))
    print("RMSE: ",root_mean_squared_error(truth, predicted))
    corr, pval=pearsonr(truth, predicted)
    print("CORR: ",corr)
    print("PVAL: ",pval)
    print("MAX_ER: ",max_error(truth, predicted))
    print("MEAN_ABS_ER: ",mean_absolute_error(truth, predicted))
    print("MEDIAN_ABS_ER: ",median_absolute_error(truth, predicted))

Mass results:
RVE:  0.30901134858970825
RMSE:  16.838099618128872
CORR:  0.5558878922530521
PVAL:  0.0
MAX_ER:  261.3087981234931
MEAN_ABS_ER:  7.338185526477554
MEDIAN_ABS_ER:  3.8251226616140666
Radius results
RVE:  0.9873685444520779
RMSE:  0.11206127904633721
CORR:  0.9936642010519766
PVAL:  0.0
MAX_ER:  0.6584040143919994
MEAN_ABS_ER:  0.08048426814768868
MEDIAN_ABS_ER:  0.057863230750672234
