In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.feature_selection import SelectFromModel

from sklearn.model_selection import train_test_split

In [3]:
import read_mist_models

from utils import Iso_data_handler

In [4]:
def associate_col_name_to_features(df, features_values):
    """Map each column label of ``df`` to the value at the same position in ``features_values``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column order defines the pairing.
    features_values : sequence or 1-D array
        One value per column of ``df``, given in column order
        (e.g. a fitted model's ``feature_importances_``).

    Returns
    -------
    dict
        ``{column_label: value}``, inserted in column order. If a label is
        repeated in ``df.columns``, the value of its last occurrence is kept.

    Raises
    ------
    ValueError
        If ``features_values`` does not contain exactly one value per column.
    """
    n_columns = len(df.columns)
    n_values = len(features_values)
    # A length mismatch means the values were computed on a different column set;
    # pairing them anyway would attach values to the wrong names.
    if n_values != n_columns:
        raise ValueError(
            f"Expected {n_columns} feature values (one per column), got {n_values}"
        )
    # Pair strictly by position instead of looking every label up again.
    return dict(zip(df.columns, features_values))

## Getting the data

In [5]:
# Directory holding the MIST isochrone files. Set the MIST_ISO_DIR environment
# variable to run the notebook on another machine; the default is the original
# author's local path.
# NOTE(review): the default ends with "/" — presumably the handler appends file
# names to this string, so keep the trailing slash when overriding. Verify.
MIST_ISO_DIR = os.environ.get(
    "MIST_ISO_DIR",
    "C:/Users/antoi/Code/unif/MA2/Thèse/data/MIST_v1.2_vvcrit0.0_basic_isos/",
)

# Empty column list: per the original author's note this requests all the columns
# (TODO confirm against utils.Iso_data_handler).
iso_handler = Iso_data_handler(MIST_ISO_DIR, [])

In [6]:
# Load the complete isochrone table through the project handler
# (the recorded run printed "Reading dataframe from csv file...").
iso_df = iso_handler.full_iso_data_to_panda()

Reading dataframe from csv file...


In [7]:
# Rich preview of the loaded table (the rendering is truncated to the first and
# last rows), followed by the full list of column names.
display(iso_df)
print(iso_df.columns)

Unnamed: 0,log10_isochrone_age_yr,initial_mass,star_mass,star_mdot,he_core_mass,c_core_mass,log_L,log_LH,log_LHe,log_Teff,...,surface_c12,surface_o16,log_center_T,log_center_Rho,center_gamma,center_h1,center_he4,center_c12,phase,metallicity
0,5.0,0.100000,0.100000,-9.297134e-14,0.000000,0.000000,-0.793667,-3.548059,-99.0,3.486221,...,0.001404,0.003435,5.765442,-0.470852,0.297537,7.309741e-01,2.609346e-01,0.001404,-1.0,-0.25
1,5.0,0.102645,0.102645,-9.975347e-14,0.000000,0.000000,-0.775254,-3.521999,-99.0,3.487362,...,0.001404,0.003435,5.767557,-0.482044,0.294004,7.309741e-01,2.609346e-01,0.001404,-1.0,-0.25
2,5.0,0.107039,0.107039,-1.104573e-13,0.000000,0.000000,-0.744810,-3.478003,-99.0,3.489243,...,0.001404,0.003435,5.771118,-0.500506,0.288132,7.309741e-01,2.609346e-01,0.001404,-1.0,-0.25
3,5.0,0.111419,0.111419,-1.204011e-13,0.000000,0.000000,-0.714749,-3.432448,-99.0,3.491102,...,0.001404,0.003435,5.774774,-0.518606,0.282275,7.309741e-01,2.609346e-01,0.001404,-1.0,-0.25
4,5.0,0.115789,0.115789,-1.295945e-13,0.000000,0.000000,-0.684996,-3.385713,-99.0,3.492937,...,0.001404,0.003435,5.778504,-0.536428,0.276429,7.309741e-01,2.609346e-01,0.001404,-1.0,-0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1467117,10.3,0.925792,0.532726,-1.091816e-16,0.532517,0.467003,-1.047903,-2.207836,-99.0,4.402490,...,0.006043,0.018812,7.634464,6.388680,16.038778,1.516815e-38,5.212308e-15,0.264888,6.0,0.50
1467118,10.3,0.925825,0.532730,-9.216741e-17,0.532521,0.467008,-1.115415,-2.250365,-99.0,4.387132,...,0.006043,0.018812,7.618960,6.390228,16.632656,8.655463e-39,5.116901e-15,0.264776,6.0,0.50
1467119,10.3,0.925866,0.532735,-7.815650e-17,0.532526,0.467015,-1.182672,-2.289410,-99.0,4.371789,...,0.006042,0.018812,7.601599,6.391757,17.323102,4.717695e-39,5.036249e-15,0.264640,6.0,0.50
1467120,10.3,0.925919,0.532741,-6.650403e-17,0.532532,0.467023,-1.249617,-2.325829,-99.0,4.356480,...,0.006042,0.018812,7.582075,6.393277,18.134706,2.491051e-39,4.969942e-15,0.264470,6.0,0.50


Index(['log10_isochrone_age_yr', 'initial_mass', 'star_mass', 'star_mdot',
       'he_core_mass', 'c_core_mass', 'log_L', 'log_LH', 'log_LHe', 'log_Teff',
       'log_R', 'log_g', 'surface_h1', 'surface_he3', 'surface_he4',
       'surface_c12', 'surface_o16', 'log_center_T', 'log_center_Rho',
       'center_gamma', 'center_h1', 'center_he4', 'center_c12', 'phase',
       'metallicity'],
      dtype='object')


## Preparing the data

In [8]:
# Keep only the relevant star phases.
# NOTE(review): the codes presumably follow the MIST phase numbering — confirm
# which evolutionary stages 0, 2, 3, 4 and 5 stand for before changing this list.
relevant_phases = [0, 2, 3, 4, 5]

# Select rows directly with a boolean mask instead of NaN-masking the whole frame
# with `.where(...)`. `dropna()` is kept so that rows containing missing values
# are still discarded, exactly as before.
phase_filtered_iso_df = (
    iso_df.loc[iso_df["phase"].isin(relevant_phases)]
    .dropna()
    .reset_index(drop=True)
)

In [9]:
# Preview the phase-filtered table and list its columns to confirm that the
# filter removed rows only (the column set is unchanged).
display(phase_filtered_iso_df)
print(phase_filtered_iso_df.columns)

Unnamed: 0,log10_isochrone_age_yr,initial_mass,star_mass,star_mdot,he_core_mass,c_core_mass,log_L,log_LH,log_LHe,log_Teff,...,surface_c12,surface_o16,log_center_T,log_center_Rho,center_gamma,center_h1,center_he4,center_c12,phase,metallicity
0,5.0,13.585307,13.584360,-2.266578e-09,0.000000,0.000000,4.153653,4.156490,-27.569158,4.494412,...,0.001403,0.003432,7.556588,0.943195,0.014288,7.299315e-01,2.618302e-01,0.000014,0.0,-0.25
1,5.0,13.766502,13.765512,-2.339355e-09,0.000000,0.000000,4.174219,4.177160,-27.511791,4.497517,...,0.001403,0.003432,7.557782,0.935433,0.014151,7.298502e-01,2.619141e-01,0.000015,0.0,-0.25
2,5.0,13.943919,13.942887,-2.410114e-09,0.000000,0.000000,4.194381,4.197400,-27.455930,4.500556,...,0.001402,0.003432,7.558945,0.927822,0.014018,7.297689e-01,2.619981e-01,0.000015,0.0,-0.25
3,5.0,14.593221,14.591712,-1.686032e-08,0.000000,0.000000,4.220145,4.223228,-27.391603,4.504040,...,0.001402,0.003432,7.560347,0.918964,0.013897,7.296845e-01,2.620853e-01,0.000015,0.0,-0.25
4,5.0,15.428247,15.426062,-3.712640e-08,0.000000,0.000000,4.247326,4.250463,-27.326047,4.507576,...,0.001402,0.003431,7.561801,0.909859,0.013787,7.296010e-01,2.621715e-01,0.000015,0.0,-0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165292,10.3,0.925549,0.602856,-2.139875e-06,0.530371,0.465325,3.427430,3.359401,2.166522,3.425746,...,0.006257,0.018811,7.948933,6.172404,6.569365,2.188183e-35,4.464556e-14,0.266641,5.0,0.50
1165293,10.3,0.925549,0.598549,-2.372679e-06,0.530449,0.465582,3.436116,3.372719,2.132128,3.426469,...,0.006257,0.018811,7.949170,6.173285,6.570241,2.211247e-35,4.457846e-14,0.266640,5.0,0.50
1165294,10.3,0.925549,0.594116,-2.601874e-06,0.530525,0.465582,3.443696,3.384095,2.101849,3.427744,...,0.006257,0.018811,7.949386,6.174103,6.571095,2.232492e-35,4.451613e-14,0.266639,5.0,0.50
1165295,10.3,0.925549,0.589648,-2.819032e-06,0.530594,0.465790,3.450259,3.393816,2.075562,3.429413,...,0.006257,0.018811,7.949580,6.174848,6.571912,2.251633e-35,4.445887e-14,0.266639,5.0,0.50


Index(['log10_isochrone_age_yr', 'initial_mass', 'star_mass', 'star_mdot',
       'he_core_mass', 'c_core_mass', 'log_L', 'log_LH', 'log_LHe', 'log_Teff',
       'log_R', 'log_g', 'surface_h1', 'surface_he3', 'surface_he4',
       'surface_c12', 'surface_o16', 'log_center_T', 'log_center_Rho',
       'center_gamma', 'center_h1', 'center_he4', 'center_c12', 'phase',
       'metallicity'],
      dtype='object')


### Primary model

In [10]:
# Primary model: the targets are star_mass and log_R, every other column is a feature.
primary_targets = ['star_mass', 'log_R']
X_primary = phase_filtered_iso_df.drop(columns=primary_targets).to_numpy()
y_primary = phase_filtered_iso_df[primary_targets].to_numpy()

# Hold out 25% of the rows for testing; the seed is fixed so the split is repeatable.
primary_split = train_test_split(X_primary, y_primary, test_size=0.25, random_state=1337)
X_train_primary, X_test_primary, y_train_primary, y_test_primary = primary_split

# Number of training rows and number of features.
N_primary, M_primary = X_train_primary.shape

print(X_train_primary.shape, X_test_primary.shape)
print(y_train_primary.shape, y_test_primary.shape)

(873972, 23) (291325, 23)
(873972, 2) (291325, 2)


### Secondary model

In [15]:
# Secondary model: the targets are log_Teff, log_g and log_R, every other column is a feature.
secondary_targets = ['log_Teff', 'log_g', 'log_R']
X_secondary = phase_filtered_iso_df.drop(columns=secondary_targets).to_numpy()
y_secondary = phase_filtered_iso_df[secondary_targets].to_numpy()

# Hold out 25% of the rows for testing; the seed is fixed so the split is repeatable.
secondary_split = train_test_split(X_secondary, y_secondary, test_size=0.25, random_state=1337)
X_train_secondary, X_test_secondary, y_train_secondary, y_test_secondary = secondary_split

# Number of training rows and number of features.
N_secondary, M_secondary = X_train_secondary.shape

print(X_train_secondary.shape, X_test_secondary.shape)
print(y_train_secondary.shape, y_test_secondary.shape)

(873972, 22) (291325, 22)
(873972, 3) (291325, 3)


## Selecting features using random forests


### Primary model

In [12]:
# Fit a random forest on the primary training split and let SelectFromModel keep
# the features whose importance reaches the threshold.
rf_primary = RandomForestRegressor(random_state=1337, n_jobs=10, max_depth=30)
# The threshold is deliberately very low for now, for testing purposes: a feature
# is kept only if its importance is at least this value.
sel_primary = SelectFromModel(estimator=rf_primary, threshold=.00001)
sel_primary.fit(X_train_primary, y_train_primary)

print("Importances: ", sel_primary.estimator_.feature_importances_)

# `threshold_` is the explicit value passed above, not a library default, so the
# label says so.
print("Threshold used: ", sel_primary.threshold_)

# Boolean mask over the input columns (True = kept) ...
features_primary = sel_primary.get_support()
# ... and the matching integer column indices, taken from the selector itself so
# this cell does not depend on a column count computed in another cell.
features_selected_primary = sel_primary.get_support(indices=True)
print("The features selected are columns: ", features_selected_primary)

Importances:  [1.90238177e-03 8.83931122e-01 8.57653494e-04 1.37687025e-04
 8.35985143e-06 9.15436793e-02 6.12664483e-05 2.02494145e-04
 4.56179345e-04 1.75682867e-03 1.11189737e-02 3.21392643e-03
 3.75892850e-04 7.48064609e-04 3.44819389e-04 4.80220317e-05
 1.64855765e-03 4.12502310e-04 3.75546198e-04 4.56962410e-04
 9.81934151e-05 2.78754350e-06 2.98099269e-04]
Default threshold:  1e-05
The features selected are columns:  [ 0  1  2  3  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 22]


In [16]:
# Rebuild the primary feature frame (same columns dropped as for X_primary) so
# that column names line up with the fitted forest's importances.
# NOTE(review): `col_value_dict` is assigned again further down for the secondary model.
primary_feature_frame = phase_filtered_iso_df.drop(columns=['star_mass', 'log_R'])
primary_importances = sel_primary.estimator_.feature_importances_
col_value_dict = associate_col_name_to_features(primary_feature_frame, primary_importances)

In [None]:
# Print the (feature name, importance) pairs from most to least important.
print(sorted(col_value_dict.items(), key=lambda item: item[1], reverse=True))

# The commented lists below are outputs pasted from previous runs, kept for comparison.
# Earlier run (this list has no 'metallicity' entry):
# [('initial_mass', 0.8839477031034191), ('log_L', 0.09161165905982777), ('surface_h1', 0.011127700412579324), ('surface_he3', 0.0033049252788893155),
# ('log10_isochrone_age_yr', 0.0019176891818681382), ('log_g', 0.0017572381580040005), ('log_center_Rho', 0.0016482348695309997), ('star_mdot', 0.0008625033420222818),
# ('surface_c12', 0.0007597120972732856), ('center_he4', 0.00048267893388818695), ('log_Teff', 0.000453610777851697), ('center_gamma', 0.00041143300994071866),
# ('surface_o16', 0.00039346082226295937), ('center_h1', 0.0003766845922272143), ('surface_he4', 0.00037247157356392333), ('log_LHe', 0.0002093291117933904),
# ('he_core_mass', 0.00015498121364276824), ('center_c12', 0.00010566264505352677), ('log_LH', 6.0207526696747077e-05), ('log_center_T', 2.639257820837918e-05),
# ('c_core_mass', 1.3094153529275165e-05), ('phase', 2.6275579269613185e-06)]


# Run including 'metallicity'; these values match the output recorded for this cell:
# [('initial_mass', 0.8839311221406747), ('log_L', 0.09154367925820746), ('surface_h1', 0.011118973747171336), ('surface_he3', 0.003213926433782181),
# ('log10_isochrone_age_yr', 0.0019023817652589513), ('log_g', 0.0017568286736215695), ('log_center_Rho', 0.00164855764759087), ('star_mdot', 0.0008576534936135826),
# ('surface_c12', 0.0007480646092296779), ('center_he4', 0.0004569624103238293), ('log_Teff', 0.00045617934471888817), ('center_gamma', 0.0004125023102093929),
# ('surface_he4', 0.00037589284989807364), ('center_h1', 0.0003755461984464765), ('surface_o16', 0.0003448193887394036), ('metallicity', 0.00029809926869628144),
# ('log_LHe', 0.00020249414526448388), ('he_core_mass', 0.00013768702458259513), ('center_c12', 9.819341505585615e-05), ('log_LH', 6.126644827201415e-05),
# ('log_center_T', 4.8022031714413744e-05), ('c_core_mass', 8.359851429477307e-06), ('phase', 2.7875434985057837e-06)]

[('initial_mass', 0.8839311221406747), ('log_L', 0.09154367925820746), ('surface_h1', 0.011118973747171336), ('surface_he3', 0.003213926433782181), ('log10_isochrone_age_yr', 0.0019023817652589513), ('log_g', 0.0017568286736215695), ('log_center_Rho', 0.00164855764759087), ('star_mdot', 0.0008576534936135826), ('surface_c12', 0.0007480646092296779), ('center_he4', 0.0004569624103238293), ('log_Teff', 0.00045617934471888817), ('center_gamma', 0.0004125023102093929), ('surface_he4', 0.00037589284989807364), ('center_h1', 0.0003755461984464765), ('surface_o16', 0.0003448193887394036), ('metallicity', 0.00029809926869628144), ('log_LHe', 0.00020249414526448388), ('he_core_mass', 0.00013768702458259513), ('center_c12', 9.819341505585615e-05), ('log_LH', 6.126644827201415e-05), ('log_center_T', 4.8022031714413744e-05), ('c_core_mass', 8.359851429477307e-06), ('phase', 2.7875434985057837e-06)]


In [17]:
# Reduce both primary splits to the columns retained by the fitted selector.
X_train_primary_transformed, X_test_primary_transformed = (
    sel_primary.transform(X_train_primary),
    sel_primary.transform(X_test_primary),
)

In [18]:
# Show the shapes of the reduced primary train and test matrices, one after the other.
display(X_train_primary_transformed.shape, X_test_primary_transformed.shape)

(873972, 21)

(291325, 21)

### Secondary model

In [19]:
# Fit a random forest on the secondary training split and let SelectFromModel keep
# the features whose importance reaches the threshold.
rf_secondary = RandomForestRegressor(random_state=1337, n_jobs=10, max_depth=30)
# The threshold is deliberately very low for now, for testing purposes: a feature
# is kept only if its importance is at least this value.
sel_secondary = SelectFromModel(estimator=rf_secondary, threshold=.00001)
sel_secondary.fit(X_train_secondary, y_train_secondary)

print("Importances: ", sel_secondary.estimator_.feature_importances_)

# `threshold_` is the explicit value passed above, not a library default, so the
# label says so.
print("Threshold used: ", sel_secondary.threshold_)

# Boolean mask over the input columns (True = kept) ...
features_secondary = sel_secondary.get_support()
# ... and the matching integer column indices, taken from the selector itself so
# this cell does not depend on a column count computed in another cell.
features_selected_secondary = sel_secondary.get_support(indices=True)
print("The features selected are columns: ", features_selected_secondary)

Importances:  [3.78980972e-04 2.02163642e-03 3.56513245e-03 4.13948791e-02
 1.97780778e-02 2.44875849e-02 1.99264806e-03 1.27926297e-03
 6.75585390e-01 4.21815939e-03 3.68111861e-03 1.74623114e-02
 3.03256838e-03 7.32448367e-03 1.02530574e-02 1.50936668e-01
 1.95753239e-02 4.09653378e-03 1.99023583e-03 2.22704747e-03
 3.75295214e-03 9.65948011e-04]
Default threshold:  1e-05
The features selected are columns:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]


In [20]:
# Rebuild the secondary feature frame (same columns dropped as for X_secondary) so
# that column names line up with the fitted forest's importances.
# NOTE(review): this reuses the name `col_value_dict` already assigned for the primary model.
secondary_feature_frame = phase_filtered_iso_df.drop(columns=['log_Teff', 'log_g', 'log_R'])
secondary_importances = sel_secondary.estimator_.feature_importances_
col_value_dict = associate_col_name_to_features(secondary_feature_frame, secondary_importances)

In [None]:
# Print the (feature name, importance) pairs from most to least important.
print(sorted(col_value_dict.items(), key=lambda item: item[1], reverse=True))

# Output pasted from a previous run, kept for comparison; these values match the
# output recorded for this cell:
# [('log_LHe', 0.6755853895829209), ('log_center_Rho', 0.15093666776698508), ('star_mdot', 0.04139487907428272), ('c_core_mass', 0.02448758494536492),
# ('he_core_mass', 0.01977807781235997), ('center_gamma', 0.01957532390292936), ('surface_he4', 0.017462311368331363), ('log_center_T', 0.01025305739193053),
# ('surface_o16', 0.007324483665342927), ('surface_h1', 0.004218159386315815), ('center_h1', 0.004096533783113224), ('phase', 0.0037529521402497176),
# ('surface_he3', 0.0036811186144779665), ('star_mass', 0.0035651324476967945), ('surface_c12', 0.003032568383012045), ('center_c12', 0.002227047468901086),
# ('initial_mass', 0.0020216364238288143), ('log_L', 0.0019926480605310243), ('center_he4', 0.001990235830341591), ('log_LH', 0.0012792629688576232),
# ('metallicity', 0.0009659480106704929), ('log10_isochrone_age_yr', 0.00037898097155584517)]

[('log_LHe', 0.6755853895829209), ('log_center_Rho', 0.15093666776698508), ('star_mdot', 0.04139487907428272), ('c_core_mass', 0.02448758494536492), ('he_core_mass', 0.01977807781235997), ('center_gamma', 0.01957532390292936), ('surface_he4', 0.017462311368331363), ('log_center_T', 0.01025305739193053), ('surface_o16', 0.007324483665342927), ('surface_h1', 0.004218159386315815), ('center_h1', 0.004096533783113224), ('phase', 0.0037529521402497176), ('surface_he3', 0.0036811186144779665), ('star_mass', 0.0035651324476967945), ('surface_c12', 0.003032568383012045), ('center_c12', 0.002227047468901086), ('initial_mass', 0.0020216364238288143), ('log_L', 0.0019926480605310243), ('center_he4', 0.001990235830341591), ('log_LH', 0.0012792629688576232), ('metallicity', 0.0009659480106704929), ('log10_isochrone_age_yr', 0.00037898097155584517)]


In [None]:
# Reduce both secondary splits to the columns retained by the fitted selector.
X_train_secondary_transformed, X_test_secondary_transformed = (
    sel_secondary.transform(X_train_secondary),
    sel_secondary.transform(X_test_secondary),
)

In [None]:
# Show the shapes of the reduced secondary train and test matrices, one after the other.
display(X_train_secondary_transformed.shape, X_test_secondary_transformed.shape)