In [66]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


import astropy as ap
from astropy.table import QTable


from sklearn.model_selection import train_test_split, KFold, GridSearchCV

from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix, explained_variance_score, mean_squared_error, max_error, mean_absolute_error, root_mean_squared_error
from scipy.stats import pearsonr

from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.decomposition import PCA, KernelPCA
# No feature selection needed because there are only a few columns

import read_mist_models

def gaussian(dsts, kernel_width=.5):
    """Convert distances into Gaussian-shaped weights.

    Evaluates ``exp(-dsts**2 / kernel_width)`` element-wise: a distance of 0
    yields a weight of 1 and the weight decays towards 0 as the distance grows.

    Parameters
    ----------
    dsts : array-like
        Distances. Anything ``np.asarray`` accepts (scalar, list, ndarray).
    kernel_width : float, default 0.5
        Positive divisor applied to the squared distance; larger values make
        the weights decay more slowly.

    Returns
    -------
    numpy.ndarray
        Weights with the same shape as ``dsts`` (a numpy scalar for scalar
        input).

    Raises
    ------
    ValueError
        If ``kernel_width`` is not strictly positive.
    """
    if kernel_width <= 0:
        raise ValueError("kernel_width must be strictly positive")
    # Coerce so plain Python lists work too (``list ** 2`` is a TypeError).
    dsts = np.asarray(dsts)
    weights = np.exp(-(dsts**2)/kernel_width)
    return weights

In [13]:
def get_iso_data_panda(file):
    """Load a MIST isochrone file and flatten it into a pandas DataFrame.

    Every isochrone block exposed by ``read_mist_models.ISO(file).isos`` is
    concatenated, in order, into four columns.

    Parameters
    ----------
    file : str
        Path handed to ``read_mist_models.ISO``.

    Returns
    -------
    pandas.DataFrame
        Columns ``age``, ``logTeff``, ``log_g`` and ``mass``, one row per
        isochrone point.
    """
    isochrones = read_mist_models.ISO(file)

    # (column name inside each isochrone block, column name in the output)
    # NOTE: 'log_L' was considered at some point but is not collected.
    column_map = (
        ('log10_isochrone_age_yr', 'age'),
        ('log_Teff', 'logTeff'),
        ('log_g', 'log_g'),
        ('star_mass', 'mass'),
    )

    collected = {target: [] for _, target in column_map}
    for block in isochrones.isos:
        for source, target in column_map:
            collected[target].extend(block[source])

    output_names = tuple(target for _, target in column_map)
    table = QTable(
        [collected[target] for target in output_names],
        names=output_names,
        meta={'name': "iso data"},
    )
    return table.to_pandas()

In [14]:
# Isochrone file used for the whole notebook (path is relative to the working directory).
mist_iso_path = "data/MIST_v1.2_vvcrit0.0_basic_isos/MIST_v1.2_feh_p0.00_afe_p0.0_vvcrit0.0_basic.txt"
full_data = get_iso_data_panda(mist_iso_path)

Reading in: data/MIST_v1.2_vvcrit0.0_basic_isos/MIST_v1.2_feh_p0.00_afe_p0.0_vvcrit0.0_basic.txt


In [15]:
# Show the loaded table to sanity-check its columns (age, logTeff, log_g, mass) and size.
display(full_data)

Unnamed: 0,age,logTeff,log_g,mass
0,5.0,3.468541,3.116651,0.100000
1,5.0,3.469176,3.114042,0.101392
2,5.0,3.471116,3.106133,0.105660
3,5.0,3.473035,3.098417,0.109915
4,5.0,3.474944,3.090875,0.114180
...,...,...,...,...
103984,10.3,4.370643,7.779334,0.528715
103985,10.3,4.363336,7.782175,0.528717
103986,10.3,4.356026,7.784962,0.528720
103987,10.3,4.348711,7.787695,0.528724


In [18]:
# Keep only the rows below the mass cut. A boolean mask selects them directly
# instead of blanking the other rows to NaN with `.where` and dropping them
# afterwards; `dropna()` is kept so rows with missing values are still removed.
MAX_MASS = 5  # upper bound on `mass` (presumably solar masses -- TODO confirm units)
no_massive_stars_data = (
    full_data.loc[full_data["mass"] < MAX_MASS]
    .dropna()
    .reset_index(drop=True)
)

In [45]:
# Features: every column except the target. Target: the `mass` column.
feature_frame = no_massive_stars_data.drop(columns=['mass'])
X = feature_frame.to_numpy()
y = no_massive_stars_data['mass'].to_numpy()

# Hold out 25% of the rows (the *_IVS arrays); the fixed seed keeps the split stable.
X_TRAIN, X_IVS, y_TRAIN, y_IVS = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1337,
)

# Sanity check on the resulting shapes.
print(X_TRAIN.shape, X_IVS.shape)
print(y_TRAIN.shape, y_IVS.shape)

(61773, 3) (20592, 3)
(61773,) (20592,)


## Linear model

In [58]:
# 5-fold cross-validation of an ordinary least-squares model on the training split.
# The shuffle is seeded so the printed metrics are reproducible after a kernel restart.
kf = KFold(n_splits=5, shuffle=True, random_state=1337)
fold_truths, fold_preds = [], []
for train_index, test_index in kf.split(X_TRAIN):
    X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
    y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

    # One fit per fold is enough. (Lasso / Ridge with alpha=0.0001 were tried
    # as drop-in alternatives here.)
    mdl = LinearRegression().fit(X_train, y_train)
    preds = mdl.predict(X_test)
    fold_preds.append(preds)
    fold_truths.append(y_test)

# Pool the out-of-fold predictions: every training row is scored exactly once.
PREDS = np.concatenate(fold_preds)
TRUTH = np.concatenate(fold_truths)

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.30723352722909014
RMSE:  0.9783986016727161
CORR:  0.5542865083318406
PVAL:  0.0
MAX_ER:  3.176215605764966
MEAN_ER:  0.743663959413219


## Decision tree regressor

In [70]:
# Manual grid search over (max_depth, min_samples_leaf) for a decision-tree regressor.
# Scores are appended in row-major order: depth is the outer loop, min_samples_leaf
# the inner one.
RVEs, RMSEs, CORRs, PVALs, MAXERRs, MEANABSERRs = [], [], [], [], [], []

depthsteps, minsamplesteps = [2,5,10,15,20,50,100], [1,2,5,10,15,20,50]

# A single seeded splitter: with an integer random_state every call to
# `kf.split` yields the same folds, so all hyper-parameter pairs are compared
# on identical data and the results are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=1337)

for max_depth in depthsteps:
    for min_leaf in minsamplesteps:
        fold_truths, fold_preds = [], []
        for train_index, test_index in kf.split(X_TRAIN):
            X_train, X_test = X_TRAIN[train_index], X_TRAIN[test_index]
            y_train, y_test = y_TRAIN[train_index], y_TRAIN[test_index]

            # Seed the tree as well: feature order is permuted at each split,
            # so ties could otherwise be broken differently between runs.
            mdl = DecisionTreeRegressor(
                max_depth=max_depth,
                min_samples_leaf=min_leaf,
                random_state=1337,
            )
            mdl.fit(X_train, y_train)
            preds = mdl.predict(X_test)
            fold_preds.append(preds)
            fold_truths.append(y_test)

        # Pool the out-of-fold predictions before scoring this pair.
        PREDS = np.concatenate(fold_preds)
        TRUTH = np.concatenate(fold_truths)

        RVEs.append(explained_variance_score(TRUTH, PREDS))
        RMSEs.append(root_mean_squared_error(TRUTH, PREDS))
        corr, pval = pearsonr(TRUTH, PREDS)
        CORRs.append(corr)
        PVALs.append(pval)
        MAXERRs.append(max_error(TRUTH, PREDS))
        MEANABSERRs.append(mean_absolute_error(TRUTH, PREDS))

In [76]:
# Arrange the flat score lists into (depth x min_samples_leaf) tables.
# The shape is derived from the grid definitions so that changing `depthsteps`
# or `minsamplesteps` does not require editing this cell. The lists must hold
# exactly one score per grid point, in depth-major order.
grid_shape = (len(depthsteps), len(minsamplesteps))

RVEs_np = np.array(RVEs).reshape(grid_shape)
CORRs_np = np.array(CORRs).reshape(grid_shape)
MAEs_np = np.array(MEANABSERRs).reshape(grid_shape)
RMSEs_np = np.array(RMSEs).reshape(grid_shape)
MAXERRs_np = np.array(MAXERRs).reshape(grid_shape)

# Rows are max_depth values, columns are min_samples_leaf values.
df_RVE = pd.DataFrame(RVEs_np, columns = minsamplesteps, index = depthsteps).round(4)
df_CORR = pd.DataFrame(CORRs_np, columns = minsamplesteps, index = depthsteps).round(4)
df_MAE = pd.DataFrame(MAEs_np, columns = minsamplesteps, index = depthsteps).round(4)
df_RMSE = pd.DataFrame(RMSEs_np, columns = minsamplesteps, index = depthsteps).round(4)
df_MAXERR = pd.DataFrame(MAXERRs_np, columns = minsamplesteps, index = depthsteps).round(4)

print("RVE: \n",df_RVE)
print("\nCORR Score: \n",df_CORR)
print("\nMAE: \n",df_MAE)
print("\nRMSE: \n",df_RMSE)
print("\nMAXERR: \n",df_MAXERR)

RVE: 
          1       2       5       10      15      20      50
2    0.4907  0.4901  0.4907  0.4906  0.4907  0.4906  0.4907
5    0.7797  0.7794  0.7796  0.7788  0.7787  0.7801  0.7805
10   0.9212  0.9210  0.9242  0.9191  0.9196  0.9202  0.9232
15   0.9895  0.9887  0.9879  0.9859  0.9830  0.9801  0.9701
20   0.9915  0.9911  0.9900  0.9880  0.9835  0.9831  0.9707
50   0.9913  0.9916  0.9898  0.9881  0.9840  0.9825  0.9714
100  0.9923  0.9913  0.9889  0.9870  0.9850  0.9821  0.9700

CORR Score: 
          1       2       5       10      15      20      50
2    0.7005  0.7001  0.7005  0.7004  0.7005  0.7005  0.7005
5    0.8830  0.8828  0.8829  0.8825  0.8824  0.8832  0.8834
10   0.9598  0.9597  0.9614  0.9587  0.9590  0.9593  0.9608
15   0.9948  0.9943  0.9939  0.9929  0.9915  0.9900  0.9849
20   0.9958  0.9955  0.9950  0.9940  0.9917  0.9915  0.9853
50   0.9957  0.9958  0.9949  0.9941  0.9920  0.9912  0.9856
100  0.9961  0.9957  0.9945  0.9935  0.9925  0.9910  0.9849

MAE: 
          1

In [85]:
# Refit a decision tree on the full training split and score it on the held-out
# *_IVS data. The hyper-parameters are presumably taken from the grid above
# (depth 15 / leaf 1) -- confirm if the grid changes.
# random_state is fixed so the fitted tree, and therefore the metrics below,
# are reproducible between runs.
mdl = DecisionTreeRegressor(max_depth=15, min_samples_leaf=1, random_state=1337)
mdl.fit(X_TRAIN, y_TRAIN)
preds = mdl.predict(X_IVS)

PREDS=preds
TRUTH=y_IVS

print("RVE: ",explained_variance_score(TRUTH, PREDS))
print("RMSE: ",root_mean_squared_error(TRUTH, PREDS))
corr, pval=pearsonr(TRUTH, PREDS)
print("CORR: ",corr)
print("PVAL: ",pval)
print("MAX_ER: ",max_error(TRUTH, PREDS))
print("MEAN_ER: ",mean_absolute_error(TRUTH, PREDS))

RVE:  0.9907418907414007
RMSE:  0.11364131464243916
CORR:  0.9953611542462999
PVAL:  0.0
MAX_ER:  4.101179007043641
MEAN_ER:  0.04763813434735741


In [111]:
# List every held-out sample whose error (truth - prediction) is larger than
# `bound` in absolute value, then print the number of samples inspected and
# the number of samples flagged.
bound = 0.2
total = len(preds)
count = 0
for idx, predicted in enumerate(preds):
    actual = y_IVS[idx]
    diff = actual - predicted
    if abs(diff) > bound:
        count += 1
        print(f"prediction : {predicted}, truth : {actual}, diff : {diff}")

print(total)
print(count)

prediction : 4.490535472910274, truth : 4.15882380433359, diff : -0.3317116685766841
prediction : 1.1451707888146063, truth : 0.7503384814453014, diff : -0.39483230736930497
prediction : 4.622262621078533, truth : 4.277422431027421, diff : -0.34484019005111133
prediction : 1.3157522521947977, truth : 1.0585699134808366, diff : -0.2571823387139611
prediction : 4.57956938240723, truth : 4.059389323599795, diff : -0.5201800588074343
prediction : 4.622262621078533, truth : 4.279447309190718, diff : -0.342815311887815
prediction : 1.7134898742559406, truth : 1.4310512148560115, diff : -0.28243865939992907
prediction : 4.622262621078533, truth : 4.914176698822297, diff : 0.29191407774376454
prediction : 4.786621167419558, truth : 4.5864466655020095, diff : -0.20017450191754804
prediction : 4.403799807851855, truth : 4.68151734031362, diff : 0.27771753246176534
prediction : 1.9121150533740707, truth : 2.263939468397755, diff : 0.3518244150236842
prediction : 3.2880701834953, truth : 3.0132170