In [1]:
import pandas as pd
from pycaret.regression import setup, RegressionExperiment, compare_models
from pycaret.parallel import FugueBackend
from sklearn.decomposition import PCA
import numpy as np
import h5py
import polars as pl

In [2]:
# Data locations. `datadir` is kept as a plain string ending in '/' because
# paths in this notebook are built by string concatenation onto it.
datadir = '../../../data/APOGEE/'

training_set = datadir + 'training_data.h5'
normalization_data = datadir + 'mean_and_std.npy'

# Single-component PCA; it is fitted on the normalized labels further down to
# collapse them into one regression target.
# random_state pins the projection in case scikit-learn's default 'auto'
# solver resolves to the stochastic 'randomized' solver (this depends on the
# scikit-learn version and the data shape). 123 matches the session_id that
# is passed to PyCaret later in the notebook.
pca = PCA(n_components=1, random_state=123)

In [3]:
# Read the saved normalization statistics; row 0 is used as the label means
# and row 1 as the label standard deviations.
mean_and_std = np.load(normalization_data)
mean_labels, std_labels = mean_and_std[0], mean_and_std[1]

In [4]:
def normalize(labels, mean=None, std=None):
    """Standardize labels as ``(labels - mean) / std``.

    Parameters
    ----------
    labels : array-like
        Values to standardize; must broadcast against ``mean`` and ``std``.
    mean : array-like, optional
        Values subtracted from ``labels``. When omitted, the notebook-level
        ``mean_labels`` is used (the original behavior).
    std : array-like, optional
        Values ``labels - mean`` is divided by. When omitted, the
        notebook-level ``std_labels`` is used (the original behavior).

    Returns
    -------
    The element-wise result of ``(labels - mean) / std``.
    """
    # Fall back to the globals loaded from the normalization file so that
    # existing single-argument calls keep working unchanged.
    if mean is None:
        mean = mean_labels
    if std is None:
        std = std_labels
    return (labels - mean) / std

In [5]:
# Load spectra and labels
with h5py.File(training_set, 'r') as data_F:
    x_train = data_F['spectrum'][:]
    # column_stack gives one column per label whether each dataset is stored
    # with shape (N,) or (N, 1); for (N, 1) inputs it is identical to hstack.
    labels_raw = np.column_stack((data_F['TEFF'][:],
                                  data_F['LOGG'][:],
                                  data_F['FE_H'][:]))

# Normalize the labels, then transform them into a single value per spectrum
y_train = pca.fit_transform(normalize(labels_raw))

# The features and the target are later joined column-wise with pd.concat,
# which aligns on the index: a row-count mismatch would be padded with NaNs
# instead of raising, so check it here.
assert x_train.shape[0] == y_train.shape[0], (
    'spectra and labels have different numbers of rows: '
    + str(x_train.shape[0]) + ' vs ' + str(y_train.shape[0])
)

# Number of output labels (after the PCA projection) and of input fluxes
num_labels = y_train.shape[1]
num_fluxes = x_train.shape[1]

print(f'Each spectrum contains {num_fluxes} wavelength bins')
print(f'Training set includes {x_train.shape[0]} spectra.')

Each spectrum contains 7214 wavelength bins
Training set includes 44784 spectra.


In [6]:
# Join the spectra (default integer column labels) and the single-column
# target side by side into one training frame.
df_train = pd.concat(
    objs=(
        pd.DataFrame(data=x_train),
        pd.DataFrame(data=y_train, columns=['target']),
    ),
    axis='columns',
)

df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7205,7206,7207,7208,7209,7210,7211,7212,7213,target
0,1.036881,1.027561,1.017815,1.016619,1.016466,1.004510,0.988281,0.990171,1.012946,1.038713,...,0.947867,0.952879,0.954746,0.937672,0.933191,0.944470,0.954676,0.955576,0.959755,0.134516
1,1.035711,1.023595,1.028486,1.033887,1.020416,1.021567,1.025008,1.004612,1.001988,1.020844,...,0.952132,0.963200,0.989394,0.949660,0.917875,0.951611,0.936365,0.944244,0.936340,1.622526
2,1.011392,1.011521,1.008502,1.009408,1.007445,0.985910,0.955840,0.954559,0.987229,1.022529,...,0.935629,0.948573,0.939322,0.935014,0.922320,0.931384,0.945553,0.950321,0.961997,-0.750942
3,1.058238,1.067501,1.062629,1.050290,1.034646,1.026603,1.018828,1.014678,1.039715,1.064574,...,0.946117,0.961544,0.968247,0.955850,0.948623,0.956111,0.945956,0.967346,1.005403,0.105903
4,1.002464,0.991758,0.982990,0.983890,0.988004,0.973203,0.937438,0.924305,0.964944,1.020916,...,0.955617,0.981374,0.969568,0.947591,0.937702,0.954310,0.967761,0.987397,1.007436,-0.908189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44779,1.090833,1.085457,1.082925,1.083167,1.073133,1.063151,1.060633,1.055826,1.055418,1.066160,...,0.951984,0.980766,0.967344,0.974858,0.994483,0.970611,1.000234,0.991213,0.962700,0.606724
44780,1.002617,0.996245,0.998153,1.003528,1.000306,0.978228,0.949339,0.945489,0.979600,1.030298,...,0.943019,0.950031,0.940917,0.936455,0.931839,0.931907,0.955119,0.968381,0.971754,-0.974801
44781,1.056679,1.070088,1.062519,1.057347,1.069799,1.057868,1.021001,1.012670,1.039869,1.065146,...,0.969660,0.965846,0.974367,0.966351,0.963690,0.964948,0.971462,0.993141,0.979873,-0.938594
44782,1.067051,1.066485,1.068391,1.072058,1.071626,1.056566,1.033489,1.027605,1.045373,1.062937,...,0.944424,0.944076,0.945529,0.944968,0.950801,0.961402,0.960059,0.959860,0.962276,0.432227


In [9]:
# Configure PyCaret's module-level (functional API) regression experiment on
# the combined frame, with a fixed session id and a single job.
s = setup(
    data=df_train,
    target='target',
    session_id=123,
    n_jobs=1,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Regression
3,Original data shape,"(44784, 7215)"
4,Transformed data shape,"(44784, 7215)"
5,Transformed train set shape,"(31348, 7215)"
6,Transformed test set shape,"(13436, 7215)"
7,Numeric features,7214
8,Preprocess,True
9,Imputation type,simple


In [11]:
# Same data, target and session id, configured through PyCaret's
# object-oriented API.
# NOTE(review): n_jobs is not passed here, unlike the module-level setup(...)
# call, and none of the cells visible in this notebook use `exp` afterwards;
# confirm whether this second setup is still needed.
exp = RegressionExperiment()
exp.setup(
    data=df_train,
    target='target',
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,target
2,Target type,Regression
3,Original data shape,"(44784, 7215)"
4,Transformed data shape,"(44784, 7215)"
5,Transformed train set shape,"(31348, 7215)"
6,Transformed test set shape,"(13436, 7215)"
7,Numeric features,7214
8,Preprocess,True
9,Imputation type,simple


<pycaret.regression.oop.RegressionExperiment at 0x2c6b76324a0>

In [13]:
# Compare PyCaret's candidate regressors and keep the result in `best`.
# This is the module-level function, not `exp.compare_models()`, so it
# presumably runs on the experiment configured by the module-level setup(...)
# call rather than on `exp` -- TODO confirm against the PyCaret docs.
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ridge,Ridge Regression,0.0758,0.0165,0.1271,0.9777,0.0657,1.615,39.9
br,Bayesian Ridge,0.0758,0.0165,0.1271,0.9777,0.0657,1.6157,247.504
omp,Orthogonal Matching Pursuit,0.0817,0.0191,0.1362,0.9741,0.0696,1.9351,40.95
lr,Linear Regression,0.087,0.0228,0.1497,0.9693,0.0757,1.7763,130.885
par,Passive Aggressive Regressor,0.1092,0.0257,0.1599,0.9655,0.0876,2.4651,89.763
huber,Huber Regressor,0.1246,0.0405,0.2008,0.9457,0.0971,2.3843,1132.814
dt,Decision Tree Regressor,0.177,0.0592,0.2433,0.9204,0.1367,5.8922,574.116
knn,K Neighbors Regressor,0.2314,0.1181,0.3435,0.8414,0.1836,5.9535,45.105
lasso,Lasso Regression,0.6631,0.7446,0.8628,-0.0003,0.5441,1.0763,30.605
en,Elastic Net,0.6631,0.7446,0.8628,-0.0003,0.5441,1.0763,42.567


Processing:   0%|          | 0/81 [00:00<?, ?it/s]