### Preparing the data in HDF5 format
Converting the .fits file to HDF5 for better storage management and, mainly, for practice.

Importing the appropriate packages.

In [24]:
from astropy.table import Table
from astropy.io import fits
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import h5py
from sklearn.preprocessing import MinMaxScaler

In [25]:
# Read the FITS table with astropy and convert it to a pandas DataFrame.
fits_path = '/arc/home/aydanmckay/gaiahike/bp_rp_apogee.fits'
data = Table.read(fits_path).to_pandas()

In [26]:
# (rows, columns) of the table as loaded, before any cleaning.
data.shape

(642528, 230)

In [27]:
# Per-column summary statistics; a `count` below the row total means the column has missing values.
data.describe()

Unnamed: 0,sfd_ebv,gaiaedr3_phot_g_mean_mag,source_id,fe_h,m_h,alpha_m,logg,teff,ra,dec,...,rpe_46,rpe_47,rpe_48,rpe_49,rpe_50,rpe_51,rpe_52,rpe_53,rpe_54,rpe_55
count,535928.0,642528.0,642528.0,572305.0,572279.0,572028.0,608715.0,608715.0,642528.0,642528.0,...,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0
mean,0.220359,13.210278,2.830327e+18,-0.234036,-0.234631,0.072114,3.094697,5013.174316,175.519527,7.960083,...,4.854111,3.789136,3.838794,3.560125,3.602133,3.097025,2.747941,2.496227,1.032372,0.588982
std,1.0759,1.901047,1.841647e+18,0.371935,0.375699,0.10239,1.222225,1338.82605,95.94366,41.185402,...,96.112526,67.44162,71.901985,66.543945,66.796387,54.086079,48.531826,42.599335,16.954266,12.840849
min,0.002474,2.65168,2851858000000.0,-2.4686,-2.4686,-0.714365,-0.482825,3100.844482,0.000103,-87.224808,...,0.091793,0.084758,0.064332,0.065701,0.075551,0.055099,0.053387,0.048981,0.02319,0.011225
25%,0.029444,11.9594,1.313496e+18,-0.40415,-0.40294,0.004917,2.308385,4412.790039,89.839767,-23.176965,...,0.500238,0.453001,0.441088,0.393016,0.400751,0.367162,0.324858,0.293619,0.110627,0.056413
50%,0.06241,13.1443,2.536361e+18,-0.17464,-0.17661,0.042937,2.995981,4800.755859,178.490082,16.192937,...,0.972698,0.823608,0.848126,0.764647,0.757259,0.689114,0.592611,0.531611,0.203665,0.104177
75%,0.208363,14.5815,4.20489e+18,0.009171,0.008376,0.109349,4.314407,5223.474609,261.744846,41.740414,...,2.151097,1.763614,1.771294,1.62709,1.622607,1.431115,1.250225,1.124677,0.462224,0.24094
max,62.374294,20.164499,6.917528e+18,0.95789,0.96857,0.992785,5.370551,19869.988281,359.999181,87.608246,...,44037.816406,31293.429688,33479.109375,28020.875,29163.664062,23545.835938,22205.037109,19124.441406,7081.585449,5876.122559


In [28]:
# Drop every row that contains a missing or infinite value in any column.
# Infinities are converted to NaN explicitly because the pandas option
# `mode.use_inf_as_na` is deprecated (pandas >= 2.1); the surviving rows are
# the same. Rebinding `data` instead of using `inplace=True` keeps the cell
# safe to re-run.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
data.shape

(481813, 230)

In [29]:
# Column names are generated rather than typed out: for each prefix the
# indices run 1..55, with all "bp" names first and all "rp" names second.
# NOTE(review): presumably the BP/RP coefficient columns (`labels`) and their
# uncertainties (`elabels`) -- confirm against the FITS header.
N_COEFFS_PER_BAND = 55
_coeff_indices = range(1, N_COEFFS_PER_BAND + 1)

labels = [f"{band}_{idx}" for band in ("bp", "rp") for idx in _coeff_indices]
elabels = [f"{band}_{idx}" for band in ("bpe", "rpe") for idx in _coeff_indices]

In [30]:
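# Optional outlier cut (currently disabled): keep only rows whose values in
# every error column lie strictly between -10 and 10.
# for label in elabels:
#     data = data[(data[label] < 10) & (data[label] > -10)]
# data.shape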

In [31]:
# Optional (currently disabled): draw a reproducible 50,000-row subsample
# to work with a smaller dataset.
# data = data.sample(n=50000,random_state=42)

In [32]:
# (rows, columns) of the table that the following cells operate on.
data.shape

(481813, 230)

In [33]:
# Pull the individual target columns out of the cleaned table (pandas Series).
feh = data['fe_h']
teff = data['teff']
am = data['alpha_m']
logg = data['logg']
gmag = data['gaiaedr3_phot_g_mean_mag']


def normalize(x, n):
    """Return ``x`` divided by ``10 ** (8.5 - n / 2.5)``.

    NOTE(review): presumably rescales values by a magnitude ``n``; it is not
    called in this cell -- confirm whether it is still needed.
    """
    return x / (10 ** (8.5 - n / 2.5))


# Select all coefficient / error columns at once, giving arrays of shape
# (n_columns, n_rows). The previous loop zipped the column names together with
# `gmag` (one value per row), which bound an unused variable and would have
# silently dropped columns for a table with fewer rows than column names.
xp = data[labels].to_numpy().T
xpe = data[elabels].to_numpy().T
assert xp.shape == (len(labels), len(data))
assert xpe.shape == (len(elabels), len(data))

In [34]:
# After transposing, rows are table rows and columns are the `labels` entries.
xp.T.shape

(481813, 110)

In [35]:
# Hold out 10% of the rows as the test set. All arrays are split together so
# their rows stay aligned, and the fixed seed makes the split reproducible.
# train_test_split returns one (train, test) pair per input, in input order.
(
    gmag_train, gmag_test,
    feh_train, feh_test,
    teff_train, teff_test,
    am_train, am_test,
    logg_train, logg_test,
    xp_train, xp_test,
    xpe_train, xpe_test,
) = train_test_split(
    gmag, feh, teff, am, logg, xp.T, xpe.T,
    test_size=0.1,
    random_state=42,
)

In [36]:
# Carve a validation set out of the remaining training rows (10% of them).
# NOTE: this cell overwrites its own inputs (`*_train`), so re-running it on
# its own shrinks the training set again; re-run the previous split cell first.
(
    gmag_train, gmag_valid,
    feh_train, feh_valid,
    teff_train, teff_valid,
    am_train, am_valid,
    logg_train, logg_valid,
    xp_train, xp_valid,
    xpe_train, xpe_valid,
) = train_test_split(
    gmag_train, feh_train, teff_train, am_train, logg_train, xp_train, xpe_train,
    test_size=0.1,
    random_state=42,
)

In [37]:
# Size of the final training split after both hold-outs.
xp_train.shape

(390267, 110)

In [38]:
# Write the train / validation / test splits into one HDF5 file, one group per split.
hierarchicalFileName = "/arc/home/aydanmckay/apogee_bprp_gmag.h5"

# group name -> (parameter columns, coefficients, coefficient errors, G magnitudes)
# The alpha_m split exists (am_*) but is deliberately not stored.
splits = {
    "group_1": ((feh_train, logg_train, teff_train), xp_train, xpe_train, gmag_train),
    "group_2": ((feh_valid, logg_valid, teff_valid), xp_valid, xpe_valid, gmag_valid),
    "group_3": ((feh_test, logg_test, teff_test), xp_test, xpe_test, gmag_test),
}

with h5py.File(hierarchicalFileName, 'w') as hierarchicalFile:
    for group_name, (params, coeffs, coeff_errs, mags) in splits.items():
        group = hierarchicalFile.create_group(group_name)
        # 'data' stacks fe_h, logg, teff as rows -> shape (3, n_rows).
        group['data'] = np.array([*params])
        # Coefficients are stored transposed -> shape (n_columns, n_rows).
        group['label'] = coeffs.T
        group['e_label'] = coeff_errs.T
        group['gmag'] = mags

    train = hierarchicalFile["group_1"]
    valid = hierarchicalFile["group_2"]
    test = hierarchicalFile["group_3"]

    # Quick look at what was written.
    print(hierarchicalFile["/"])
    print(train)
    print(valid)
    print(train['data'])
    print(valid["label"])
    print(test['e_label'])
    print(train['gmag'])

<HDF5 group "/" (3 members)>
<HDF5 group "/group_1" (4 members)>
<HDF5 group "/group_2" (4 members)>
<HDF5 dataset "data": shape (3, 390267), type "<f4">
<HDF5 dataset "label": shape (110, 43364), type "<f8">
<HDF5 dataset "e_label": shape (110, 48182), type "<f4">
<HDF5 dataset "gmag": shape (390267,), type "<f4">


In [39]:
# Min-max scale each row of the training 'label' dataset independently,
# keeping one fitted scaler per row in `scalerlist`.
with h5py.File("/arc/home/aydanmckay/apogee_bprp_gmag.h5", 'r') as f:
    d = f['group_1']['label']
    dn = f['group_1']['gmag']
    # Load both datasets fully into memory once.
    dnset = dn[:]
    dset = d[:]
    # One scaler per row, sized from the data rather than a hard-coded 110.
    scalerlist = [MinMaxScaler() for _ in range(dset.shape[0])]
    # Slice the in-memory copy instead of re-reading every row from disk.
    # MinMaxScaler expects (n_samples, n_features), hence the (n_rows, 1) column.
    ydat = np.array([
        scaler.fit_transform(dset[[it]].T).flatten() for it,scaler in enumerate(scalerlist)
    ])
    print(ydat.shape[1])
    print(f['group_1']['label'].shape[1])
    print(f['group_1']['data'].shape)

390267
390267
(3, 390267)


In [41]:
# In-memory copy of the training 'gmag' dataset read back from the HDF5 file.
# NOTE(review): execution counts from here on are out of order, so the outputs may be stale.
dnset

array([14.1472, 14.2514, 11.4446, ..., 11.5993, 11.2877, 14.1614],
      dtype=float32)

In [16]:
# Shape of the training 'label' array as stored in the file.
dset.shape

(110, 390267)

In [17]:
# Unscaled training 'label' values read back from the file, for comparison with `ydat`.
dset

array([[ 2.32948541e+03,  1.87748561e+03,  3.05437837e+04, ...,
         2.93273792e+04,  3.36147429e+04,  1.93787806e+03],
       [ 4.07548300e+01,  1.79590140e+02, -1.31551022e+03, ...,
        -6.05281620e+03,  2.47677654e+02,  2.76209241e+02],
       [-1.13494776e+02, -9.85013410e+01, -1.10829670e+03, ...,
         2.97819643e+02, -1.54011844e+03, -9.17349044e+01],
       ...,
       [ 4.09214963e-01,  1.84395003e-01,  6.10825795e-01, ...,
         1.21317120e-01,  1.98121742e-01, -1.97503163e-01],
       [ 2.58034467e-02, -6.32892527e-02, -7.94160894e-02, ...,
        -3.49411580e-01, -5.80855590e-01,  2.35294369e-01],
       [-3.63938200e-02,  4.51327252e-02,  1.19896489e+00, ...,
         3.12188173e-02, -2.27753775e-02,  1.29013234e-01]])

In [18]:
# Min-max scaled counterpart of `dset`; each row was scaled independently.
ydat

array([[6.51188736e-04, 5.24746209e-04, 8.54386303e-03, ...,
        8.20358574e-03, 9.40293385e-03, 5.41640407e-04],
       [9.66160955e-01, 9.66259790e-01, 9.65195451e-01, ...,
        9.61823038e-01, 9.66308260e-01, 9.66328571e-01],
       [9.77605555e-02, 9.78111342e-02, 9.44047038e-02, ...,
        9.91480782e-02, 9.29480020e-02, 9.78339600e-02],
       ...,
       [2.58482625e-01, 2.58350438e-01, 2.58601166e-01, ...,
        2.58313351e-01, 2.58358509e-01, 2.58125895e-01],
       [5.56681276e-01, 5.56389712e-01, 5.56336936e-01, ...,
        5.55453350e-01, 5.54695928e-01, 5.57366855e-01],
       [4.31522500e-01, 4.31693116e-01, 4.34107820e-01, ...,
        4.31663997e-01, 4.31551000e-01, 4.31868659e-01]])

In [23]:
# Transposed view of `dset`: one row per table row.
dset.T.shape

(390267, 110)