### Preparing the data in HDF5 format
Converting the .fits file to HDF5 for better storage management and, mainly, for practice.

Importing the appropriate packages.

In [1]:
from astropy.table import Table
from astropy.io import fits
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import h5py
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the FITS table with astropy and convert it straight to a pandas DataFrame.
data = Table.read('/arc/home/aydanmckay/gaiahike/bp_rp_apogee.fits').to_pandas()

In [3]:
# (rows, columns) of the freshly loaded table.
data.shape

(642528, 230)

In [4]:
# Summary statistics of a single column, as a quick look at its value range.
data['bp_1'].describe()

count    6.425280e+05
mean     3.668565e+04
std      3.137355e+05
min     -1.128071e+00
25%      1.264928e+03
50%      5.745377e+03
75%      1.863285e+04
max      5.773333e+07
Name: bp_1, dtype: float64

In [5]:
# Drop every row that contains a missing or infinite value in any column.
# The pandas option `mode.use_inf_as_na` is deprecated (pandas >= 2.1), so
# +/-inf are converted to NaN explicitly before `dropna`. Rebinding `data`
# instead of dropping in place keeps the cell free of in-place mutation.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
data.shape

(481813, 230)

In [6]:
# Column names follow the pattern "<band>_<k>" for the values and
# "<band>e_<k>" for the companion columns, with band in ("bp", "rp") and
# k = 1..55. Both lists hold 110 names: all bp entries first, then all rp.
labels = [f"{band}_{k}" for band in ("bp", "rp") for k in range(1, 56)]
elabels = [f"{band}e_{k}" for band in ("bp", "rp") for k in range(1, 56)]

In [7]:
# Keep only the rows in which every `elabels` column lies strictly inside
# (-10, 10). A single vectorised row mask replaces 110 successive filters
# that each copied the whole frame; the surviving rows are the same because
# the per-column conditions are simply ANDed together (|x| < 10 is
# equivalent to -10 < x < 10, and NaN fails both forms).
in_range = (data[elabels].abs() < 10).all(axis=1)
data = data[in_range]
data.shape

(319316, 230)

In [8]:
# for small datasets
# (optional) uncomment the next line to continue with a seeded random
# subsample of 50,000 rows instead of the full table
# data = data.sample(n=50000,random_state=42)

In [9]:
# Size of the table that goes into the array building and splitting below.
data.shape

(319316, 230)

In [10]:
# Target columns, kept as pandas Series.
feh, teff, am, logg = (
    data[column] for column in ('fe_h', 'teff', 'alpha_m', 'logg')
)

# Stack the `labels` / `elabels` columns into 2-D arrays with one row per
# column name, i.e. shape (len(labels), n_rows).
xp = np.array([data[name] for name in labels])
xpe = np.array([data[name] for name in elabels])

In [11]:
# Transposed layout (n_rows, n_columns) - the form handed to train_test_split below.
xp.T.shape

(319316, 110)

In [12]:
# First split: hold out 10% of the rows as the test set. All six arrays are
# passed in one call so they are split with the same row selection.
(
    feh_train, feh_test,
    teff_train, teff_test,
    am_train, am_test,
    logg_train, logg_test,
    xp_train, xp_test,
    xpe_train, xpe_test,
) = train_test_split(
    feh, teff, am, logg, xp.T, xpe.T,
    test_size=0.1,
    random_state=42,
)

In [13]:
# Second split: carve 10% of the remaining training rows off as the
# validation set (the *_train names are rebound to the reduced training set).
(
    feh_train, feh_valid,
    teff_train, teff_valid,
    am_train, am_valid,
    logg_train, logg_valid,
    xp_train, xp_valid,
    xpe_train, xpe_valid,
) = train_test_split(
    feh_train, teff_train, am_train, logg_train, xp_train, xpe_train,
    test_size=0.1,
    random_state=42,
)

In [14]:
# Write the three splits into a single HDF5 file, one group per split:
#   group_1 -> training, group_2 -> validation, group_3 -> test.
# Every group receives three datasets:
#   'data'    : rows fe_h, logg, teff   (alpha_m is deliberately left out)
#   'label'   : the xp array, transposed back to (n_columns, n_rows)
#   'e_label' : the xpe array, same layout
hierarchicalFileName = "/arc/home/aydanmckay/datacuts.h5"
with h5py.File(hierarchicalFileName, 'w') as hierarchicalFile:
    train, valid, test = (
        hierarchicalFile.create_group(group_name)
        for group_name in ("group_1", "group_2", "group_3")
    )

    split_contents = (
        (train, [feh_train, logg_train, teff_train], xp_train, xpe_train),
        (valid, [feh_valid, logg_valid, teff_valid], xp_valid, xpe_valid),
        (test, [feh_test, logg_test, teff_test], xp_test, xpe_test),
    )

    # Parameter rows first ...
    for group, params, _, _ in split_contents:
        group['data'] = np.array(params)

    # ... then the xp / xpe arrays for each split.
    for group, _, values, value_errs in split_contents:
        group['label'] = values.T
        group['e_label'] = value_errs.T

    # Quick look at what was written.
    print(hierarchicalFile["/"])
    print(train)
    print(valid)
    print(train['data'])
    print(valid["label"])
    print(test['e_label'])

<HDF5 group "/" (3 members)>
<HDF5 group "/group_1" (3 members)>
<HDF5 group "/group_2" (3 members)>
<HDF5 dataset "data": shape (3, 258645), type "<f4">
<HDF5 dataset "label": shape (110, 28739), type "<f8">
<HDF5 dataset "e_label": shape (110, 31932), type "<f4">


In [15]:
# Min-max scale every row of the training 'label' dataset independently,
# keeping one fitted MinMaxScaler per row in `scalerlist`.
with h5py.File("/arc/home/aydanmckay/datacuts.h5", 'r') as f:
    d = f['group_1']['label']
    # Load the dataset into memory once and scale from that copy; indexing
    # the HDF5 dataset inside the loop would hit the file once per row.
    dset = d[:]
    # One scaler per row of the dataset (110 for the file written above),
    # derived from the data rather than hard-coded.
    scalerlist = [MinMaxScaler() for _ in range(dset.shape[0])]
    ydat = np.array([
        # each scaler sees its row as a single-feature column of shape (n_rows, 1)
        scaler.fit_transform(dset[[it]].T).flatten() for it,scaler in enumerate(scalerlist)
    ])
    print(ydat.shape[1])
    print(f['group_1']['label'].shape[1])
    print(f['group_1']['data'].shape)

258645
258645
(3, 258645)


In [16]:
# Dimensions of the training 'label' array read back from the HDF5 file.
dset.shape

(110, 258645)

In [17]:
# Unscaled values as stored in the file, for comparison with `ydat`.
dset

array([[ 1.17999886e+04,  2.55217838e+03,  7.99308238e+03, ...,
         2.82477386e+03,  1.01576763e+04,  9.22249892e+03],
       [-2.30519800e+03, -6.70363353e+01,  5.17541032e+01, ...,
        -1.61222500e+02, -1.21487884e+03, -1.07109800e+03],
       [ 8.66803049e+01, -9.33332427e+01, -4.00106822e+02, ...,
        -9.51195629e+01, -2.37114162e+02, -1.80038812e+02],
       ...,
       [ 5.89275446e-02, -1.54444874e-01,  7.79387617e-01, ...,
         1.76813685e-01,  4.68315161e-01, -5.28626841e-01],
       [-1.10515168e-04,  8.90941260e-03, -1.24666328e-01, ...,
        -7.04084781e-02,  1.64123560e-01,  1.62819881e-01],
       [ 1.00693836e-01, -4.25276426e-02, -9.64566855e-02, ...,
        -4.04207928e-02,  1.76766339e-02,  2.75183848e-02]])

In [18]:
# Scaled values; each row was transformed by its own MinMaxScaler.
ydat

array([[0.31951341, 0.06909373, 0.21642693, ..., 0.07647529, 0.27504155,
        0.24971806],
       [0.49709812, 0.6517204 , 0.65992698, ..., 0.6452136 , 0.57242228,
        0.5823553 ],
       [0.62251884, 0.56044663, 0.45466507, ..., 0.55983067, 0.51086815,
        0.53054885],
       ...,
       [0.61734292, 0.59502313, 0.69270654, ..., 0.62967438, 0.66016685,
        0.55588188],
       [0.71465556, 0.71576774, 0.69929758, ..., 0.70598768, 0.73490595,
        0.73474521],
       [0.57951297, 0.55293509, 0.54292738, ..., 0.55332606, 0.56410731,
        0.56593366]])