### Preparing the data in HDF5 format
Converting the .fits file to HDF5 for better storage management and, mainly, for practice.

Importing the appropriate packages.

In [1]:
from astropy.table import Table
from astropy.io import fits
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import h5py
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the FITS table from disk and convert it straight to a pandas DataFrame.
fits_path = '/arc/home/aydanmckay/gaiahike/bp_rp_apogee.fits'
data = Table.read(fits_path).to_pandas()

In [3]:
# (rows, columns) of the table exactly as loaded, before any cleaning.
data.shape

(642528, 230)

In [4]:
# Summary statistics for the rp_55 column. In the output the min/max are orders of
# magnitude beyond the quartiles, i.e. the column contains extreme outliers.
data['rp_55'].describe()

count    642528.000000
mean         -0.034857
std           9.566036
min       -2129.234821
25%          -0.022924
50%           0.037749
75%           0.151573
max        4242.768069
Name: rp_55, dtype: float64

In [5]:
# Drop every row that has a missing or infinite value in any column.
# Infinities are converted to NaN explicitly instead of relying on the pandas
# option 'mode.use_inf_as_na', which is deprecated (pandas 2.1) and removed in
# later releases. Rebinding `data` rather than using inplace=True also keeps the
# cell safe to re-run.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
data.shape

(481813, 230)

In [6]:
# Column names are generated rather than typed out by hand, so the value and error
# lists cannot drift apart.
# Order: bp_1..bp_55 followed by rp_1..rp_55 (110 names); `elabels` holds the
# matching error columns bpe_1..bpe_55, rpe_1..rpe_55 in the same order.
labels = [f"{band}_{index}" for band in ("bp", "rp") for index in range(1, 56)]
elabels = [f"{band}e_{index}" for band in ("bp", "rp") for index in range(1, 56)]

In [7]:
# Keep only the rows in which every error column lies strictly inside (-10, 10).
# A single boolean mask over all error columns selects the same rows as filtering
# column by column, without re-copying the full frame once per column.
errors = data[elabels]
within_cut = ((errors < 10) & (errors > -10)).all(axis=1)
data = data[within_cut]
data.shape

(319316, 230)

In [8]:
# Work on a fixed-size random subset (for small datasets); the seed keeps the
# selection reproducible.
subset_size = 50000
subset_seed = 42
data = data.sample(n=subset_size, random_state=subset_seed)

In [9]:
# Confirm the row count after down-sampling.
data.shape

(50000, 230)

In [10]:
# Create a StandardScaler, which rescales a feature to zero mean and unit variance.
scaler = StandardScaler()

In [11]:
# Standardize the four parameter columns (fe_h, teff, alpha_m, logg) and every
# bp_*/rp_* value and error column to zero mean and unit variance, one column at
# a time.
#
# A separate StandardScaler is fitted for each column and kept in `scalers`, keyed
# by column name, so the mean/scale applied to any column can be recovered later
# (e.g. scalers['teff'].inverse_transform(...)). Refitting a single shared scaler
# would keep only the parameters of the last column fitted.
#
# NOTE(review): the scalers are fitted on the whole sample before the train/test
# split, so held-out rows contribute to the scaling statistics - confirm this is
# acceptable or fit on the training rows only.
# NOTE(review): each error column is standardized on its own (mean-subtracted and
# divided by its own spread), not by the scale of its value column - confirm this
# is the intended treatment of the uncertainties.
scalers = {}


def standardize(column):
    """Fit a new StandardScaler on data[column], store it in `scalers`, and
    return the scaled values as a flat 1-D array."""
    fitted = StandardScaler().fit(data[[column]])
    scalers[column] = fitted
    return fitted.transform(data[[column]]).flatten()


feh = standardize('fe_h')
teff = standardize('teff')
am = standardize('alpha_m')
logg = standardize('logg')
# One row per column name, i.e. shape (len(labels), n_rows); transposed downstream.
xp = np.array([standardize(label) for label in labels])
xpe = np.array([standardize(elabel) for elabel in elabels])

In [12]:
# After transposing, rows are samples and columns are the scaled bp_*/rp_* values.
xp.T.shape

(50000, 110)

In [13]:
# Split all six arrays in a single call so each one is partitioned with the same
# row selection; 10% of the rows are held out and the seed is fixed.
split_arrays = train_test_split(
    feh, teff, am, logg, xp.T, xpe.T,
    test_size=0.1,
    random_state=42,
)
# train_test_split returns a (train, test) pair per input, in input order.
(feh_train, feh_test,
 teff_train, teff_test,
 am_train, am_test,
 logg_train, logg_test,
 xp_train, xp_test,
 xpe_train, xpe_test) = split_arrays

In [16]:
# Re-create the scaler as a fresh, unfitted instance.
# NOTE(review): no cell shown after this point uses `scaler`; this may be a leftover.
scaler = StandardScaler()

In [17]:
# Write the train and test splits to an HDF5 file.
# Layout: group_1 holds the training split and group_2 the test split; each group
# gets three datasets:
#   data    - the four scaled parameters stacked row-wise in the order
#             fe_h, logg, teff, alpha_m
#   label   - the scaled bp_*/rp_* values, transposed back to (columns, rows)
#   e_label - the scaled error columns, transposed the same way
hierarchicalFileName  = "/arc/home/aydanmckay/mydataelabelsalphasmallcutsscaled.h5"

# Stack the parameter arrays before opening the file.
train_params = np.array([feh_train, logg_train, teff_train, am_train])
test_params = np.array([feh_test, logg_test, teff_test, am_test])

with h5py.File(hierarchicalFileName, 'w') as hierarchicalFile:
    # One group per split, directly under the root.
    train = hierarchicalFile.create_group("group_1")
    test = hierarchicalFile.create_group("group_2")

    for group, params, values, value_errors in (
        (train, train_params, xp_train, xpe_train),
        (test, test_params, xp_test, xpe_test),
    ):
        group['data'] = params
        group['label'] = values.T
        group['e_label'] = value_errors.T

    # Echo the resulting structure as a quick sanity check.
    for item in (hierarchicalFile["/"], train, test, train['data'], test["label"]):
        print(item)

<HDF5 group "/" (2 members)>
<HDF5 group "/group_1" (3 members)>
<HDF5 group "/group_2" (3 members)>
<HDF5 dataset "data": shape (4, 45000), type "<f4">
<HDF5 dataset "label": shape (110, 5000), type "<f8">


In [18]:
# Re-open the file read-only and load the training parameters back into memory
# to check what was written.
with h5py.File("/arc/home/aydanmckay/mydataelabelsalphasmallcutsscaled.h5", 'r') as f:
    d = f['group_1/data']
    # Copy the full dataset into a NumPy array while the file is still open.
    dset = d[...]
    # Second axis length = number of training rows.
    print(d.shape[1])

45000


In [19]:
# Shape of the array read back from group_1/data: (parameters, training rows).
dset.shape

(4, 45000)

In [21]:
# Display the values read back from the file (NumPy abbreviates large arrays).
dset

array([[-1.5669455e-01, -2.2242156e-01, -9.9385935e-01, ...,
         4.3444332e-01,  1.4095641e-03,  4.5312124e-01],
       [ 1.2275302e+00, -6.0900933e-01, -5.7075495e-01, ...,
        -4.0483538e-02,  1.2299918e+00,  1.1285211e+00],
       [-1.8155335e+00, -3.1966475e-01,  4.1193143e-01, ...,
         2.8975314e-01, -1.1108677e+00,  9.9210358e-01],
       [-9.9330038e-01,  1.0917609e+00,  1.8464029e+00, ...,
        -2.0650128e-01, -1.0088865e+00, -7.0529473e-01]], dtype=float32)