### Preparing the data in HDF5 format
Converting the .fits file to HDF5 for better storage management and, mainly, for practice.

Importing the appropriate packages.

In [1]:
from astropy.table import Table
from astropy.io import fits
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import h5py
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the FITS table with astropy and convert it straight to a pandas DataFrame.
data = Table.read('/arc/home/aydanmckay/gaiahike/bp_rp_lamost_pristine.fits').to_pandas()

In [3]:
# (rows, columns) of the table as loaded, before any rows are dropped.
data.shape

(721171, 241)

In [4]:
# Summarise index range, column count, dtypes and memory usage of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 721171 entries, 0 to 721170
Columns: 241 entries, CaHK to rpe_55
dtypes: float32(124), float64(114), int16(1), int64(2)
memory usage: 980.7 MB


In [5]:
# Drop every row that has a NaN or an infinite value in any column.
# The 'mode.use_inf_as_na' option is deprecated in recent pandas releases, so
# infinities are mapped to NaN explicitly before calling dropna(). The result
# is rebound rather than mutated in place; the original index labels are kept
# (no reset_index), exactly as dropna(inplace=True) left them.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
data.shape

(721158, 241)

In [6]:
# Print all column names as a plain Python list.
print(list(data.columns))

['CaHK', 'CaHKerr', 'Flag', 'EBV_1', 'source_id_1', 'RA_1', 'DEC_1', 'Gmag', 'BPmag', 'RPmag', 'FeHphot_gen', 'FeHphot_dwarf', 'FeHphot_giant', 'feh', 'ebv_2', 'phot_g_mean_mag', 'source_id_2', 'teff', 'logg', 'ra_2', 'dec_2', 'bp_1', 'bp_2', 'bp_3', 'bp_4', 'bp_5', 'bp_6', 'bp_7', 'bp_8', 'bp_9', 'bp_10', 'bp_11', 'bp_12', 'bp_13', 'bp_14', 'bp_15', 'bp_16', 'bp_17', 'bp_18', 'bp_19', 'bp_20', 'bp_21', 'bp_22', 'bp_23', 'bp_24', 'bp_25', 'bp_26', 'bp_27', 'bp_28', 'bp_29', 'bp_30', 'bp_31', 'bp_32', 'bp_33', 'bp_34', 'bp_35', 'bp_36', 'bp_37', 'bp_38', 'bp_39', 'bp_40', 'bp_41', 'bp_42', 'bp_43', 'bp_44', 'bp_45', 'bp_46', 'bp_47', 'bp_48', 'bp_49', 'bp_50', 'bp_51', 'bp_52', 'bp_53', 'bp_54', 'bp_55', 'rp_1', 'rp_2', 'rp_3', 'rp_4', 'rp_5', 'rp_6', 'rp_7', 'rp_8', 'rp_9', 'rp_10', 'rp_11', 'rp_12', 'rp_13', 'rp_14', 'rp_15', 'rp_16', 'rp_17', 'rp_18', 'rp_19', 'rp_20', 'rp_21', 'rp_22', 'rp_23', 'rp_24', 'rp_25', 'rp_26', 'rp_27', 'rp_28', 'rp_29', 'rp_30', 'rp_31', 'rp_32', 'rp_33',

In [7]:
# Column names bp_1..bp_55 followed by rp_1..rp_55 (110 entries), generated
# instead of typed out by hand so the two lists cannot drift apart.
labels = [f"{band}_{i}" for band in ("bp", "rp") for i in range(1, 56)]
# Same ordering with an 'e' suffix on the band: bpe_1..bpe_55, rpe_1..rpe_55
# (presumably the matching uncertainty columns -- confirm against the table).
elabels = [f"{band}e_{i}" for band in ("bp", "rp") for i in range(1, 56)]

In [8]:
# Stellar parameters (the original notes mark these three columns as LAMOST).
feh, teff, logg = data['feh'], data['teff'], data['logg']

# Magnitude columns; per the original notes the capitalised names come from
# Pristine, and a 'Gmag' column is also available.
mag_labels = ['CaHK', 'phot_g_mean_mag', 'BPmag', 'RPmag']
mags = np.array([data[name] for name in mag_labels])  # one row per magnitude

# G magnitude kept separately; it is later passed through the split as the
# "dist" slot.
scale = data['phot_g_mean_mag']

# Extinction from the Pristine column (ebv_2 is the pre-matched alternative).
dust = data['EBV_1']

# All-NaN placeholder with one entry per row of the table.
dist = np.full(len(data['CaHK']), np.nan)

# Stack the 110 coefficient columns and their 'e' counterparts, one row per
# column name, in the order given by labels / elabels.
xp = np.array([data[name] for name in labels])
xpe = np.array([data[name] for name in elabels])

In [9]:
# Shape of the coefficient matrix once transposed to (rows, columns-per-row).
xp.T.shape

(721158, 110)

In [19]:
# Mean of the phot_g_mean_mag column held in `scale`.
np.mean(scale)

14.721165

In [10]:
# First split: hold out 10% of the rows as the test set (fixed seed).
# NOTE: `scale` is passed in the last slot instead of `dist`, so the
# dist_train / dist_test variables actually carry the phot_g_mean_mag values.
(
    mag_train, mag_test,
    feh_train, feh_test,
    teff_train, teff_test,
    logg_train, logg_test,
    xp_train, xp_test,
    xpe_train, xpe_test,
    dust_train, dust_test,
    dist_train, dist_test,
) = train_test_split(
    mags.T,
    feh,
    teff,
    logg,
    xp.T,
    xpe.T,
    dust,
    scale,
    test_size=0.1,
    random_state=42,
)

In [11]:
# Second split: carve 10% of the remaining training rows off as validation.
# NOTE: this cell overwrites its own inputs (the *_train variables), so
# running it more than once keeps shrinking the training set.
(
    mag_train, mag_valid,
    feh_train, feh_valid,
    teff_train, teff_valid,
    logg_train, logg_valid,
    xp_train, xp_valid,
    xpe_train, xpe_valid,
    dust_train, dust_valid,
    dist_train, dist_valid,
) = train_test_split(
    mag_train,
    feh_train,
    teff_train,
    logg_train,
    xp_train,
    xpe_train,
    dust_train,
    dist_train,
    test_size=0.1,
    random_state=42,
)

In [12]:
# Size of the training coefficient matrix after both splits.
xp_train.shape

(584137, 110)

In [13]:
# Write the train / validation / test splits into one HDF5 file.
hierarchicalFileName = "/arc/home/aydanmckay/mae_tab/scale_lamost_pristine_bprp_gmag.h5"
with h5py.File(hierarchicalFileName, 'w') as hierarchicalFile:
    # One group per split: group_1 = train, group_2 = valid, group_3 = test.
    train, valid, test = (
        hierarchicalFile.create_group(group_name)
        for group_name in ("group_1", "group_2", "group_3")
    )

    # 'theta' stacks feh, logg and teff as rows, in that order.
    for group, params in (
        (train, [feh_train, logg_train, teff_train]),
        (valid, [feh_valid, logg_valid, teff_valid]),
        (test, [feh_test, logg_test, teff_test]),
    ):
        group['theta'] = np.array(params)

    # Extinction values.
    for group, ext_values in (
        (train, dust_train),
        (valid, dust_valid),
        (test, dust_test),
    ):
        group['ext'] = ext_values

    # 'dist' is filled from the dist_* variables produced by the splits.
    for group, dist_values in (
        (train, dist_train),
        (valid, dist_valid),
        (test, dist_test),
    ):
        group['dist'] = dist_values

    # Coefficients and their 'e' counterparts, transposed so that each
    # dataset has one row per coefficient column.
    for group, coeffs, coeff_errs in (
        (train, xp_train, xpe_train),
        (valid, xp_valid, xpe_valid),
        (test, xp_test, xpe_test),
    ):
        group['bprp'] = coeffs.T
        group['e_bprp'] = coeff_errs.T

    # Magnitudes, transposed to one row per magnitude column.
    for group, mag_values in (
        (train, mag_train),
        (valid, mag_valid),
        (test, mag_test),
    ):
        group['mags'] = mag_values.T

    # Quick sanity print of the file layout and a few dataset shapes/dtypes.
    for item in (
        hierarchicalFile["/"],
        train,
        valid,
        train['theta'],
        valid["bprp"],
        test['e_bprp'],
        train['mags'],
        valid['ext'],
    ):
        print(item)

<HDF5 group "/" (3 members)>
<HDF5 group "/group_1" (6 members)>
<HDF5 group "/group_2" (6 members)>
<HDF5 dataset "theta": shape (3, 584137), type "<f4">
<HDF5 dataset "bprp": shape (110, 64905), type "<f8">
<HDF5 dataset "e_bprp": shape (110, 72116), type "<f4">
<HDF5 dataset "mags": shape (4, 584137), type "<f4">
<HDF5 dataset "ext": shape (64905,), type "<f4">


In [15]:
# Re-open the file read-only and pull the validation arrays back into memory.
with h5py.File('/arc/home/aydanmckay/mae_tab/scale_lamost_pristine_bprp_gmag.h5', 'r') as f:
    valid_group = f['group_2']
    d, dn = valid_group['bprp'], valid_group['mags']
    # Slicing with [:] copies the data out so it stays usable after the file closes.
    dnset, dset = dn[:], d[:]

    train_group = f['group_1']
    print(train_group['bprp'].shape[1])
    print(train_group['theta'].shape)

584137
(3, 584137)


In [16]:
# Show the 'mags' array read back from group_2.
dnset

array([[22.069    , 20.851    , 25.115    , ..., 24.971    , 20.991    ,
        22.89     ],
       [13.831808 , 13.317917 , 16.310322 , ..., 15.463809 , 12.450099 ,
        14.9970875],
       [14.393    , 13.618    , 16.849    , ..., 16.189    , 12.979    ,
        15.382    ],
       [13.125    , 12.855    , 15.628    , ..., 14.634    , 11.765    ,
        14.441    ]], dtype=float32)

In [17]:
# Shape of the 'bprp' array read back from group_2.
dset.shape

(110, 64905)

In [18]:
# Show the 'bprp' array read back from group_2.
dset

array([[ 3.25406740e+03,  5.95745562e+03,  3.37177129e+02, ...,
         6.46883492e+02,  1.17521946e+04,  1.21837510e+03],
       [-7.78948321e+01, -1.13363294e+03, -1.02679945e+01, ...,
         4.59923201e+01, -5.72808214e+02, -1.59935907e+02],
       [-1.35362988e+02,  2.60289792e+01, -1.32290321e+01, ...,
        -3.08167639e+01, -3.80886008e+02, -1.38734809e+01],
       ...,
       [ 4.31455751e-01, -1.17054175e-02, -4.18924452e-02, ...,
         2.05807155e-01, -5.60777770e-01,  1.68940831e-01],
       [ 1.17530310e-01,  1.22111720e-01,  4.50366889e-02, ...,
        -5.94768075e-02,  2.20296816e-01,  9.63356292e-02],
       [ 8.13535474e-02,  2.05653578e-02, -1.03880124e-02, ...,
         3.56508402e-03, -6.18395461e-02, -3.95952504e-02]])

In [None]:
# `ydat` is not assigned in any cell shown in this notebook, so evaluating it
# on a fresh kernel raises NameError. The stale cell is disabled here.
# ydat

In [None]:
# Shape of the 'bprp' array after transposing it back.
dset.T.shape