### Preparing the data in HDF5 Formatting
Switching the .fits file to an HDF5 for better storage management and mainly practice

Importing the appropriate packages.

In [10]:
from astropy.table import Table
from astropy.io import fits
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import h5py
from sklearn.preprocessing import MinMaxScaler

In [11]:
data = Table.read('/arc/home/aydanmckay/mae_tab/bp_rp_apogee_lamost_pristine_concat.fits')
names = [name for name in data.colnames if len(data[name].shape) <= 1]
data = data[names].to_pandas()

In [12]:
data.shape

(1339266, 231)

In [13]:
data.describe()

Unnamed: 0,ra,dec,gmag,bpmag,rpmag,cahk,ext,feh,afe,logg,...,rpe_46,rpe_47,rpe_48,rpe_49,rpe_50,rpe_51,rpe_52,rpe_53,rpe_54,rpe_55
count,642528.0,642528.0,642528.0,1339266.0,1339266.0,1339266.0,535928.0,572305.0,572028.0,608715.0,...,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0,642528.0
mean,175.519527,7.960083,13.210089,0.0,0.0,0.0,0.220363,-0.234039,0.072113,3.094786,...,4.854606,3.789164,3.838116,3.559645,3.601596,3.097129,2.748194,2.496466,1.032495,0.589127
std,95.94366,41.185402,1.901239,0.0,0.0,0.0,1.075497,0.371967,0.102399,1.222278,...,96.184229,67.500085,71.959214,66.598513,66.852089,54.135091,48.57481,42.640041,16.972028,12.849234
min,0.000103,-87.224808,2.65168,0.0,0.0,0.0,0.002474,-2.4686,-0.714365,-0.482825,...,0.091793,0.084758,0.064332,0.065701,0.075551,0.055099,0.053387,0.048981,0.02319,0.011225
25%,89.839767,-23.176965,11.9594,0.0,0.0,0.0,0.029444,-0.40415,0.004917,2.308385,...,0.500238,0.453001,0.441088,0.393016,0.400751,0.367162,0.324858,0.293619,0.110627,0.056413
50%,178.490082,16.192937,13.1443,0.0,0.0,0.0,0.06241,-0.17464,0.042937,2.995981,...,0.972698,0.823608,0.848126,0.764647,0.757259,0.689114,0.592611,0.531611,0.203665,0.104177
75%,261.744846,41.740414,14.5815,0.0,0.0,0.0,0.208363,0.009171,0.109349,4.314407,...,2.151097,1.763614,1.771294,1.62709,1.622607,1.431115,1.250225,1.124677,0.462224,0.24094
max,359.999181,87.608246,20.164499,0.0,0.0,0.0,62.374294,0.95789,0.992785,5.370551,...,44037.816406,31293.429688,33479.109375,28020.875,29163.664062,23545.835938,22205.037109,19124.441406,7081.585449,5876.122559


In [5]:
with pd.option_context('mode.use_inf_as_na', True):
    data.dropna(inplace=True)
data.shape

(481813, 231)

In [6]:
print([dat for dat in data.columns])

['ra', 'dec', 'gmag', 'bpmag', 'rpmag', 'cahk', 'ext', 'feh', 'afe', 'logg', 'teff', 'bp_1', 'bp_2', 'bp_3', 'bp_4', 'bp_5', 'bp_6', 'bp_7', 'bp_8', 'bp_9', 'bp_10', 'bp_11', 'bp_12', 'bp_13', 'bp_14', 'bp_15', 'bp_16', 'bp_17', 'bp_18', 'bp_19', 'bp_20', 'bp_21', 'bp_22', 'bp_23', 'bp_24', 'bp_25', 'bp_26', 'bp_27', 'bp_28', 'bp_29', 'bp_30', 'bp_31', 'bp_32', 'bp_33', 'bp_34', 'bp_35', 'bp_36', 'bp_37', 'bp_38', 'bp_39', 'bp_40', 'bp_41', 'bp_42', 'bp_43', 'bp_44', 'bp_45', 'bp_46', 'bp_47', 'bp_48', 'bp_49', 'bp_50', 'bp_51', 'bp_52', 'bp_53', 'bp_54', 'bp_55', 'rp_1', 'rp_2', 'rp_3', 'rp_4', 'rp_5', 'rp_6', 'rp_7', 'rp_8', 'rp_9', 'rp_10', 'rp_11', 'rp_12', 'rp_13', 'rp_14', 'rp_15', 'rp_16', 'rp_17', 'rp_18', 'rp_19', 'rp_20', 'rp_21', 'rp_22', 'rp_23', 'rp_24', 'rp_25', 'rp_26', 'rp_27', 'rp_28', 'rp_29', 'rp_30', 'rp_31', 'rp_32', 'rp_33', 'rp_34', 'rp_35', 'rp_36', 'rp_37', 'rp_38', 'rp_39', 'rp_40', 'rp_41', 'rp_42', 'rp_43', 'rp_44', 'rp_45', 'rp_46', 'rp_47', 'rp_48', 'rp_49

In [7]:
labels = ["bp_1", "bp_2", "bp_3", "bp_4", "bp_5", "bp_6", "bp_7",
          "bp_8", "bp_9", "bp_10", "bp_11", "bp_12", "bp_13", "bp_14",
          "bp_15", "bp_16", "bp_17", "bp_18", "bp_19", "bp_20", "bp_21",
          "bp_22", "bp_23", "bp_24", "bp_25", "bp_26", "bp_27", "bp_28",
          "bp_29", "bp_30", "bp_31", "bp_32", "bp_33", "bp_34", "bp_35",
          "bp_36", "bp_37", "bp_38", "bp_39", "bp_40", "bp_41", "bp_42",
          "bp_43", "bp_44", "bp_45", "bp_46", "bp_47", "bp_48", "bp_49",
          "bp_50", "bp_51", "bp_52", "bp_53", "bp_54", "bp_55", "rp_1",
          "rp_2", "rp_3", "rp_4", "rp_5", "rp_6", "rp_7", "rp_8",
          "rp_9", "rp_10", "rp_11", "rp_12", "rp_13", "rp_14", "rp_15",
          "rp_16", "rp_17", "rp_18", "rp_19", "rp_20", "rp_21", "rp_22",
          "rp_23", "rp_24", "rp_25", "rp_26", "rp_27", "rp_28", "rp_29",
          "rp_30", "rp_31", "rp_32", "rp_33", "rp_34", "rp_35", "rp_36",
          "rp_37", "rp_38", "rp_39", "rp_40", "rp_41", "rp_42", "rp_43",
          "rp_44", "rp_45", "rp_46", "rp_47", "rp_48", "rp_49", "rp_50",
          "rp_51", "rp_52", "rp_53", "rp_54", "rp_55"]
elabels = ["bpe_1", "bpe_2", "bpe_3", "bpe_4", "bpe_5", "bpe_6", "bpe_7",
          "bpe_8", "bpe_9", "bpe_10", "bpe_11", "bpe_12", "bpe_13", "bpe_14",
          "bpe_15", "bpe_16", "bpe_17", "bpe_18", "bpe_19", "bpe_20", "bpe_21",
          "bpe_22", "bpe_23", "bpe_24", "bpe_25", "bpe_26", "bpe_27", "bpe_28",
          "bpe_29", "bpe_30", "bpe_31", "bpe_32", "bpe_33", "bpe_34", "bpe_35",
          "bpe_36", "bpe_37", "bpe_38", "bpe_39", "bpe_40", "bpe_41", "bpe_42",
          "bpe_43", "bpe_44", "bpe_45", "bpe_46", "bpe_47", "bpe_48", "bpe_49",
          "bpe_50", "bpe_51", "bpe_52", "bpe_53", "bpe_54", "bpe_55", "rpe_1",
          "rpe_2", "rpe_3", "rpe_4", "rpe_5", "rpe_6", "rpe_7", "rpe_8",
          "rpe_9", "rpe_10", "rpe_11", "rpe_12", "rpe_13", "rpe_14", "rpe_15",
          "rpe_16", "rpe_17", "rpe_18", "rpe_19", "rpe_20", "rpe_21", "rpe_22",
          "rpe_23", "rpe_24", "rpe_25", "rpe_26", "rpe_27", "rpe_28", "rpe_29",
          "rpe_30", "rpe_31", "rpe_32", "rpe_33", "rpe_34", "rpe_35", "rpe_36",
          "rpe_37", "rpe_38", "rpe_39", "rpe_40", "rpe_41", "rpe_42", "rpe_43",
          "rpe_44", "rpe_45", "rpe_46", "rpe_47", "rpe_48", "rpe_49", "rpe_50",
          "rpe_51", "rpe_52", "rpe_53", "rpe_54", "rpe_55"]

In [8]:
gmag       n/a    float64  n/a
5    bpmag      n/a    float64  n/a
6    rpmag      n/a    float64  n/a
7    cahk       n/a    float64  n/a
8    ext        n/a    float64  n/a
9    feh        n/a    float64  n/a
10   afe        n/a    float64  n/a
11   logg       n/a    float64  n/a
12   teff

SyntaxError: invalid syntax (3285507013.py, line 1)

In [None]:
# feh = data['feh'] # lamost
# teff = data['teff'] # lamost
# logg = data['logg'] # lamost
# mag_labels = ['CaHK', 'phot_g_mean_mag', 'BPmag', 'RPmag'] # capital letters are from pristine, Gmag also available
# scale = data['phot_g_mean_mag']
# mags = np.array([data[label] for label in mag_labels])
# dust = data['EBV_1'] # this is pristine, ebv_2 for pre_matched
# dist = np.array([np.nan for _ in range(len(data['CaHK']))])
feh = data['feh']
teff = data['teff']
logg = data['logg']
mag_labels = ['cahk', 'gmag', 'bpmag', 'rpmag']
scale = data['gmag']
mags = np.array([data[label] for label in mag_labels])
dust = data['ext']
dist = np.array([np.nan for _ in range(len(data['cahk']))])
xp = []
xpe = []
for elabel,label in zip(elabels,labels):
    xp.append(data[label])
    xpe.append(data[elabel])
xp = np.array(xp)
xpe = np.array(xpe)

In [None]:
xp.T.shape

In [None]:
np.mean(scale)

In [None]:
mag_train, mag_test, feh_train, feh_test, teff_train, \
teff_test, logg_train, logg_test, xp_train, xp_test, \
xpe_train, xpe_test, dust_train, dust_test, dist_train, \
dist_test = train_test_split(
    mags.T,
    feh,
    teff,
    logg,
    xp.T,
    xpe.T,
    dust,
    # dist,
    scale,
    test_size=0.1,
    random_state=42
)

In [None]:
mag_train, mag_valid, feh_train, feh_valid, teff_train, \
teff_valid, logg_train, logg_valid, xp_train, xp_valid, \
xpe_train, xpe_valid, dust_train, dust_valid, dist_train, \
dist_valid = train_test_split(
    mag_train,
    feh_train,
    teff_train,
    logg_train,
    xp_train,
    xpe_train,
    dust_train,
    dist_train,
    test_size=0.1,
    random_state=42
)

In [None]:
xp_train.shape

In [None]:
# Create a HDF5 file
hierarchicalFileName  = "/arc/home/aydanmckay/mae_tab/scale_lamost_apogee_pristine_bprp_gmag.h5"
with h5py.File(hierarchicalFileName, 'w') as hierarchicalFile:
    # Create a group under root
    
    train = hierarchicalFile.create_group("group_1")
    valid = hierarchicalFile.create_group("group_2")
    test = hierarchicalFile.create_group("group_3")
    
    train['theta'] = np.array(
        [feh_train,
         logg_train,
         teff_train
        ]
    )
    valid['theta'] = np.array(
        [feh_valid,
         logg_valid,
         teff_valid
        ]
    )
    test['theta'] = np.array(
        [feh_test,
         logg_test,
         teff_test
        ]
    )
    
    train['ext'] = dust_train
    valid['ext'] = dust_valid
    test['ext'] = dust_test
    
    train['dist'] = dist_train
    valid['dist'] = dist_valid
    test['dist'] = dist_test
    
    train['bprp'] = xp_train.T
    train['e_bprp'] = xpe_train.T
    valid['bprp'] = xp_valid.T
    valid['e_bprp'] = xpe_valid.T
    test['bprp'] = xp_test.T
    test['e_bprp'] = xpe_test.T
    
    train['mags'] = mag_train.T
    valid['mags'] = mag_valid.T
    test['mags'] = mag_test.T

    print(hierarchicalFile["/"])
    print(train)
    print(valid)
    print(train['theta'])
    print(valid["bprp"])
    print(test['e_bprp'])
    print(train['mags'])
    print(valid['ext'])

In [None]:
with h5py.File('/arc/home/aydanmckay/mae_tab/scale_lamost_apogee_pristine_bprp_gmag.h5', 'r') as f:
    d = f['group_2']['bprp']
    dn = f['group_2']['mags']
    dnset = dn[:]
    dset = d[:]
    # print(ydat.shape[1])
    print(f['group_1']['bprp'].shape[1])
    print(f['group_1']['theta'].shape)

In [None]:
dnset

In [None]:
dset.shape

In [None]:
dset

In [None]:
ydat

In [None]:
dset.T.shape