### Preparing the data in HDF5 format
Converting the .fits catalogue to an HDF5 file for better storage management and, mainly, for practice.

Importing the appropriate packages.

In [1]:
from astropy.table import Table
from astropy.io import fits
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import h5py
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the FITS catalogue with astropy and convert it to a pandas DataFrame.
catalogue_path = '/arc/home/aydanmckay/input_catalogue_datacuts.fits'
data = Table.read(catalogue_path).to_pandas()

In [3]:
# Dimensions of the freshly loaded catalogue as (rows, columns).
data.shape

(3052372, 223)

In [4]:
# Summary statistics of the Teff column before any rows are removed.
data['Teff'].describe()

count    3.048424e+06
mean     5.577048e+03
std      8.973194e+02
min      2.000061e+03
25%      4.931297e+03
50%      5.565683e+03
75%      6.002468e+03
max      9.996015e+03
Name: Teff, dtype: float64

In [5]:
# Drop every row that has a missing or infinite value in any column.
# Infinities are converted to NaN explicitly: the `mode.use_inf_as_na` option
# used previously was deprecated in pandas 2.1 and removed in pandas 3.0.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
data.shape

(3046382, 223)

In [6]:
# Column names "bp_1" … "bp_55" followed by "rp_1" … "rp_55" (110 in total).
labels = [f"{band}_{index}" for band in ("bp", "rp") for index in range(1, 56)]
# Companion columns "bpe_1" … "bpe_55", "rpe_1" … "rpe_55", in the same order as
# `labels` (presumably the per-coefficient uncertainties — confirm against the
# catalogue schema).
elabels = [f"{band}e_{index}" for band in ("bp", "rp") for index in range(1, 56)]

In [7]:
# Keep only the rows in which every `elabels` column lies strictly inside (-10, 10).
# One boolean mask built over all of those columns selects exactly the rows the
# previous per-column loop kept, without re-copying the full frame once per column.
error_columns = data[elabels]
within_bounds = ((error_columns < 10) & (error_columns > -10)).all(axis=1)
data = data[within_bounds]
data.shape

(2712022, 223)

In [8]:
# Optional: uncomment the next line to work on a reproducible 100,000-row subsample.
# data = data.sample(n=100000,random_state=42)

In [9]:
# Dimensions of the catalogue that feeds the steps below.
data.shape

(2712022, 223)

In [10]:
# Per-column summary statistics of the remaining rows.
data.describe()

Unnamed: 0,Teff,Fe/H,log_g,bp_1,bp_2,bp_3,bp_4,bp_5,bp_6,bp_7,...,rpe_46,rpe_47,rpe_48,rpe_49,rpe_50,rpe_51,rpe_52,rpe_53,rpe_54,rpe_55
count,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,...,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0,2712022.0
mean,5560.763,-0.2718661,3.982041,2857.443,-291.5848,-41.72583,8.030056,-12.98594,0.4899001,-5.599894,...,0.5337316,0.4491993,0.4488879,0.4101714,0.4102095,0.3658979,0.3175931,0.2853877,0.118188,0.06142255
std,872.699,0.4096093,0.7868529,3494.174,643.9511,121.252,41.85671,26.18954,32.71361,17.76866,...,0.3641321,0.3316904,0.3083564,0.279758,0.2770696,0.2541609,0.2321594,0.2100467,0.08020779,0.0432402
min,2000.061,-4.487347,5.3949990000000004e-43,-0.9222329,-11587.89,-1718.666,-1205.797,-430.6089,-485.679,-1133.12,...,0.0648744,0.06264918,0.05036612,0.04899616,0.05481264,0.04443664,0.04371226,0.04235824,0.01816171,0.0112056
25%,4937.92,-0.4498981,3.776349,480.2662,-387.4779,-56.71171,-2.096837,-19.29551,-7.600808,-8.90983,...,0.2813886,0.2376655,0.221575,0.2063369,0.2159241,0.1823347,0.1623157,0.1471646,0.06919803,0.03737602
50%,5550.926,-0.2113507,4.199539,1460.717,-86.80237,-14.09114,2.909846,-5.746578,-0.774151,-1.691054,...,0.4203166,0.3512707,0.3605437,0.3267774,0.3257406,0.2922344,0.249953,0.2238688,0.09358199,0.04867428
75%,5975.766,-0.02535014,4.483057,3874.248,-4.10566,-0.1253087,16.64249,-0.8981975,5.998727,0.4102218,...,0.6698429,0.5515379,0.58513,0.5291217,0.5174929,0.4720198,0.3976967,0.354761,0.1396926,0.07059544
max,9994.979,4.591588,7.899983,36929.85,5510.93,2606.343,626.7605,627.12,900.9338,305.3074,...,6.935789,7.062778,3.908607,3.928383,4.231868,3.720495,4.36621,3.997701,1.664967,1.756402


In [11]:
# Pull the three per-source parameter columns out of the catalogue.
feh, teff, logg = data['Fe/H'], data['Teff'], data['log_g']
# Stack the `labels` / `elabels` columns into arrays with one row per column name.
xp = np.array([data[name] for name in labels])
xpe = np.array([data[name] for name in elabels])

In [12]:
# Shape after transposing: one row per source, one column per entry in `labels`.
xp.T.shape

(2712022, 110)

In [13]:
# Hold out 10% of the sources as the test set. All five arrays go through the
# same call with a fixed seed, so they are split consistently.
(
    feh_train, feh_test,
    teff_train, teff_test,
    logg_train, logg_test,
    xp_train, xp_test,
    xpe_train, xpe_test,
) = train_test_split(feh, teff, logg, xp.T, xpe.T, test_size=0.1, random_state=42)

In [14]:
# Split 10% of the remaining training rows off as the validation set; the
# `*_train` names are rebound to the reduced training portion.
(
    feh_train, feh_valid,
    teff_train, teff_valid,
    logg_train, logg_valid,
    xp_train, xp_valid,
    xpe_train, xpe_valid,
) = train_test_split(
    feh_train, teff_train, logg_train, xp_train, xpe_train,
    test_size=0.1, random_state=42,
)

In [16]:
# Write the three splits into a single HDF5 file, one group per split.
hierarchicalFileName = "/arc/home/aydanmckay/input_catalogue_datacuts.h5"
with h5py.File(hierarchicalFileName, 'w') as hierarchicalFile:
    # One group under the root for each of the train / validation / test splits.
    train = hierarchicalFile.create_group("group_1")
    valid = hierarchicalFile.create_group("group_2")
    test = hierarchicalFile.create_group("group_3")

    # 'data' stacks the three parameter arrays row-wise in the order Fe/H, log_g, Teff.
    split_parameters = (
        (train, (feh_train, logg_train, teff_train)),
        (valid, (feh_valid, logg_valid, teff_valid)),
        (test, (feh_test, logg_test, teff_test)),
    )
    for group, parameters in split_parameters:
        group['data'] = np.array(list(parameters))

    # 'label' and 'e_label' store the xp / xpe arrays transposed, i.e. one row
    # per column name and one column per source.
    split_arrays = (
        (train, xp_train, xpe_train),
        (valid, xp_valid, xpe_valid),
        (test, xp_test, xpe_test),
    )
    for group, xp_split, xpe_split in split_arrays:
        group['label'] = xp_split.T
        group['e_label'] = xpe_split.T

    # Echo a few handles to confirm the layout that was written.
    written_handles = (
        hierarchicalFile["/"],
        train,
        valid,
        train['data'],
        valid["label"],
        test['e_label'],
    )
    for handle in written_handles:
        print(handle)

<HDF5 group "/" (3 members)>
<HDF5 group "/group_1" (3 members)>
<HDF5 group "/group_2" (3 members)>
<HDF5 dataset "data": shape (3, 2196737), type "<f8">
<HDF5 dataset "label": shape (110, 244082), type "<f8">
<HDF5 dataset "e_label": shape (110, 271203), type "<f4">


In [17]:
# Min-max scale each row of the training 'label' dataset independently,
# using one MinMaxScaler per row.
with h5py.File("/arc/home/aydanmckay/input_catalogue_datacuts.h5", 'r') as f:
    d = f['group_1']['label']
    # Load the dataset into memory once; the scaling below works on this copy
    # instead of going back to the file for every row.
    dset = d[:]
    # One scaler per row of the dataset (sized from the data rather than hard-coded).
    scalerlist = [MinMaxScaler() for _ in range(dset.shape[0])]
    # Each scaler expects a 2-D column, so a row is reshaped to (n_sources, 1)
    # for fitting and flattened back afterwards.
    ydat = np.array([
        scaler.fit_transform(dset[it].reshape(-1, 1)).flatten()
        for it, scaler in enumerate(scalerlist)
    ])
    print(ydat.shape[1])
    print(f['group_1']['label'].shape[1])
    print(f['group_1']['data'].shape)

2196737
2196737
(3, 2196737)


In [18]:
# Shape of the in-memory copy of the training 'label' dataset.
dset.shape

(110, 2196737)

In [19]:
# Unscaled values as read from the file.
dset

array([[ 2.81883400e+03,  1.12563800e+03,  2.40741050e+02, ...,
         2.38541368e+02,  1.04766772e+03,  1.67672333e+02],
       [-1.97867809e+02,  1.46399185e+02, -5.17441102e+01, ...,
         4.60346598e-01, -2.03776757e+02,  2.81848275e+01],
       [-7.44846716e+01, -4.54510032e+01,  5.40537537e+00, ...,
        -1.11525344e+01,  5.93269192e+00, -5.81011150e+00],
       ...,
       [ 2.80717567e-01,  1.23606306e-01,  5.30820837e-02, ...,
         1.71450877e-01,  1.27200510e-01,  9.25572597e-02],
       [-7.48794964e-02,  8.63666709e-02,  3.28213002e-02, ...,
         7.11810626e-02,  2.49668675e-02, -8.42950902e-02],
       [ 9.99931633e-03, -3.47431194e-02,  8.97592493e-02, ...,
         4.02395195e-02, -7.08080643e-03, -7.07059473e-03]])

In [20]:
# The same values after the per-row min-max scaling.
ydat

array([[0.07635249, 0.03050465, 0.00654368, ..., 0.00648412, 0.0283934 ,
        0.00456515],
       [0.66612915, 0.68626311, 0.67467498, ..., 0.67772809, 0.66578357,
        0.67934951],
       [0.36219622, 0.36910371, 0.3812031 , ..., 0.37726376, 0.38132856,
        0.37853479],
       ...,
       [0.68256749, 0.67141907, 0.66641475, ..., 0.67481406, 0.67167411,
        0.66921586],
       [0.5768464 , 0.58802292, 0.5843115 , ..., 0.58697035, 0.58376709,
        0.57619377],
       [0.30878393, 0.30559242, 0.31447326, ..., 0.31094099, 0.30756559,
        0.30756632]])