### Preparing the data in HDF5 format
Converting the .fits catalogue to an HDF5 file for better storage management and, mainly, for practice.

Importing the appropriate packages.

In [1]:
from astropy.table import Table
from astropy.io import fits
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import h5py
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the FITS catalogue with astropy and convert it straight to a pandas
# DataFrame (kept as one chained expression so no intermediate Table lingers).
data = Table.read(
    '/arc/home/aydanmckay/input_catalogue_simple_average_datacuts.fits'
).to_pandas()

In [3]:
# Row/column count of the catalogue as loaded, before any filtering.
data.shape

(6756683, 224)

In [4]:
# Summary statistics for the 'Teff' column of the unfiltered catalogue.
data['Teff'].describe()

count    3.187680e+06
mean     5.576873e+03
std      9.254797e+02
min      2.000061e+03
25%      4.925000e+03
50%      5.551740e+03
75%      6.003603e+03
max      2.968421e+04
Name: Teff, dtype: float64

In [5]:
# Drop every row that contains a missing or infinite value in any column.
# The `mode.use_inf_as_na` option used previously is deprecated since
# pandas 2.1 and removed in pandas 3.0, so infinities are converted to NaN
# explicitly before dropping. The result is rebound rather than mutated in
# place, which keeps the cell safe to re-run.
data = data.replace([np.inf, -np.inf], np.nan).dropna()
data.shape

(3185660, 224)

In [6]:
# Column names used to pull the coefficient columns out of the catalogue:
# "bp_1".."bp_55" followed by "rp_1".."rp_55" (110 names in total).
# Generated rather than typed out so the two lists cannot drift apart.
labels = [f"{band}_{idx}" for band in ("bp", "rp") for idx in range(1, 56)]
# Matching error-column names, in the same order: "bpe_1".."bpe_55",
# then "rpe_1".."rpe_55".
elabels = [f"{band}e_{idx}" for band in ("bp", "rp") for idx in range(1, 56)]

In [7]:
# Keep only rows in which every error column lies strictly inside (-10, 10).
# |x| < 10 is equivalent to (x < 10) & (x > -10), and requiring it for all
# columns at once matches filtering column by column, but it builds a single
# boolean mask instead of re-materialising the full DataFrame 110 times.
data = data[data[elabels].abs().lt(10).all(axis=1)]
data.shape

(2760468, 224)

In [8]:
# For the small dataset: keep a reproducible random subset of 100,000 rows.
data = data.sample(random_state=42, n=100_000)

In [9]:
# Confirm the subsample has the requested 100000 rows.
data.shape

(100000, 224)

In [10]:
# Per-column summary statistics of the current (subsampled) DataFrame.
data.describe()

Unnamed: 0,Source,Teff,Fe/H,log_g,bp_1,bp_2,bp_3,bp_4,bp_5,bp_6,...,rpe_46,rpe_47,rpe_48,rpe_49,rpe_50,rpe_51,rpe_52,rpe_53,rpe_54,rpe_55
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,1.357008e+18,5557.616107,-0.304066,3.95539,3094.38827,-313.901187,-46.917334,10.01099,-14.516134,-0.248156,...,0.550444,0.462359,0.463847,0.423852,0.423121,0.377364,0.327072,0.293893,0.121472,0.063002
std,1.37608e+18,883.00299,0.397614,0.787175,3755.283734,677.531515,130.630736,45.555696,28.830252,34.257128,...,0.373572,0.337473,0.317843,0.288871,0.284523,0.261381,0.23677,0.214222,0.081782,0.043547
min,10140420000000.0,2030.272949,-4.353709,0.125449,1.597008,-8093.790249,-1506.438588,-966.519375,-293.873256,-423.964689,...,0.080018,0.080117,0.061216,0.059864,0.067603,0.054412,0.050695,0.047391,0.021595,0.011369
25%,3.637488e+17,4938.55249,-0.478198,3.743457,505.050919,-415.902063,-61.686463,-2.015507,-20.894948,-8.247684,...,0.285135,0.240958,0.225372,0.209638,0.218754,0.185536,0.164691,0.149408,0.069914,0.037748
50%,8.487921e+17,5541.995117,-0.245896,4.186402,1561.657054,-93.170644,-14.944624,3.219701,-6.171808,-0.870559,...,0.431067,0.360477,0.370791,0.335921,0.334118,0.300449,0.25687,0.229912,0.095491,0.049597
75%,1.897676e+18,5974.780029,-0.058427,4.47295,4207.905548,-4.60074,-0.294461,18.544834,-0.972295,6.086163,...,0.701081,0.575179,0.61119,0.554913,0.541015,0.492351,0.415077,0.370437,0.14607,0.073704
max,6.914412e+18,29455.115234,4.088487,5.866668,31738.167769,4884.597533,2001.826776,563.420642,390.531768,843.07115,...,6.935789,5.66342,3.021415,2.873138,4.066447,2.850769,3.415339,3.086045,1.619962,1.565852


In [11]:
# The three parameter columns, kept as pandas Series.
teff = data['Teff']
logg = data['log_g']
feh = data['Fe/H']

# Stack the coefficient columns and their error columns into 2-D arrays.
# Each array has one row per column name, i.e. shape (len(labels), len(data)).
xp = np.array([data[name] for name in labels])
xpe = np.array([data[name] for name in elabels])

In [12]:
# Shape after transposing: rows are samples, columns follow the order of `labels`.
xp.T.shape

(100000, 110)

In [13]:
# First split: hold out 10% of the rows as the test set. All five inputs are
# split with the same shuffling, so rows stay aligned across them. The
# coefficient arrays are transposed so that their first axis indexes samples.
(
    feh_train, feh_test,
    teff_train, teff_test,
    logg_train, logg_test,
    xp_train, xp_test,
    xpe_train, xpe_test,
) = train_test_split(feh, teff, logg, xp.T, xpe.T, test_size=0.1, random_state=42)

In [14]:
# Second split: carve 10% of the remaining training rows off as the
# validation set; the *_train names are rebound to the reduced training set.
(
    feh_train, feh_valid,
    teff_train, teff_valid,
    logg_train, logg_valid,
    xp_train, xp_valid,
    xpe_train, xpe_valid,
) = train_test_split(
    feh_train, teff_train, logg_train, xp_train, xpe_train,
    test_size=0.1, random_state=42,
)

In [15]:
# Write the three splits to a single HDF5 file, one group per split.
hierarchicalFileName = "/arc/home/aydanmckay/input_catalogue_simple_average_datacuts_small.h5"
with h5py.File(hierarchicalFileName, 'w') as hierarchicalFile:
    # group_1 holds the training split, group_2 validation, group_3 test.
    train = hierarchicalFile.create_group("group_1")
    valid = hierarchicalFile.create_group("group_2")
    test = hierarchicalFile.create_group("group_3")

    # 'data': the three parameter series stacked row-wise in the order
    # Fe/H, log_g, Teff, giving shape (3, n_rows).
    for group, params in (
        (train, [feh_train, logg_train, teff_train]),
        (valid, [feh_valid, logg_valid, teff_valid]),
        (test, [feh_test, logg_test, teff_test]),
    ):
        group['data'] = np.array(params)

    # 'label' / 'e_label': coefficient and error arrays, transposed so the
    # first axis indexes the coefficient columns, giving shape (110, n_rows).
    for group, coeffs, coeff_errs in (
        (train, xp_train, xpe_train),
        (valid, xp_valid, xpe_valid),
        (test, xp_test, xpe_test),
    ):
        group['label'] = coeffs.T
        group['e_label'] = coeff_errs.T

    # Echo the resulting file layout as a quick sanity check.
    for item in (
        hierarchicalFile["/"],
        train,
        valid,
        train['data'],
        valid["label"],
        test['e_label'],
    ):
        print(item)

<HDF5 group "/" (3 members)>
<HDF5 group "/group_1" (3 members)>
<HDF5 group "/group_2" (3 members)>
<HDF5 dataset "data": shape (3, 81000), type "<f8">
<HDF5 dataset "label": shape (110, 9000), type "<f8">
<HDF5 dataset "e_label": shape (110, 10000), type "<f4">


In [20]:
# One MinMaxScaler per row of the 'label' dataset (110 rows).
scalerlist = [MinMaxScaler() for _ in range(110)]
with h5py.File("/arc/home/aydanmckay/input_catalogue_simple_average_datacuts_small.h5", 'r') as f:
    d = f['group_1']['label']
    # Read the whole dataset into memory once; every later access uses this
    # array rather than going back to the file for each row.
    dset = d[:]
    # MinMaxScaler expects shape (n_samples, n_features), so each row is
    # reshaped to a single-feature column, scaled, then flattened back to 1-D.
    ydat = np.array([
        scaler.fit_transform(dset[it].reshape(-1, 1)).flatten()
        for it, scaler in enumerate(scalerlist)
    ])
    print(ydat.shape[1])
    print(d.shape[1])
    print(f['group_1']['data'].shape)

81000
81000
(3, 81000)


In [21]:
# Shape of the in-memory copy of the group_1 'label' dataset.
dset.shape

(110, 81000)

In [18]:
# Unscaled values as read from group_1/label.
dset

array([[ 1.44275007e+03,  1.75053293e+02,  1.41029422e+03, ...,
         8.65066654e+03,  1.14079576e+03,  1.35344359e+04],
       [-2.07735711e+02,  2.42542750e+01, -2.53880404e+02, ...,
        -5.46103976e+02,  8.47314849e+01, -1.92039108e+03],
       [-9.05241543e+00, -8.57547133e+00,  3.39417630e+00, ...,
        -2.44984606e+02, -6.02366251e+01, -8.42325102e+01],
       ...,
       [-5.56626897e-02,  7.79964292e-02, -3.05931445e-01, ...,
         1.02885173e+00, -2.85930101e-01, -6.16398939e-01],
       [ 9.57125720e-02,  1.14148847e-02,  7.53857657e-02, ...,
         6.84546904e-02,  1.81205810e-01, -2.90834291e-01],
       [ 9.06119373e-03, -4.37103252e-02, -1.96612291e-02, ...,
         1.56366203e-01,  7.34393748e-02,  8.13877371e-02]])

In [19]:
# Values of group_1/label after each row was passed through its own MinMaxScaler.
ydat

array([[0.04540681, 0.00546233, 0.04438415, ..., 0.27252459, 0.03589239,
        0.42640968],
       [0.60762975, 0.62550485, 0.60407425, ..., 0.58155808, 0.63016469,
        0.47566765],
       [0.47995422, 0.48010709, 0.4839437 , ..., 0.40433134, 0.46354825,
        0.45585689],
       ...,
       [0.36635725, 0.38429685, 0.33276641, ..., 0.51191976, 0.33545098,
        0.29109576],
       [0.34996055, 0.34068258, 0.34772334, ..., 0.34696049, 0.35937011,
        0.30741644],
       [0.40600898, 0.395369  , 0.40021786, ..., 0.43570911, 0.41898912,
        0.4205917 ]])