# Munge and assemble data

Load in all of the various data files for the APOGEE DR17 x Gaia DR3 cross-match, with the BP/RP coefficients.

**TODO:**
- Figure out how to munge / keep track of the file containing all of the XP spectra that we will eventually apply this to (so far we are only dealing with the "training" data)

In [None]:
import pathlib

import astropy.table as at
import astropy.units as u
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import h5py

import schlummernd as sch

In [None]:
# Parse the project configuration from '../config.yml' (a relative path, so it
# is resolved against the current working directory). The resulting object's
# `data_path` attribute is used further down as the output directory.
conf = sch.Config.parse_yaml('../config.yml')

In [None]:
# Directories holding the local copies of the APOGEE DR17 and Gaia DR3 files.
apogee_data_path, gaia_data_path = (
    pathlib.Path(directory)
    for directory in (
        '/mnt/home/apricewhelan/data/APOGEE_DR17/',
        '/mnt/home/apricewhelan/data/GaiaDR3/',
    )
)

In [None]:
# Read the three FITS tables that live in the APOGEE data directory, in order:
# the allStarLite catalog, the Gaia DR3 cross-match, and the "gaiasourcelite"
# companion table.
allstar, xm, xm2 = (
    at.Table.read(apogee_data_path / fits_name)
    for fits_name in (
        'allStarLite-dr17-synspec_rev1.fits',
        'allStar-dr17-synspec-gaiadr3.fits',
        'allStar-dr17-synspec-gaiadr3-gaiasourcelite.fits',
    )
)

# Use the plain 'source_id' name in the cross-match table; it is the key used
# for the joins below.
xm.rename_column('GAIADR3_SOURCE_ID', 'source_id')

In [None]:
# Place the allStar catalog and the cross-match table side by side, then keep
# a single row per APOGEE_ID.
# NOTE(review): hstack pairs rows by position, so this presumably relies on
# the two files being row-aligned -- confirm.
tbl = at.hstack((allstar, xm))
tbl = at.unique(tbl, keys='APOGEE_ID')

# Drop rows whose source_id is 0, attach the xm2 columns by matching on
# source_id, and keep a single row per source_id.
tbl = at.join(tbl[tbl['source_id'] != 0], xm2, keys='source_id')
tbl = at.unique(tbl, keys='source_id')
len(tbl)

In [None]:
# Show the HDF5 files available in the Gaia data directory.
[*gaia_data_path.glob("*.hdf5")]

In [None]:
# HDF5 file with the XP coefficient datasets read below. `gaia_data_path` is
# already a pathlib.Path, so the `/` operator yields a Path directly.
xp_cont_filename = gaia_data_path / 'apogee-dr17-xpcontinuous.hdf5'

In [None]:
# Read data and lightly rearrange: copy each HDF5 dataset into a table column,
# shortening the coefficient dataset names along the way.
xp_tbl = at.Table()
with h5py.File(xp_cont_filename, 'r') as f:
    for column_name, dataset_name in (
        ('source_id', 'source_id'),
        ('bp', 'bp_coefficients'),
        ('rp', 'rp_coefficients'),
        ('bp_err', 'bp_coefficient_errors'),
        ('rp_err', 'rp_coefficient_errors'),
    ):
        # `[:]` reads the full dataset into memory before the file is closed.
        xp_tbl[column_name] = f[dataset_name][:]

In [None]:
# Match the combined APOGEE/Gaia table to the XP coefficients on source_id,
# keeping only sources present in both tables.
xp_apogee_tbl = at.join(
    left=tbl,
    right=xp_tbl,
    keys='source_id',
    join_type='inner',
)
len(xp_apogee_tbl)

In [None]:
# Names of columns that should not be carried into the output table.
delete_cols = {
    'TELESCOPE_1',
    'FIELD_1',
    'ALT_ID',
    'GLON',
    'GLAT',
    'AK_TARG',
    'AK_TARG_METHOD',
    'TARGFLAGS',
    'SURVEY',
    'PROGRAMNAME',
    'SNREV',
    'STARFLAGS',
    'ANDFLAG',
    'ANDFLAGS',
    'RV_TEFF',
    'RV_LOGG',
    'RV_FEH',
    'RV_ALPHA',
    'RV_CARB',
    'RV_CHI2',
    'RV_CCFWHM',
    'RV_AUTOFWHM',
    'RV_FLAG',
    'N_COMPONENTS',
    'MEANFIB',
    'SIGFIB',
    'MIN_H',
    'MAX_H',
    'MIN_JK',
    'MAX_JK',
    'GAIAEDR3_SOURCE_ID',
    'GAIAEDR3_PARALLAX',
    'GAIAEDR3_PARALLAX_ERROR',
    'GAIAEDR3_PMRA',
    'GAIAEDR3_PMRA_ERROR',
    'GAIAEDR3_PMDEC',
    'GAIAEDR3_PMDEC_ERROR',
    'GAIAEDR3_PHOT_G_MEAN_MAG',
    'GAIAEDR3_PHOT_BP_MEAN_MAG',
    'GAIAEDR3_PHOT_RP_MEAN_MAG',
    'GAIAEDR3_DR2_RADIAL_VELOCITY',
    'GAIAEDR3_DR2_RADIAL_VELOCITY_ERROR',
    'GAIAEDR3_R_MED_GEO',
    'GAIAEDR3_R_LO_GEO',
    'GAIAEDR3_R_HI_GEO',
    'GAIAEDR3_R_MED_PHOTOGEO',
    'GAIAEDR3_R_LO_PHOTOGEO',
    'GAIAEDR3_R_HI_PHOTOGEO',
    'ASPCAP_GRID',
    'ASPCAP_CHI2',
    'PARAMFLAG',
    'ASPCAPFLAGS',
    'FRAC_BADPIX',
    'FRAC_LOWSNR',
    'FRAC_SIGSKY',
    'X_H',
    'X_H_ERR',
    'X_M',
    'X_M_ERR',
    'ELEM_CHI2',
    'ELEMFRAC',
    'EXTRATARG',
    'MEMBERFLAG',
    'MEMBER',
    'X_H_SPEC',
    'X_M_SPEC',
    'TEFF_SPEC',
    'LOGG_SPEC',
    'C_FE_SPEC',
    'CI_FE_SPEC',
    'N_FE_SPEC',
    'O_FE_SPEC',
    'NA_FE_SPEC',
    'MG_FE_SPEC',
    'AL_FE_SPEC',
    'SI_FE_SPEC',
    'P_FE_SPEC',
    'S_FE_SPEC',
    'K_FE_SPEC',
    'CA_FE_SPEC',
    'TI_FE_SPEC',
    'TIII_FE_SPEC',
    'V_FE_SPEC',
    'CR_FE_SPEC',
    'MN_FE_SPEC',
    'FE_H_SPEC',
    'CO_FE_SPEC',
    'NI_FE_SPEC',
    'CU_FE_SPEC',
    'CE_FE_SPEC',
    'ASPCAP_ID',
    'FIELD_2',
    'TELESCOPE_2',
    'GAIADR3_AZERO_GSPPHOT',
    'GAIADR3_EBPMINRP_GSPPHOT',
    'GAIADR3_TEFF_GSPPHOT',
    'GAIADR3_LOGG_GSPPHOT',
    'GAIADR3_MH_GSPPHOT',
    'GAIADR3_TEFF_GSPSPEC',
    'GAIADR3_LOGG_GSPSPEC',
    'GAIADR3_MH_GSPSPEC',
    'GAIADR3_ALPHAFE_GSPSPEC',
    'GAIADR3_FEM_GSPSPEC',
    'GAIADR3_SIFE_GSPSPEC',
    'GAIADR3_CAFE_GSPSPEC',
    'GAIADR3_MGFE_GSPSPEC',
    'GAIADR3_NDFE_GSPSPEC',
    'GAIADR3_FEIIM_GSPSPEC',
    'GAIADR3_SFE_GSPSPEC',
    'GAIADR3_ZRFE_GSPSPEC',
    'GAIADR3_NFE_GSPSPEC',
    'GAIADR3_CRFE_GSPSPEC',
    'GAIADR3_CEFE_GSPSPEC',
    'GAIADR3_NIFE_GSPSPEC',
    'GAIADR3_AZERO_ESPHS',
    'GAIADR3_TEFF_ESPHS',
    'GAIADR3_LOGG_ESPHS',
    'random_index',
    'parallax_over_error',
    'pseudocolour',
    'pseudocolour_error',
    'ipd_frac_multi_peak',
    'ipd_frac_odd_win',
    'phot_bp_rp_excess_factor',
    'bp_rp',
    'rv_nb_transits',
    'rv_expected_sig_to_noise',
    'rv_renormalised_gof',
    'rv_chisq_pvalue',
    'phot_variable_flag',
    'in_qso_candidates',
    'in_galaxy_candidates',
    'in_andromeda_survey',
}

# Narrow the candidates to the names actually present in the table (kept in
# table column order), so that names absent from the table are never passed
# to remove_columns().
delete_cols = [name for name in xp_apogee_tbl.colnames if name in delete_cols]
xp_apogee_tbl.remove_columns(delete_cols)

In [None]:
# Replace masked entries with NaN in every masked floating-point (or complex)
# column. NaN cannot be represented in other dtypes: filling an integer column
# with NaN raises, and string/boolean columns would be silently filled with a
# cast of NaN (e.g. the text 'nan'). Those columns are therefore left masked.
for col in xp_apogee_tbl.colnames:
    if not hasattr(xp_apogee_tbl[col], 'mask'):
        # Plain (unmasked) column: nothing to fill.
        continue
    if xp_apogee_tbl[col].dtype.kind in 'fc':
        xp_apogee_tbl[col] = xp_apogee_tbl[col].filled(np.nan)

In [None]:
# Select the rows where the 'APOGEE_ID' and 'apogee_id' columns agree.
# NOTE(review): presumably 'APOGEE_ID' comes from the allStar file and
# 'apogee_id' from the Gaia source table, making this a consistency check on
# the cross-match -- confirm.
mask = xp_apogee_tbl['APOGEE_ID'] == xp_apogee_tbl['apogee_id']
# Report how many rows pass the check.
print(mask.sum())
# Write only the consistent rows to the configured data directory, replacing
# any existing file of the same name.
xp_apogee_tbl[mask].write(conf.data_path / 'apogee-dr17-x-gaia-dr3-xp.fits', overwrite=True)

In [None]:
# 2D histogram of TEFF against LOGG, with log-scaled counts, for every row of
# the joined table.
fig, ax = plt.subplots(figsize=(6, 6))
H, xb, yb, _ = ax.hist2d(
    xp_apogee_tbl['TEFF'],
    xp_apogee_tbl['LOGG'],
    bins=(np.linspace(3000, 9000, 128), np.linspace(-0.5, 5.5, 128)),
    norm=mpl.colors.LogNorm(),
)

# Flip both axes: each limit pair runs from the largest bin edge to the
# smallest, so values decrease to the right and towards the top.
ax.set(
    xlim=(xb.max(), xb.min()),
    ylim=(yb.max(), yb.min()),
)

fig.tight_layout()