In [2]:
from astropy.io import fits
import pandas as pd
import numpy as np
from tqdm import tqdm

# 对apogee标签进行处理
主要操作包括：

    1.从FITS文件中提取有用的数据
    2.数据清理：删除数据中的无效行，(通过删除 STARFLAG、ASPCAPFLAG 和 PARAMFLAG 列不为零的行)
    3.将X/Fe转换成X/H
    4.保存筛选处理后的数据

（关于apogee的ASPCAP管道详情看https://www.sdss4.org/dr17/irspec/aspcap/）

## 读取FITS文件

In [3]:
# hdu = fits.open('./raw_data_catalogue_label/allStar-dr17-synspec.fits')
# hdu = fits.open(r'D:\文\jupyter\My_git\jupyter\data_preprocessing\raw_data_catalogue_label/allStar-dr17-synspec.fits')   # 之前说明文件的目录
hdu = fits.open(r'G:\Star\1_Data_download_and_preprocessing\raw_data_catalogue_label/allStar-dr17-synspec_rev1.fits')    # 新下载

In [None]:
# 查看有哪些列
hdu[1].header

## 将必要信息和标签存为DataFrame

In [4]:
# 将必要信息和标签存为DataFrame
#C、C I、N、O、Na、Mg、Al、Si、S、K、Ca、Ti、Ti II、V、Cr、Mn、Fe、Co、Ni 和 Ce。
#论文中原本的：Teff logg +  C, N, O, Mg, Al, Si, S, K, Ca, Ti, Cr, Mn, Fe, Ni
# lable_list = ['APOGEE_ID', 'RA', 'DEC', 'TEFF', 'LOGG', 'FE_H',
#               'C_FE', 'N_FE', 'O_FE', 'MG_FE', 'AL_FE', 'SI_FE', 'S_FE', 'K_FE', 'CA_FE',
#               'TI_FE', 'CR_FE', 'MN_FE', 'NI_FE', 'CU_FE',
#               'STARFLAG', 'ASPCAPFLAG', 'PARAMFLAG']
lable_list = ['APOGEE_ID', 'RA', 'DEC', 'TEFF', 'LOGG', 'FE_H',
              'C_FE', 'CI_FE', 'N_FE', 'O_FE', 'NA_FE', 'MG_FE', 'AL_FE', 'SI_FE', 'S_FE', 'K_FE', 'CA_FE',
              'TI_FE', 'TIII_FE', 'V_FE', 'CR_FE', 'MN_FE', 'CO_FE','NI_FE', 'CE_FE',
              'STARFLAG', 'ASPCAPFLAG', 'PARAMFLAG', 'VHELIO_AVG']
df_APOGEE_ASPCAP = pd.DataFrame(data=None, columns=lable_list)
for lable_name in tqdm(lable_list):
    df_APOGEE_ASPCAP[lable_name] = hdu[1].data[lable_name]

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 15.27it/s]


In [5]:
# 将大端字节序的数据转换为小端字节序
for col in df_APOGEE_ASPCAP.columns:
    if df_APOGEE_ASPCAP[col].dtype.byteorder == '>':  # 检查是否为大端字节序
        df_APOGEE_ASPCAP[col] = df_APOGEE_ASPCAP[col].astype(df_APOGEE_ASPCAP[col].dtype.newbyteorder('='))
df_APOGEE_ASPCAP.info()        

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733901 entries, 0 to 733900
Data columns (total 29 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   APOGEE_ID   733901 non-null  object 
 1   RA          733900 non-null  float64
 2   DEC         733900 non-null  float64
 3   TEFF        689024 non-null  float32
 4   LOGG        689024 non-null  float32
 5   FE_H        647042 non-null  float32
 6   C_FE        643860 non-null  float32
 7   CI_FE       640781 non-null  float32
 8   N_FE        591028 non-null  float32
 9   O_FE        643504 non-null  float32
 10  NA_FE       567221 non-null  float32
 11  MG_FE       646291 non-null  float32
 12  AL_FE       572099 non-null  float32
 13  SI_FE       646151 non-null  float32
 14  S_FE        558938 non-null  float32
 15  K_FE        625849 non-null  float32
 16  CA_FE       628491 non-null  float32
 17  TI_FE       582657 non-null  float32
 18  TIII_FE     363726 non-null  float32
 19  V_

In [102]:
lable_list = ['APOGEE_ID', 'RA', 'DEC', 'TEFF', 'LOGG', 'FE_H',
              'C_FE', 'CI_FE', 'N_FE', 'O_FE', 'NA_FE', 'MG_FE', 'AL_FE', 'SI_FE', 'S_FE', 'K_FE', 'CA_FE',
              'TI_FE', 'V_FE', 'CR_FE', 'MN_FE','NI_FE', 'STARFLAG', 'ASPCAPFLAG', 'PARAMFLAG', 'VHELIO_AVG']

In [5]:
# # 保存中间数据
# df_APOGEE_ASPCAP.to_csv('./data/apogee_aspcap_data.csv', index=False)
# #%%
# df_APOGEE_ASPCAP = pd.read_csv('./data/apogee_aspcap_data.csv')
# #%%
# df_APOGEE_ASPCAP.shape

(733901, 23)

## 进行一些筛选，我这里没有筛选

In [60]:
# # 按Star Flag 筛选
# df_APOGEE_ASPCAP =df_APOGEE_ASPCAP.drop(
#     df_APOGEE_ASPCAP.index[np.where(df_APOGEE_ASPCAP['STARFLAG']!=0)[0]])
# df_APOGEE_ASPCAP.reset_index(drop=True, inplace=True)    # drop：是否删除原始索引列   inplace：是否在原始数据框上直接修改

# # 按ASPCAP Flag 筛选
# df_APOGEE_ASPCAP =df_APOGEE_ASPCAP.drop(
#     df_APOGEE_ASPCAP.index[np.where(df_APOGEE_ASPCAP['ASPCAPFLAG']!=0)[0]])
# df_APOGEE_ASPCAP.reset_index(drop=True, inplace=True)

# 按Param Flag 筛选
# df_APOGEE_ASPCAP =df_APOGEE_ASPCAP.drop(
#     df_APOGEE_ASPCAP.index[np.where(df_APOGEE_ASPCAP['PARAMFLAG']!=0)[0]])
# df_APOGEE_ASPCAP.reset_index(drop=True, inplace=True)

# df_APOGEE_ASPCAP.columns

In [61]:
df_APOGEE_ASPCAP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651065 entries, 0 to 651064
Data columns (total 28 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   APOGEE_ID   651065 non-null  object 
 1   RA          651065 non-null  float64
 2   DEC         651065 non-null  float64
 3   TEFF        616995 non-null  float32
 4   LOGG        616995 non-null  float32
 5   FE_H        582431 non-null  float32
 6   C_FE        580953 non-null  float32
 7   CI_FE       579492 non-null  float32
 8   N_FE        531626 non-null  float32
 9   O_FE        581683 non-null  float32
 10  NA_FE       514085 non-null  float32
 11  MG_FE       582286 non-null  float32
 12  AL_FE       514084 non-null  float32
 13  SI_FE       582289 non-null  float32
 14  S_FE        504659 non-null  float32
 15  K_FE        564314 non-null  float32
 16  CA_FE       568410 non-null  float32
 17  TI_FE       526255 non-null  float32
 18  TIII_FE     340172 non-null  float32
 19  V_

## X/Fe转换成X/H

In [6]:
# X/Fe转换成X/H

output_label = ['ASPCAP_starid', 'ASPCAP_RA' , 'ASPCAP_Dec', 'ASPCAP_RV' ,'ASPCAP_Teff[K]','ASPCAP_Logg'  , 'ASPCAP_FeH',
                'ASPCAP_CH'    , 'ASPCAP_CIH','ASPCAP_NH'  , 'ASPCAP_OH' , 'ASPCAP_NaH'   ,'ASPCAP_MgH'   , 'ASPCAP_AlH',
                'ASPCAP_SiH'   , 'ASPCAP_SH' , 'ASPCAP_KH' , 'ASPCAP_CaH', 'ASPCAP_TiH'   , 'ASPCAP_TiIIH', 'ASPCAP_VH',
                'ASPCAP_CrH'   , 'ASPCAP_MnH', 'ASPCAP_CoH', 'ASPCAP_NiH', 'ASPCAP_CeH']
df_APOGEE_ASPCAP = pd.DataFrame(data=df_APOGEE_ASPCAP[['APOGEE_ID', 'RA', 'DEC', 'VHELIO_AVG', 'TEFF','LOGG', 'FE_H',
                                                       'C_FE', 'CI_FE', 'N_FE', 'O_FE', 'NA_FE', 'MG_FE',  'AL_FE',
                                                       'SI_FE', 'S_FE', 'K_FE', 'CA_FE','TI_FE', 'TIII_FE','V_FE', 
                                                       'CR_FE', 'MN_FE', 'CO_FE','NI_FE','CE_FE',]].values,
                                columns=output_label)
df_APOGEE_ASPCAP = df_APOGEE_ASPCAP.iloc[1:]
for label_name in ['ASPCAP_CH'    , 'ASPCAP_CIH','ASPCAP_NH'  , 'ASPCAP_OH' , 'ASPCAP_NaH'   ,'ASPCAP_MgH'   , 'ASPCAP_AlH',
                'ASPCAP_SiH'   , 'ASPCAP_SH' , 'ASPCAP_KH' , 'ASPCAP_CaH', 'ASPCAP_TiH'   , 'ASPCAP_TiIIH', 'ASPCAP_VH',
                'ASPCAP_CrH'   , 'ASPCAP_MnH', 'ASPCAP_CoH', 'ASPCAP_NiH', 'ASPCAP_CeH']:
    df_APOGEE_ASPCAP[label_name] = df_APOGEE_ASPCAP[label_name] + df_APOGEE_ASPCAP['ASPCAP_FeH']
 
df_APOGEE_ASPCAP

Unnamed: 0,ASPCAP_starid,ASPCAP_RA,ASPCAP_Dec,ASPCAP_RV,ASPCAP_Teff[K],ASPCAP_Logg,ASPCAP_FeH,ASPCAP_CH,ASPCAP_CIH,ASPCAP_NH,...,ASPCAP_KH,ASPCAP_CaH,ASPCAP_TiH,ASPCAP_TiIIH,ASPCAP_VH,ASPCAP_CrH,ASPCAP_MnH,ASPCAP_CoH,ASPCAP_NiH,ASPCAP_CeH
1,2M00000002+7417074,0.000103,74.285408,-51.731453,3723.911133,0.904598,-0.16068,-0.151385,-0.167506,-0.00946,...,-0.074461,-0.259192,,,-0.268838,,,-0.08559,-0.152997,
2,2M00000019-1924498,0.000832,-19.413851,19.073862,5501.772949,4.304115,-0.27553,-0.213792,-0.2049,-0.38743,...,-0.159957,-0.178264,-0.162871,,-0.225029,-0.406512,-0.353557,,-0.2616,
3,2M00000032+5737103,0.001335,57.61953,-20.545164,6099.780762,3.67397,-0.25297,-0.14024,-0.12853,-0.13841,...,-0.110687,-0.382105,-0.082451,,-0.258439,-1.365442,-0.189337,,-0.36172,
4,2M00000032+5737103,0.001335,57.61953,-20.43465,6162.030273,3.715561,-0.21417,-0.181519,-0.143445,-0.70323,...,-0.214357,-0.385027,,,-0.273799,-0.955452,-0.367197,,-0.28361,
5,2M00000035-7323394,0.001467,-73.394287,165.671936,4555.404297,1.498851,-1.1714,-1.46211,,-0.69587,...,-2.094641,-1.505275,-1.488347,,-0.984638,-0.530866,-1.327898,-1.15043,-1.209607,-1.32656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733896,2M23595886-2009435,359.995258,-20.162107,-6.733492,,,,,,,...,,,,,,,,,,
733897,2M23595886+5726058,359.995265,57.434956,-60.971943,4859.908691,2.480687,-0.23656,-0.296874,-0.33649,-0.03468,...,-0.105581,-0.222782,-0.19855,0.037199,-0.040914,-0.197616,-0.236598,-0.14299,-0.214357,-0.29531
733898,2M23595921+5609479,359.996744,56.16333,-56.440689,4585.409668,2.571437,0.11482,0.103512,0.044327,0.32447,...,0.214909,0.097345,0.234423,-0.104761,-0.040488,0.139584,0.151257,0.15302,0.147813,0.011374
733899,2M23595949-7342592,359.997887,-73.716454,159.405762,4509.591309,1.334888,-1.0505,-1.66699,-1.69043,-1.05503,...,-2.005541,-1.112514,-1.152877,,-1.138738,-1.006586,-1.634798,-0.74081,-1.154007,-0.94279


In [104]:
# 保存数据
df_APOGEE_ASPCAP.to_csv(r'G:\Star\1_Data_download_and_preprocessing\raw_data_catalogue_label/allStar-dr17-all.csv', index=False)

## 由于TIII_FE、CO_FE、CE_FE的有效数据量过少，所以去掉

In [7]:

output_label = ['ASPCAP_starid', 'ASPCAP_RA' , 'ASPCAP_Dec', 'ASPCAP_RV' ,'ASPCAP_Teff[K]','ASPCAP_Logg'  , 'ASPCAP_FeH',
                'ASPCAP_CH'    , 'ASPCAP_CIH','ASPCAP_NH'  , 'ASPCAP_OH' , 'ASPCAP_NaH'   ,'ASPCAP_MgH'   , 'ASPCAP_AlH',
                'ASPCAP_SiH'   , 'ASPCAP_SH' , 'ASPCAP_KH' , 'ASPCAP_CaH', 'ASPCAP_TiH'   ,  'ASPCAP_VH',
                'ASPCAP_CrH'   , 'ASPCAP_MnH',  'ASPCAP_NiH', ]

df_APOGEE_ASPCAP = df_APOGEE_ASPCAP[output_label]
df_APOGEE_ASPCAP

Unnamed: 0,ASPCAP_starid,ASPCAP_RA,ASPCAP_Dec,ASPCAP_RV,ASPCAP_Teff[K],ASPCAP_Logg,ASPCAP_FeH,ASPCAP_CH,ASPCAP_NH,ASPCAP_OH,...,ASPCAP_AlH,ASPCAP_SiH,ASPCAP_SH,ASPCAP_KH,ASPCAP_CaH,ASPCAP_TiH,ASPCAP_VH,ASPCAP_CrH,ASPCAP_MnH,ASPCAP_NiH
1,2M00000002+7417074,0.000103,74.285408,-51.731453,3723.911133,0.904598,-0.16068,-0.151385,-0.00946,-0.077278,...,,-0.189645,,-0.074461,-0.259192,,-0.268838,,,-0.152997
2,2M00000019-1924498,0.000832,-19.413851,19.073862,5501.772949,4.304115,-0.27553,-0.213792,-0.38743,-0.040187,...,0.020774,-0.178937,-0.182198,-0.159957,-0.178264,-0.162871,-0.225029,-0.406512,-0.353557,-0.2616
3,2M00000032+5737103,0.001335,57.61953,-20.545164,6099.780762,3.67397,-0.25297,-0.14024,-0.13841,-0.207901,...,-0.193374,-0.114752,-0.34309,-0.110687,-0.382105,-0.082451,-0.258439,-1.365442,-0.189337,-0.36172
4,2M00000032+5737103,0.001335,57.61953,-20.43465,6162.030273,3.715561,-0.21417,-0.181519,-0.70323,-0.073597,...,-0.201084,-0.084094,-0.086037,-0.214357,-0.385027,,-0.273799,-0.955452,-0.367197,-0.28361
5,2M00000035-7323394,0.001467,-73.394287,165.671936,4555.404297,1.498851,-1.1714,-1.46211,-0.69587,-1.018867,...,-1.627019,-1.133748,-1.560408,-2.094641,-1.505275,-1.488347,-0.984638,-0.530866,-1.327898,-1.209607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733896,2M23595886-2009435,359.995258,-20.162107,-6.733492,,,,,,,...,,,,,,,,,,
733897,2M23595886+5726058,359.995265,57.434956,-60.971943,4859.908691,2.480687,-0.23656,-0.296874,-0.03468,-0.147647,...,-0.100801,-0.167242,-0.203194,-0.105581,-0.222782,-0.19855,-0.040914,-0.197616,-0.236598,-0.214357
733898,2M23595921+5609479,359.996744,56.16333,-56.440689,4585.409668,2.571437,0.11482,0.103512,0.32447,0.165394,...,0.198491,0.105432,0.220822,0.214909,0.097345,0.234423,-0.040488,0.139584,0.151257,0.147813
733899,2M23595949-7342592,359.997887,-73.716454,159.405762,4509.591309,1.334888,-1.0505,-1.66699,-1.05503,-1.114757,...,-1.579219,-1.227539,,-2.005541,-1.112514,-1.152877,-1.138738,-1.006586,-1.634798,-1.154007


In [8]:
df_APOGEE_ASPCAP.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 733900 entries, 1 to 733900
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ASPCAP_starid   733900 non-null  object
 1   ASPCAP_RA       733900 non-null  object
 2   ASPCAP_Dec      733900 non-null  object
 3   ASPCAP_RV       722904 non-null  object
 4   ASPCAP_Teff[K]  689023 non-null  object
 5   ASPCAP_Logg     689023 non-null  object
 6   ASPCAP_FeH      647041 non-null  object
 7   ASPCAP_CH       643859 non-null  object
 8   ASPCAP_NH       591027 non-null  object
 9   ASPCAP_OH       643503 non-null  object
 10  ASPCAP_NaH      567220 non-null  object
 11  ASPCAP_MgH      646290 non-null  object
 12  ASPCAP_AlH      572098 non-null  object
 13  ASPCAP_SiH      646150 non-null  object
 14  ASPCAP_SH       558937 non-null  object
 15  ASPCAP_KH       625848 non-null  object
 16  ASPCAP_CaH      628490 non-null  object
 17  ASPCAP_TiH      582656 non-nu

In [9]:
# 删除包含任何空值的行
df_APOGEE_ASPCAP_cleaned = df_APOGEE_ASPCAP.dropna()
df_APOGEE_ASPCAP_cleaned.shape

(491147, 22)

In [108]:
df_APOGEE_ASPCAP_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 490441 entries, 2 to 733900
Data columns (total 23 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ASPCAP_starid   490441 non-null  object
 1   ASPCAP_RA       490441 non-null  object
 2   ASPCAP_Dec      490441 non-null  object
 3   ASPCAP_RV       490441 non-null  object
 4   ASPCAP_Teff[K]  490441 non-null  object
 5   ASPCAP_Logg     490441 non-null  object
 6   ASPCAP_FeH      490441 non-null  object
 7   ASPCAP_CH       490441 non-null  object
 8   ASPCAP_CIH      490441 non-null  object
 9   ASPCAP_NH       490441 non-null  object
 10  ASPCAP_OH       490441 non-null  object
 11  ASPCAP_NaH      490441 non-null  object
 12  ASPCAP_MgH      490441 non-null  object
 13  ASPCAP_AlH      490441 non-null  object
 14  ASPCAP_SiH      490441 non-null  object
 15  ASPCAP_SH       490441 non-null  object
 16  ASPCAP_KH       490441 non-null  object
 17  ASPCAP_CaH      490441 non-null  o

In [109]:
# 保存数据
df_APOGEE_ASPCAP_cleaned.to_csv(r'G:\Star\1_Data_download_and_preprocessing\raw_data_catalogue_label/allStar-dr17-all-notnull.csv', index=False)