In [1]:
import wget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from astropy.io import fits
from tqdm import tqdm

## 处理

### 将星表按照需要，存到df

#### 查看fits

In [2]:
# 读取数据
hdu = fits.open(r"./DDPayne_LAMOST-DR5_recommend.fits")
hdu[1].header

XTENSION= 'BINTABLE'           /Binary table written by MWRFITS v1.12           
BITPIX  =                    8 /Required value                                  
NAXIS   =                    2 /Required value                                  
NAXIS1  =                  460 /Number of bytes per row                         
NAXIS2  =              8162566 /Number of rows                                  
PCOUNT  =                    0 /Normally 0 (no varying arrays)                  
GCOUNT  =                    1 /Required value                                  
TFIELDS =                  100 /Number of columns in table                      
COMMENT                                                                         
COMMENT  *** End of mandatory fields ***                                        
COMMENT                                                                         
COMMENT                                                                         
COMMENT  *** Column names **

#### 提取为df

In [10]:
# 将需要的目录存到df

info_list = ['SPECID', 'RA', 'DEC','SNR_G','RV',]         #  CO_FE CU_FE SpecTE没有 S_fe K_fe ,'V_fe' ddpayne没有
lable_list = ['TEFF','LOGG','FEH','MG_FE','SI_FE', 'CA_FE','TI_FE','CR_FE','NI_FE','C_FE','N_FE','O_FE' ,'AL_FE','MN_FE','NA_FE']
flag_list = [col + '_FLAG' for col in lable_list]
all_columns = info_list+lable_list+flag_list

df_DDPayne = pd.DataFrame(data=None, columns=all_columns)
for lable_name in tqdm(all_columns):
    df_DDPayne[lable_name] = hdu[1].data[lable_name]
df_DDPayne.info()

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:04<00:00,  7.37it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8162566 entries, 0 to 8162565
Data columns (total 35 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   SPECID      object
 1   RA          >f8   
 2   DEC         >f8   
 3   SNR_G       >f4   
 4   RV          >f4   
 5   TEFF        >f4   
 6   LOGG        >f4   
 7   FEH         >f4   
 8   MG_FE       >f4   
 9   SI_FE       >f4   
 10  CA_FE       >f4   
 11  TI_FE       >f4   
 12  CR_FE       >f4   
 13  NI_FE       >f4   
 14  C_FE        >f4   
 15  N_FE        >f4   
 16  O_FE        >f4   
 17  AL_FE       >f4   
 18  MN_FE       >f4   
 19  NA_FE       >f4   
 20  TEFF_FLAG   >i2   
 21  LOGG_FLAG   >i2   
 22  FEH_FLAG    >i2   
 23  MG_FE_FLAG  >i2   
 24  SI_FE_FLAG  >i2   
 25  CA_FE_FLAG  >i2   
 26  TI_FE_FLAG  >i2   
 27  CR_FE_FLAG  >i2   
 28  NI_FE_FLAG  >i2   
 29  C_FE_FLAG   >i2   
 30  N_FE_FLAG   >i2   
 31  O_FE_FLAG   >i2   
 32  AL_FE_FLAG  >i2   
 33  MN_FE_FLAG  >i2   
 34  NA_FE_FLAG  >i2   




In [11]:
df_DDPayne.shape

(8162566, 35)

### 转换和筛选数据

In [12]:
# 将整个 DataFrame 或相关列转换为小端格式。这可以确保所有操作（如筛选、删除等）都在正确的数据类型上执行。
for col in df_DDPayne.columns:
    if df_DDPayne[col].dtype.byteorder == '>':
        df_DDPayne[col] = df_DDPayne[col].values.byteswap().newbyteorder()


#### 1. 按flag筛选，有效数据过少们无法进行匹配和统计

In [13]:
# 只保留FLAG列的值为0的行
mask = (df_DDPayne[flag_list] == 0).all(axis=1)
# 筛选数据
df_DDPayne_cleaned = df_DDPayne[mask][info_list+lable_list]
df_DDPayne_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 971723 entries, 1 to 8162565
Data columns (total 20 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   SPECID  971723 non-null  object 
 1   RA      971723 non-null  float64
 2   DEC     971723 non-null  float64
 3   SNR_G   971723 non-null  float32
 4   RV      971723 non-null  float32
 5   TEFF    971723 non-null  float32
 6   LOGG    971723 non-null  float32
 7   FEH     971723 non-null  float32
 8   MG_FE   971723 non-null  float32
 9   SI_FE   971723 non-null  float32
 10  CA_FE   971723 non-null  float32
 11  TI_FE   971723 non-null  float32
 12  CR_FE   971723 non-null  float32
 13  NI_FE   971723 non-null  float32
 14  C_FE    971723 non-null  float32
 15  N_FE    971723 non-null  float32
 16  O_FE    971723 non-null  float32
 17  AL_FE   971723 non-null  float32
 18  MN_FE   971723 non-null  float32
 19  NA_FE   971723 non-null  float32
dtypes: float32(17), float64(2), object(1)
memory usage: 

In [37]:
# 删除包含任何空值的行
df_DDPayne_cleaned = df_DDPayne_cleaned.dropna()
df_DDPayne_cleaned.shape

(971723, 20)

In [38]:
df_DDPayne = df_DDPayne_cleaned

#### 2.删除带有-999的数据

In [16]:
df_DDPayne.shape

(8162566, 35)

In [17]:
df_DDPayne_cleaned = df_DDPayne[~(df_DDPayne == -999.000000).any(axis=1)]
df_DDPayne_cleaned.shape

(3111501, 35)

In [18]:
df_DDPayne = df_DDPayne_cleaned

### 将X/Fe 转换为X/H

In [19]:
# X/Fe转换成X/H
# lable_list =['SPECID', 'RA', 'DEC','SNR_G','TEFF','LOGG','FEH','MG_FE','SI_FE', 'CA_FE','TI_FE','CR_FE','NI_FE','C_FE','N_FE','O_FE' ,'AL_FE','MN_FE','NA_FE',]    
output_label = ['SPECID', 'RA', 'DEC', 'SNR_G','DDPayne_Teff[K]','DDPayne_Logg','DDPayne_RV',
                'DDPayne_FeH', 'DDPayne_MgH', 'DDPayne_SiH', 'DDPayne_CaH',
                'DDPayne_TiH', 'DDPayne_CrH', 'DDPayne_NiH', 'DDPayne_CH', 
                'DDPayne_NH',  'DDPayne_OH',  'DDPayne_AlH', 'DDPayne_MnH','DDPayne_NaH',]


df_DDPayne = pd.DataFrame(data=df_DDPayne[['SPECID', 'RA', 'DEC','SNR_G','TEFF','LOGG','RV','FEH','MG_FE','SI_FE', 'CA_FE','TI_FE','CR_FE','NI_FE','C_FE','N_FE','O_FE' ,'AL_FE','MN_FE','NA_FE',]].values,
                                columns=output_label)
df_DDPayne = df_DDPayne.iloc[1:]
for label_name in ['DDPayne_MgH', 'DDPayne_SiH', 'DDPayne_CaH',
                'DDPayne_TiH', 'DDPayne_CrH', 'DDPayne_NiH', 'DDPayne_CH', 
                'DDPayne_NH',  'DDPayne_OH',  'DDPayne_AlH', 'DDPayne_MnH','DDPayne_NaH',]:
        df_DDPayne[label_name] = df_DDPayne[label_name] + df_DDPayne['DDPayne_FeH']
 
df_DDPayne

Unnamed: 0,SPECID,RA,DEC,SNR_G,DDPayne_Teff[K],DDPayne_Logg,DDPayne_RV,DDPayne_FeH,DDPayne_MgH,DDPayne_SiH,DDPayne_CaH,DDPayne_TiH,DDPayne_CrH,DDPayne_NiH,DDPayne_CH,DDPayne_NH,DDPayne_OH,DDPayne_AlH,DDPayne_MnH,DDPayne_NaH
1,20111024-F5902-01-079,332.234112,-1.55206,18.309999,4614.570801,3.904312,-31.610117,-0.634749,-0.485203,-1.041522,-0.320497,-0.619943,-0.290938,-0.371799,-0.714535,-0.453699,-0.279861,-0.848496,-0.353704,-1.789249
2,20111024-F5902-01-092,332.134183,-1.768805,21.82,4794.11084,3.722594,-12.630257,-0.450179,0.079589,-0.716091,-0.509178,-0.458626,-0.238567,-0.273221,-0.41661,-0.553504,-0.079993,-0.55586,-0.47405,-1.376466
3,20111024-F5902-01-144,332.547994,-1.28483,35.900002,4756.79248,3.73002,-37.00938,-0.183481,-0.038357,-0.287914,-0.275041,-0.136766,0.025426,0.109565,-0.261307,-0.34954,-0.031908,0.047533,0.294552,-1.337981
4,20111024-F5902-01-182,332.790307,-1.843152,16.93,4506.084961,3.877853,31.36129,-0.679289,-0.421703,-0.780004,-0.584128,-0.636204,-0.54267,-0.512014,-0.61007,-0.513065,-0.030036,-0.216817,-0.381376,-1.592888
5,20111024-F5902-01-186,332.687475,-2.018279,5.2,4717.894043,3.003328,27.140213,-1.060726,-0.308566,-1.583904,-0.944408,-0.994726,-1.720398,-1.114939,-1.260726,-0.925724,-1.184972,-0.159233,-1.182504,-1.919639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3111496,20170616-HD192538S020340V01-16-219,289.688792,-0.395854,29.299999,4603.300293,2.463047,12.078639,0.016177,0.037907,0.111673,-0.03191,-0.0655,0.099256,0.084871,-0.302374,0.270249,-0.197701,0.170651,0.072253,0.273744
3111497,20170616-HD192538S020340V01-16-221,289.675397,-0.478258,10.62,4744.790527,2.853371,-14.288109,0.029655,0.14722,0.249054,-0.057084,0.132266,0.301715,0.172405,-0.206776,0.271086,-0.011483,0.393826,-0.08261,-0.134137
3111498,20170616-HD192538S020340V01-16-223,289.840401,-0.507136,28.76,5290.387695,3.515673,-49.159969,-0.050831,-0.001926,0.110299,0.024579,-0.218169,-0.061233,0.278376,-0.124683,0.766616,0.486996,0.142529,0.336595,0.10199
3111499,20170616-HD192538S020340V01-16-226,290.212199,-0.091283,8.03,4876.997559,2.655948,-43.329006,-0.216453,-0.284493,-0.228591,-0.205763,-0.251272,-0.427205,-0.088267,-0.484968,-0.047284,-0.185685,0.015196,-0.241189,-0.180634


In [20]:
df_DDPayne.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3111500 entries, 1 to 3111500
Data columns (total 20 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   SPECID           object
 1   RA               object
 2   DEC              object
 3   SNR_G            object
 4   DDPayne_Teff[K]  object
 5   DDPayne_Logg     object
 6   DDPayne_RV       object
 7   DDPayne_FeH      object
 8   DDPayne_MgH      object
 9   DDPayne_SiH      object
 10  DDPayne_CaH      object
 11  DDPayne_TiH      object
 12  DDPayne_CrH      object
 13  DDPayne_NiH      object
 14  DDPayne_CH       object
 15  DDPayne_NH       object
 16  DDPayne_OH       object
 17  DDPayne_AlH      object
 18  DDPayne_MnH      object
 19  DDPayne_NaH      object
dtypes: object(20)
memory usage: 474.8+ MB


### 保存数据

In [21]:
df_DDPayne.to_csv('df_DDPayne_del_NaN.csv', index=False) 

In [None]:
# 'C_fe' 'O_fe' 'Na_fe''Mg_fe''Al_fe''Si_fe''K_fe''Ca_fe''Sc_fe''Sc2_fe''Ti_fe''Ti2_fe''V_fe''Cr_fe''Cr2_fe''Mn_fe''Co_fe' 'Ni_fe''Cu_fe''Zn_fe''Rb_fe''Sr_fe''Y_fe''Zr_fe''Mo_fe''Ru_fe''Ba_fe''La_fe''Ce_fe''Nd_fe''Sm_fe''Eu_fe'

# flag_sp!=0  snr_c3_iraf<30