# Clean up the big .csv file for easier use with my other codes

### Run the compileOCdata.ipynb first

I will use RA and Dec coordinates from Simbad when possible.

The final file should have columns with IDs from all sources (as available), RA, Dec, distance, Age, Metallicity, Mass, Rgc, Rhm (all as available)

In [1]:
import pandas as pd
import numpy as np
import sys
import time
import re

from astropy.coordinates import SkyCoord
from astropy import units 

from astroquery.simbad import Simbad
Simbad.TIMEOUT = 300 # sets the timeout to 300 s

import warnings
warnings.filterwarnings('ignore', category=UserWarning, append=True)

import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 500)
#pd.set_option('display.max_rows', 100)

%matplotlib inline



In [2]:
# Switch for the Simbad cross-match: the cells guarded by `if (matchCoord)` are skipped when this is False.
matchCoord = True

## Get the coordinates

In [8]:
#it turns out that Simbad coordinates are not great.  So I actually don't need to do this...
#BUT I want a way to know which stars have a good Simbad match and which don't ... so I will do this anyway
def getSimbadRADec(match_ID, batch_size=50, pause=20):
    """Query Simbad for each identifier and collect its RA and Dec.

    Parameters
    ----------
    match_ID : sequence of str
        Identifiers to look up, one Simbad query per entry.
    batch_size : int, optional
        Number of queries between pauses.  A falsy value disables pausing.
    pause : float, optional
        Seconds to sleep after every ``batch_size`` queries, so that Simbad
        does not reject the connection for sending too many requests.

    Returns
    -------
    RA, Dec : numpy.ndarray (dtype object)
        One entry per identifier, in input order.  Entries stay NaN for
        identifiers that Simbad did not return a row for.
    """
    n_ids = len(match_ID)
    RA = np.full(n_ids, np.nan, dtype='object')
    Dec = np.full(n_ids, np.nan, dtype='object')

    # Simbad.query_objects(match_ID) would need only one request, but it does
    # not return blank rows for missing data, so matched and unmatched IDs
    # could not be told apart.  Hence one query per identifier.
    for i, ID in enumerate(match_ID):
        result_table = Simbad.query_object(ID)
        # A failed lookup may come back as None or as a table without rows.
        if result_table is not None and len(result_table) > 0:
            row = result_table[0]
            # NOTE(review): newer astroquery releases name these columns
            # 'ra'/'dec' instead of 'RA'/'DEC' -- accept either spelling.
            names = result_table.colnames
            ra_key = 'RA' if 'RA' in names else 'ra'
            dec_key = 'DEC' if 'DEC' in names else 'dec'
            RA[i] = row[ra_key]
            Dec[i] = row[dec_key]
        print(f'{i} {float(i)/n_ids:5.3f}, {ID} {RA[i]} {Dec[i]} -- ', end='')
        if (batch_size and i > 0 and i % batch_size == 0):
            time.sleep(pause)
            print('\n\n')

    return RA, Dec

In [9]:
def getCoord(row):
    """Build a sky coordinate for one cluster from the first catalogue that has one.

    The catalogues are tried in a fixed order of preference; the first one
    with both of its coordinate columns filled in wins.  Galactic (l, b)
    positions are converted to ICRS before being returned.

    Parameters
    ----------
    row : mapping (e.g. a row of the compiled catalogue)
        Must provide the coordinate columns referenced below and 'Simbad_ID'.

    Returns
    -------
    astropy.coordinates.SkyCoord or False
        The ICRS coordinate, or False (after printing a message) when no
        catalogue provides a position.
    """

    def has(*columns):
        # True only when every listed column holds a non-missing value.
        return all(pd.notna(row[name]) for name in columns)

    if has('_RAJ2000_cantat-gaudin', '_DEJ2000_cantat-gaudin'):
        return SkyCoord(ra=row['_RAJ2000_cantat-gaudin']+' hours', dec=row['_DEJ2000_cantat-gaudin']+' degree', frame='icrs')

    if has('ra_mwsc', 'dec_mwsc'):
        return SkyCoord(ra=row['ra_mwsc']+' hours', dec=row['dec_mwsc']+' degree', frame='icrs')

    if has('RA[hr]_kharchenko', 'Dec[deg]_kharchenko'):
        return SkyCoord(ra=row['RA[hr]_kharchenko']*units.hourangle, dec=row['Dec[deg]_kharchenko']*units.degree, frame='icrs')

    if has('GLON[deg]_piskunov', 'GLAT[deg]_piskunov'):
        return SkyCoord(l=row['GLON[deg]_piskunov']*units.degree, b=row['GLAT[deg]_piskunov']*units.degree, frame='galactic').icrs

    if has('ra_lynga', 'dec_lynga'):
        return SkyCoord(ra=row['ra_lynga']*units.degree, dec=row['dec_lynga']*units.degree, frame='icrs')

    if has('_RA[deg]_vandenbergh', '_Dec[deg]_vandenbergh'):
        # RA must come from the RA column (the Dec column was previously used for both).
        return SkyCoord(ra=row['_RA[deg]_vandenbergh']*units.degree, dec=row['_Dec[deg]_vandenbergh']*units.degree, frame='icrs')

    if has('RA_2000_webda', 'Dec_2000_webda'):
        return SkyCoord(ra=row['RA_2000_webda']+' hours', dec=row['Dec_2000_webda']+' degree', frame='icrs')

    if has('GLON[deg]_cantat-gaudin', 'GLAT[deg]_cantat-gaudin'):
        return SkyCoord(l=row['GLON[deg]_cantat-gaudin']*units.degree, b=row['GLAT[deg]_cantat-gaudin']*units.degree, frame='galactic').icrs

    if has('l_webda', 'b_webda'):
        return SkyCoord(l=row['l_webda']*units.degree, b=row['b_webda']*units.degree, frame='galactic').icrs

    # Callers test the result for truthiness, so keep returning False here.
    print('NO RA, Dec : ', row['Simbad_ID'])
    return False

In [10]:
# Load the compiled multi-catalogue table (presumably the output of compileOCdata.ipynb -- confirm) and display it.
OC_df = pd.read_csv('OCcompiled.csv')
OC_df

Unnamed: 0,Simbad_ID,Name_mwsc,broad_type_mwsc,cluster_status_mwsc,ra_mwsc,dec_mwsc,lii_mwsc,bii_mwsc,core_radius_mwsc,central_radius_mwsc,cluster_radius_mwsc,pm_ra_mwsc,pm_dec_mwsc,pm_tot_error_mwsc,rad_vel_mwsc,rad_vel_error_mwsc,num_rad_vel_stars_mwsc,num_core_stars_mwsc,num_central_stars_mwsc,num_cluster_stars_mwsc,distance_mwsc,e_bv_mwsc,distance_modulus_mwsc,e_jk_mwsc,e_jh_mwsc,delta_h_mwsc,log_age_mwsc,log_age_error_mwsc,num_log_age_stars_mwsc,king_core_radius_mwsc,king_core_radius_error_mwsc,king_tidal_radius_mwsc,king_tidal_radius_error_mwsc,king_norm_factor_mwsc,king_norm_factor_error_mwsc,reference_code_mwsc,cluster_type_mwsc,metallicity_mwsc,metallicity_error_mwsc,num_metallicity_stars_mwsc,comments_mwsc,class_mwsc,Name_webda,RA_2000_webda,Dec_2000_webda,l_webda,b_webda,Dist_webda,Mod_webda,EB-V_webda,Age_webda,ST_webda,Z_webda,Diam_webda,Fe/H_webda,MRV_webda,pm RA_webda,pm Dec_webda,Measures_webda,Stars_webda,COCD_piskunov,Name_piskunov,GLON[deg]_piskunov,GLAT[deg]_piskunov,DistMod_piskunov,E(B-V)_piskunov,Dist[pc]_piskunov,logt[yr]_piskunov,rt[pc]_piskunov,e_rt[pc]_piskunov,logM[MSun]_piskunov,e_logM[MSun]_piskunov,rtA[pc]_piskunov,e_rtA[pc]_piskunov,logMA[MSun]_piskunov,e_logMA[MSun]_piskunov,MWSC_kharchenko,Name_kharchenko,Type_kharchenko,n_Type_kharchenko,RA[hr]_kharchenko,Dec[deg]_kharchenko,GLON[deg]_kharchenko,GLAT[deg]_kharchenko,r0[deg]_kharchenko,r1[deg]_kharchenko,r2[deg]_kharchenko,pmRA[mas/yr]_kharchenko,pmDec[mas/yr]_kharchenko,e_pm[mas/yr]_kharchenko,RV[km/s]_kharchenko,e_RV[km/s]_kharchenko,n_RV[km/s]_kharchenko,N1sr0_kharchenko,N1sr1_kharchenko,N1sr2_kharchenko,d[pc]_kharchenko,E(B-V)_kharchenko,appDistMod[mag]_kharchenko,E(J-Ks)_kharchenko,E(J-H)_kharchenko,dH_kharchenko,logt[yr]_kharchenko,e_logt[yr]_kharchenko,Nt_kharchenko,rc[pc]_kharchenko,e_rc[pc]_kharchenko,rt[pc]_kharchenko,e_rt[pc]_kharchenko,k[pc-2]_kharchenko,e_k[pc-2]_kharchenko,Src_kharchenko,SType_kharchenko,[Fe/H][Sun]_kharchenko,e_[Fe/H][Sun]_kharchenko,n_[Fe/H]_kharchen
ko,Name_salaris,dV_salaris,err_dV_salaris,[Fe/H]_salaris,err_[Fe/H]_salaris,t[Gyr]_salaris,err_t_salaris,Rgc[kpc]_salaris,z[pc]_salaris,flag_salaris,tJP94[Gyr]_salaris,Seq_vandenbergh,Name_vandenbergh,l[deg]_vandenbergh,Diam[pc]_vandenbergh,R[pc]_vandenbergh,Z[pc]_vandenbergh,E(B-V)_vandenbergh,logT[yr]_vandenbergh,SimbadName_vandenbergh,_RA[deg]_vandenbergh,_Dec[deg]_vandenbergh,_RAJ2000_cantat-gaudin,_DEJ2000_cantat-gaudin,Name_cantat-gaudin,RAJ2000[deg]_cantat-gaudin,DEJ2000[deg]_cantat-gaudin,GLON[deg]_cantat-gaudin,GLAT[deg]_cantat-gaudin,r50[deg]_cantat-gaudin,Nstars_cantat-gaudin,pmRA[mas/yr]_cantat-gaudin,pmDE[mas/yr]_cantat-gaudin,plx[mas]_cantat-gaudin,dmode[pc]_cantat-gaudin,Rgc[pc]_cantat-gaudin,SimbadName_cantat-gaudin,Name_lynga,ra_lynga,dec_lynga,distance_lynga,log_age_lynga,angular_diameter_lynga,alt_name_lynga,lii_lynga,bii_lynga,iau_num_lynga,seq_code_lynga,seq_num_lynga,prec_ra_lynga,prec_dec_lynga,lund_record_num_lynga,ocl_num_lynga,ref_angular_diameter_lynga,ref_distance_lynga,ref_log_age_lynga,metallicity_lynga,ref_metallicity_lynga,e_bv_lynga,ref_e_bv_lynga,type_flag_lynga,ref_type_flag_lynga,tr_concent_class_lynga,tr_range_class_lynga,tr_richness_class_lynga,tr_nebulosity_lynga,sb_bs_mag_lynga,sb_spect_code_lynga,sb_total_mag_lynga,sk_total_mag_lynga,sk_bv_color_lynga,sk_num_stars_lynga,ja_star_num_lynga,ja_class_lynga,ja_max_class_lynga,ja_richness_lynga,ja_e_bv_lynga,ref_ja_e_bv_lynga,ja_bv_turnoff_lynga,ref_ja_bv_turnoff_lynga,ly_tr_concent_class_lynga,ly_tr_range_class_lynga,ly_tr_richness_class_lynga,ly_tr_nebulosity_lynga,ly_member_stars_lynga,ly_angular_diameter_lynga,ly_refs_flag_lynga,radvel_weight_lynga,radvel_lynga,radvel_weight_class_lynga,ref_radvel1_lynga,ref_radvel2_lynga,ref_radvel3_lynga,ref_radvel4_lynga,ref_radvel5_lynga,basel_spect_code_lynga,basel_color_type_lynga,neg_ra_tracer_lynga,neg_lii_tracer_lynga,neg_seq_num_tracer_lynga,pos_ra_tracer_lynga,pos_lii_tracer_lynga,pos_seq_num_tracer_lynga,jdl_distance_lynga,jdl_dista
nce_weight_lynga,jdl_turnoff_color_lynga,jdl_age_lynga,jdl_age_weight_lynga,jdl_reddening_lynga,jdl_reddening_flag_lynga,jdl_reddening_weight_lynga,ref_jdl1_lynga,ref_jdl2_lynga,ref_jdl3_lynga,ref_jdl4_lynga,ref_jdl5_lynga,ref_jdl6_lynga
0,[KPS2012]_MWSC_4688,MWSC_4688,,,23 51 54,-86 43.2,303.907,-30.295,0.020,0.100,0.185,3.20,-5.00,1.13,,,,2.0,22.0,57.0,1336.0,0.219,10.700,0.105,0.070,0.000,9.390,,,1.05,0.39,7.01,2.51,2.51,0.67,AIPk,,,,0.0,...,OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,[KPS2012]_MWSC_5684,MWSC_5684,,,12 53 43,-86 38.9,302.968,-23.776,0.020,0.080,0.155,-13.04,0.17,1.05,,,,3.0,19.0,52.0,1432.0,0.375,10.900,0.180,0.120,0.020,9.180,0.023,3,0.61,0.42,7.54,5.79,1.88,1.09,ARIs,,,,0.0,...,OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,[KPS2012]_MWSC_5692,MWSC_5692,,,17 47 20,-86 36.6,306.562,-26.146,0.025,0.095,0.135,-6.22,-9.02,1.38,,,,4.0,18.0,28.0,1555.0,0.437,11.100,0.210,0.140,0.020,8.930,,,0.76,0.56,5.18,3.82,1.51,0.77,ARIs,,,,0.0,...,OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,[KPS2012]_MWSC_4005,MWSC_4005,,,00 11 28,-85 28.8,303.852,-31.577,0.012,0.100,0.165,9.31,-1.47,0.95,,,,3.0,26.0,42.0,1159.0,0.250,10.400,0.120,0.080,-0.020,9.375,,,0.36,0.13,4.69,1.85,15.94,6.19,AIPk,,,,0.0,...,OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,[KPS2012]_MWSC_4176,MWSC_4176,,,14 27 18,-85 25.2,304.950,-22.929,0.025,0.150,0.280,-9.41,0.12,0.69,,,,4.0,53.0,155.0,1093.0,0.333,10.300,0.160,0.107,-0.030,9.315,,,0.97,0.24,6.84,1.58,6.46,1.31,AIPk,,,,0.0,...,OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,ESO_8-6,ESO_008-06,r,c,14 56 55,-83 26.7,306.593,-21.485,0.025,0.130,0.185,-5.26,-4.21,0.80,,,,5.0,46.0,78.0,1380.0,0.312,10.800,0.150,0.100,0.030,9.300,,,0.66,0.25,5.60,2.18,5.93,1.91,DIAS,,,,0.0,"Sparse; center is shifted to 14.9485h,-83.445d...",OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2260.0,ESO_008-06,rc,,14.9485,-83.445,306.591,-21.481,0.025,0.130,0.185,-5.26,-4.21,0.80,,,0.0,5.0,46.0,78.0,1380.0,0.312,10.800,0.15,0.100,0.030,9.300,,,0.66,0.25,5.60,2.18,5.93,1.91,DIAS,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,[KPS2012]_MWSC_4219,MWSC_4219,,,15 42 14,-83 11.7,307.905,-22.041,0.015,0.115,0.200,-9.26,-3.14,0.74,,,,2.0,36.0,95.0,1606.0,0.375,11.150,0.180,0.120,0.020,9.100,,,2.81,0.76,9.78,2.13,1.71,0.41,AIPk,,,,0.0,...,OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,[KPS2012]_MWSC_5575,MWSC_5575,,,01 59 42,-83 03.0,300.484,-33.751,0.015,0.090,0.150,6.33,-2.50,1.96,,,,1.0,10.0,20.0,2191.0,0.302,11.800,0.145,0.097,0.015,9.200,,,1.95,1.00,12.13,6.13,0.79,0.18,ARIs,,,,0.0,Poor RDP. ...,OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,[KPS2012]_MWSC_4682,MWSC_4682,,,23 43 23,-82 57.6,305.407,-33.838,0.020,0.115,0.190,5.87,0.28,0.94,,,,2.0,22.0,58.0,1065.0,0.354,10.250,0.170,0.113,0.000,9.280,0.061,4,0.43,0.24,8.27,5.39,5.33,3.02,AIPk,,,,0.0,...,OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,[KPS2012]_MWSC_5685,MWSC_5685,,,13 05 24,-82 02.6,303.443,-19.185,0.020,0.090,0.160,-3.12,-2.73,1.14,,,,3.0,25.0,64.0,1581.0,0.406,11.125,0.195,0.130,0.000,9.150,,,0.65,0.21,7.54,2.68,9.02,2.40,ARIs,,,,0.0,Poor RDP. ...,OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
# Format the adopted coordinate of every cluster as sexagesimal strings
# (RA in hours, Dec in degrees); entries stay NaN when no coordinate exists.
n_clusters = len(OC_df)
RA = np.full(n_clusters, np.nan, dtype='object')
Dec = np.full(n_clusters, np.nan, dtype='object')
for index, row in OC_df.iterrows():
    c = getCoord(row)
    if not c:
        print('WARNING: no coordinates!', row['Simbad_ID'])
        continue
    RA[index] = c.ra.to_string(unit=units.hour, sep=' ', precision=3, pad=True)
    Dec[index] = c.dec.to_string(unit=units.degree, sep=' ', precision=3, pad=True)

In [12]:
#this takes a while...
# One Simbad query is sent per entry of the Simbad_ID column (only when matchCoord is set).
if (matchCoord):
    RA_Simbad, Dec_Simbad = getSimbadRADec(OC_df['Simbad_ID'].values)
    

0 0.000, [KPS2012]_MWSC_4688 23 51 54 -86 43.2 -- 1 0.000, [KPS2012]_MWSC_5684 nan nan -- 2 0.001, [KPS2012]_MWSC_5692 nan nan -- 3 0.001, [KPS2012]_MWSC_4005 00 11 28 -85 28.8 -- 4 0.001, [KPS2012]_MWSC_4176 14 27 18 -85 25.2 -- 5 0.001, ESO_8-6 14 56 54 -83 26.7 -- 6 0.002, [KPS2012]_MWSC_4219 15 42 14 -83 11.7 -- 7 0.002, [KPS2012]_MWSC_5575 nan nan -- 8 0.002, [KPS2012]_MWSC_4682 23 43 23 -82 57.6 -- 9 0.003, [KPS2012]_MWSC_5685 nan nan -- 10 0.003, [KPS2012]_MWSC_5681 nan nan -- 11 0.003, [KPS2012]_MWSC_2014 12 20 06 -81 30.6 -- 12 0.004, [KPS2012]_MWSC_2064 12 45 43 -81 15.5 -- 13 0.004, [KPS2012]_MWSC_2049 12 34 55 -80 57.0 -- 14 0.004, [KPS2012]_MWSC_5688 nan nan -- 15 0.004, ESO_26-2 20 26 38 -80 00.0 -- 16 0.005, [KPS2012]_MWSC_4137 09 26 12 -79 44.4 -- 17 0.005, Cl_Melotte_227 20 16 48 -79 02.4 -- 18 0.005, [KPS2012]_MWSC_5679 nan nan -- 19 0.006, [KPS2012]_MWSC_5572 nan nan -- 20 0.006, [KPS2012]_MWSC_5749 nan nan -- 21 0.006, ESO_21-6 14 15 52 -78 30.9 -- 22 0.007, [KPS201

151 0.045, Cl_Hogg_15 12 43 36 -63 05.9 -- 152 0.045, [KPS2012]_MWSC_2134 13 33 50 -63 02.4 -- 153 0.046, C_1206-626 12 09 45.0 -62 59 17 -- 154 0.046, Cl_Dias_4 13 43 25 -63 00.0 -- 155 0.046, NGC_4609 12 42 18 -62 59.7 -- 156 0.047, IC_2944 11 38 20.00 -63 22 22.0 -- 157 0.047, NGC_5281 13 46 35 -62 55.0 -- 158 0.047, Ass_Cir_OB_1 14 45 -62.9 -- 159 0.047, [DBS2003]_129 12 15 24 -62 55.1 -- 160 0.048, NGC_5269 13 44 44.0 -62 54 54 -- 161 0.048, C_1437-626 14 41 29 -62 49.9 -- 162 0.048, [KPS2012]_MWSC_5696 nan nan -- 163 0.049, Cl_Trumpler_21 13 32.2 -62 48 -- 164 0.049, IC_2714 11 17 27 -62 44.0 -- 165 0.049, NGC_4184 12 13 34 -62 43.8 -- 166 0.050, [KPS2012]_MWSC_1959 11 36 58 -62 43.4 -- 167 0.050, C_1154-623 11 57 28 -62 43.0 -- 168 0.050, Cl_Danks_1 13 12 27 -62 42.1 -- 169 0.050, NGC_3036 09 49.2 -62 40 -- 170 0.051, Cl_Danks_2 13 12 55 -62 40.9 -- 171 0.051, [DBS2003]_85 13 18 31 -62 39.8 -- 172 0.051, [FSR2007]_1595 11 47 05.0 -62 38 13 -- 173 0.052, [KPR2005]_89 17 23.3 -62 

301 0.090, [KPR2005]_62 10 50 53 -60 06.0 -- 302 0.090, C_1045-598 10 47 15.2 -60 05 51 -- 303 0.090, NGC_5043 13 16.1 -60 04 -- 304 0.091, [KPS2012]_MWSC_2219 14 28 34 -60 01.2 -- 305 0.091, [KPS2012]_MWSC_5723 nan nan -- 306 0.091, Cl_Ruprecht_111 14 36 00 -59 58.8 -- 307 0.092, [KPS2012]_MWSC_4116 06 19 12 -59 58.2 -- 308 0.092, NGC_5299 nan nan -- 309 0.092, NGC_2132 05 55 19 -59 55.2 -- 310 0.092, Cl_Collinder_223 10 32 16 -60 01.2 -- 311 0.093, [DBS2003]_89 14 45 03 -59 49.5 -- 312 0.093, Cl_Feinstein_1 11 06 43 -59 49.2 -- 313 0.093, C_1225-595 12 28 44 -59 48.6 -- 314 0.094, Cl_Loden_821 13 24 21 -59 44.0 -- 315 0.094, Cl_Loden_1095 13 53 37 -59 44.7 -- 316 0.094, C_1402-594 14 05 42.00 -59 42 00.0 -- 317 0.095, C_1405-594 14 09 22 -59 43.5 -- 318 0.095, ESO_130-13 12 23 04 -59 39.9 -- 319 0.095, NAME_HD_135159_Group 15 16 44 -59 39.3 -- 320 0.095, NGC_5606 14 27 47 -59 38.4 -- 321 0.096, [KPS2012]_MWSC_2289 15 20 58 -59 37.2 -- 322 0.096, [DBS2003]_51 10 32 58 -59 37.4 -- 323 

451 0.135, Cl_Loden_189 10 50.4 -56 25 -- 452 0.135, [DBS2003]_141 15 28 32 -56 22.5 -- 453 0.135, [KPS2012]_MWSC_2109 13 19 47 -56 21.1 -- 454 0.135, C_0920-560 09 21 55 -56 19.0 -- 455 0.136, C_0949-560 09 51 36 -56 18.6 -- 456 0.136, Cl_VDBH_58 09 10 12.0 -56 15 43 -- 457 0.136, [KPS2012]_MWSC_1689 09 33 56 -56 14.9 -- 458 0.137, [KPS2012]_MWSC_2107 13 19 37 -56 10.8 -- 459 0.137, [KPS2012]_MWSC_1789 10 18 59 -56 10.2 -- 460 0.137, [KPS2012]_MWSC_2025 12 24 16 -56 09.9 -- 461 0.137, [KPS2012]_MWSC_2233 14 34 37 -56 06.9 -- 462 0.138, ESO_165-9 09 05.2 -55 58 -- 463 0.138, NAME_SCHUSTER_CL 10 04 36 -55 51.4 -- 464 0.138, Cl_Loden_1 10 05 14 -55 48.3 -- 465 0.139, [KPS2012]_MWSC_1716 09 48 36 -55 47.4 -- 466 0.139, NGC_3960 11 50 33 -55 40.4 -- 467 0.139, NGC_5823 15 05 31 -55 35.7 -- 468 0.140, [KPR2005]_66 11 13.6 -55 25 -- 469 0.140, C_1002-552 10 04 18.00 -55 26 00.0 -- 470 0.140, [SSW94]_c 10 41.5 -55 18 -- 471 0.140, Cl_Lynga_4 15 33 20 -55 14.4 -- 472 0.141, [KPS2012]_MWSC_1818

601 0.179, ESO_211-9 09 16 44 -50 17.0 -- 602 0.180, [KPS2012]_MWSC_1730 09 56 24 -50 16.2 -- 603 0.180, ESO_211-3 08 51 36 -50 14.7 -- 604 0.180, Cl_Lynga_8 16 20 04 -50 13.0 -- 605 0.180, C_0930-499 09 31 52 -50 12.9 -- 606 0.181, [KPS2012]_MWSC_2321 15 48 40 -50 12.9 -- 607 0.181, [DBS2003]_164 16 19 23 -50 09.5 -- 608 0.181, [KPS2012]_MWSC_2420 16 30 25 -50 07.5 -- 609 0.182, [KPS2012]_MWSC_1511 08 37 01 -50 02.4 -- 610 0.182, NAME_HD_80077_Group 09 15 53.0 -50 01 00 -- 611 0.182, [DBS2003]_166 16 21 27 -50 00.7 -- 612 0.183, IC_4651 17 24 49 -49 56.0 -- 613 0.183, [KPS2012]_MWSC_4545 20 07 50 -49 51.6 -- 614 0.183, [DBS2003]_102 16 15 01 -49 50.7 -- 615 0.183, NGC_6167 16 34 34 -49 46.3 -- 616 0.184, [KPS2012]_MWSC_2195 14 00 18 -49 45.9 -- 617 0.184, [KPS2012]_MWSC_2419 16 29 49 -49 45.1 -- 618 0.184, [KPS2012]_MWSC_1640 09 14 40 -49 44.7 -- 619 0.185, [DBS2003]_170 16 28 58.0 -49 36 27 -- 620 0.185, Cl_VDBH_85 10 01 52 -49 35.1 -- 621 0.185, [DBS2003]_167 16 23 24 -49 32.5 -- 62

751 0.224, Cl_Lynga_13 16 48 56 -43 25.7 -- 752 0.224, [DBS2003]_175 16 52 36 -43 23.5 -- 753 0.225, C_0840-432 08 41 39 -43 22.0 -- 754 0.225, NGC_6192 16 40 23 -43 22.0 -- 755 0.225, C_0855-430 08 57 08 -43 15.0 -- 756 0.225, NGC_3680 11 25 38 -43 14.6 -- 757 0.226, [KPS2012]_MWSC_2145 13 38 00 -43 10.8 -- 758 0.226, [KPS2012]_MWSC_1837 10 39 58 -43 07.4 -- 759 0.226, NGC_6322 17 18 25 -42 56.0 -- 760 0.227, [KPS2012]_MWSC_5604 nan nan -- 761 0.227, [KPS2012]_MWSC_2185 13 57 30 -42 52.2 -- 762 0.227, ESO_311-14 07 49.4 -42 42 -- 763 0.228, ESO_282-26 19 13 52 -42 39.0 -- 764 0.228, [KPS2012]_MWSC_1608 08 58 06 -42 36.0 -- 765 0.228, [DBS2003]_176 16 59 23 -42 34.4 -- 766 0.228, C_0846-423 08 47 54 -42 27.0 -- 767 0.229, [FSR2007]_1744 16 51 36.0 -42 24 55 -- 768 0.229, ESO_309-3 06 50.7 -42 23 -- 769 0.229, [KPS2012]_MWSC_1512 08 37 03 -42 21.9 -- 770 0.230, [KPS2012]_MWSC_1518 08 38 17 -42 21.0 -- 771 0.230, [KPS2012]_MWSC_5744 nan nan -- 772 0.230, [KPS2012]_MWSC_1726 09 54 34 -42 

KeyboardInterrupt: 

In [None]:
if (matchCoord):
    # Keep a Simbad ID only where the Simbad query actually returned a
    # coordinate; every other entry becomes NaN.
    unmatched = np.array([pd.isna(value) for value in RA_Simbad], dtype=bool)
    simbad_ID = OC_df['Simbad_ID'].values.copy()
    simbad_ID[unmatched] = np.nan
    print(simbad_ID)
    # Save an intermediate file so the (slow) Simbad matching need not be repeated.
    cleanOC_df = pd.DataFrame({
        'ID': OC_df['Simbad_ID'],  # note: this contains some IDs that are not in Simbad
        'RA': RA,
        'Dec': Dec,
        'ID_Simbad': simbad_ID,
    })
    cleanOC_df.to_csv('OCcompiled_clean_v1.csv', index=False)

In [None]:
# foo = pd.DataFrame()
# foo['ID'] = cleanOC_df['ID']
# foo['RA'] = cleanOC_df['RA']
# foo['Dec'] = cleanOC_df['Dec']
# foo['ID_Simbad'] = cleanOC_df['ID_Simbad']
# foo.to_csv('OCcompiled_clean_v1.csv', index=False)

*Below, we can work with this cleanOC_df so that I don't have to go through the Simbad matching again!*

In [None]:
# Reload the intermediate table (ID, RA, Dec, ID_Simbad) from disk and display it.
cleanOC_df = pd.read_csv('OCcompiled_clean_v1.csv')
cleanOC_df

### Add all the IDs

In [None]:
# Copy the cluster name used by each source catalogue into the cleaned table
# (entries are missing wherever that catalogue does not list the cluster).
id_columns = {
    'ID_MWSC': 'Name_mwsc',
    'ID_WEBDA': 'Name_webda',
    'ID_Piskunov': 'Name_piskunov',
    'ID_Kharchenko': 'Name_kharchenko',
    'ID_Salaris': 'Name_salaris',
    'ID_vandenBergh': 'Name_vandenbergh',
    'ID_Cantat-Gaudin': 'Name_cantat-gaudin',
    'ID_Lynga': 'Name_lynga',
}
for clean_name, source_name in id_columns.items():
    cleanOC_df[clean_name] = OC_df[source_name]

### Distance

In [None]:
def getDistance(row):
    """Average the distances available for one cluster.

    Every non-missing value among the distance columns of the source
    catalogues is collected, unconverted, into one sample.

    Returns
    -------
    tuple
        (mean, standard deviation / sqrt(N), N), or three NaNs (after
        printing a message) when no catalogue gives a distance.
    """
    columns = (
        'distance_mwsc',
        'dmode[pc]_cantat-gaudin',
        'Dist_webda',
        'distance_lynga',
        'd[pc]_kharchenko',
        'Dist[pc]_piskunov',
        'R[pc]_vandenbergh',
    )
    found = [row[name] for name in columns if pd.notna(row[name])]

    if not found:
        print('NO DISTANCE', row['Simbad_ID'])
        return (np.nan, np.nan, np.nan)

    values = np.array(found)
    count = len(values)
    return (np.mean(values), np.std(values)/count**0.5, count)

In [None]:
# Evaluate getDistance once per cluster and store mean, error and sample size.
dist_stats = [getDistance(cluster) for _, cluster in OC_df.iterrows()]
dist = [stats[0] for stats in dist_stats]
err_dist = [stats[1] for stats in dist_stats]
n_dist = [stats[2] for stats in dist_stats]
cleanOC_df['dist[pc]'] = dist
cleanOC_df['err_dist[pc]'] = err_dist
cleanOC_df['N_dist[pc]'] = n_dist

### Age

In [None]:
def getLogAge(row):
    """Average the ages available for one cluster, in Myr.

    Despite the name, the result is a linear age: log10(age/yr) columns are
    converted with 10**x / 1e6 and the Salaris age (Gyr) is multiplied by 1e3.

    Returns
    -------
    tuple
        (mean, standard deviation / sqrt(N), N), or three NaNs (after
        printing a message) when no catalogue gives an age.
    """
    ages = []
    if (pd.notna(row['log_age_mwsc'])):
        ages.append(10.**row['log_age_mwsc']/10.**6)

    if (pd.notna(row['Age_webda'])):
        # NOTE(review): appended unconverted, i.e. treated as Myr -- confirm
        # that this column is not a log10 age.
        ages.append(row['Age_webda'])

    if (pd.notna(row['log_age_lynga'])):
        ages.append(10.**row['log_age_lynga']/10.**6)

    if (pd.notna(row['logt[yr]_kharchenko'])):
        ages.append(10.**row['logt[yr]_kharchenko']/10.**6)

    if (pd.notna(row['logt[yr]_piskunov'])):
        ages.append(10.**row['logt[yr]_piskunov']/10.**6)

    raw_logT = row['logT[yr]_vandenbergh']
    if (pd.notna(raw_logT)):
        # This column may hold text.  str.isnumeric() rejects decimals such as
        # '8.5' and does not exist on floats, so parse with float() instead
        # and skip entries that are not numbers.
        try:
            logT = float(raw_logT)
        except (TypeError, ValueError):
            logT = None
        if logT is not None:
            ages.append(10.**logT/10.**6)

    if (pd.notna(row['t[Gyr]_salaris'])):
        ages.append(row['t[Gyr]_salaris']*10.**3)

    if (len(ages) > 0):
        ages = np.array(ages)
        return (np.mean(ages), np.std(ages)/(len(ages))**0.5, len(ages))
    else:
        print('NO AGE', row['Simbad_ID'])
        return (np.nan, np.nan, np.nan)

In [None]:
# Evaluate getLogAge once per cluster and store mean, error and sample size.
age = []
err_age = []
n_age = []
for index, row in OC_df.iterrows():
    m,em,nm  = getLogAge(row)
    age.append(m)
    err_age.append(em)
    n_age.append(nm)
# Column name fixed: it used to be 'age[Myr])' with a stray closing parenthesis.
cleanOC_df['age[Myr]'] = age
cleanOC_df['err_age[Myr]'] = err_age
cleanOC_df['N_age[Myr]'] = n_age

### Metallicity

In [None]:
def getFeH(row):
    """Average the metallicities available for one cluster.

    Every non-missing value among the metallicity columns of the source
    catalogues is collected into one sample.

    Returns
    -------
    tuple
        (mean, standard deviation / sqrt(N), N), or three NaNs (after
        printing a message) when no catalogue gives a metallicity.
    """
    FeHs = []
    raw_mwsc = row['metallicity_mwsc']
    if (pd.notna(raw_mwsc)):
        # This column may hold text.  str.isnumeric() rejects values such as
        # '-0.12' and does not exist on floats, so parse with float() instead
        # and skip entries that are not numbers.  Storing the parsed float
        # (not the string) keeps np.mean below working.
        try:
            FeHs.append(float(raw_mwsc))
        except (TypeError, ValueError):
            pass

    if (pd.notna(row['Fe/H_webda'])):
        FeHs.append(row['Fe/H_webda'])

    if (pd.notna(row['[Fe/H][Sun]_kharchenko'])):
        FeHs.append(row['[Fe/H][Sun]_kharchenko'])

    if (pd.notna(row['[Fe/H]_salaris'])):
        FeHs.append(row['[Fe/H]_salaris'])

    if (pd.notna(row['metallicity_lynga'])):
        FeHs.append(row['metallicity_lynga'])

    if (len(FeHs) > 0):
        FeHs = np.array(FeHs)
        return (np.mean(FeHs), np.std(FeHs)/(len(FeHs))**0.5, len(FeHs))
    else:
        print('NO FeH', row['Simbad_ID'])
        return (np.nan, np.nan, np.nan)

In [None]:
# Evaluate getFeH once per cluster and store mean, error and sample size.
feh_stats = [getFeH(cluster) for _, cluster in OC_df.iterrows()]
FeH = [stats[0] for stats in feh_stats]
err_FeH = [stats[1] for stats in feh_stats]
n_FeH = [stats[2] for stats in feh_stats]
cleanOC_df['FeH'] = FeH
cleanOC_df['err_FeH'] = err_FeH
cleanOC_df['N_FeH'] = n_FeH

### Masses

In [None]:
def getMass(row, mm = 0.5):
    """Average the mass estimates available for one cluster.

    Star-count columns are turned into a mass by multiplying with ``mm``
    (the mass assigned to each counted star); the two Piskunov columns are
    log10 masses and are exponentiated.

    Returns
    -------
    tuple
        (mean, standard deviation / sqrt(N), N), or three NaNs (after
        printing a message) when no estimate is available.
    """
    star_count_columns = (
        'num_cluster_stars_mwsc',
        'Nstars_cantat-gaudin',
        'Stars_webda',
        'ly_member_stars_lynga',
        'N1sr2_kharchenko',
    )
    log_mass_columns = ('logM[MSun]_piskunov', 'logMA[MSun]_piskunov')

    estimates = [row[name]*mm for name in star_count_columns if pd.notna(row[name])]
    estimates += [10.**row[name] for name in log_mass_columns if pd.notna(row[name])]

    if not estimates:
        print('NO MASS', row['Simbad_ID'])
        return (np.nan, np.nan, np.nan)

    values = np.array(estimates)
    count = len(values)
    return (np.mean(values), np.std(values)/count**0.5, count)

In [None]:
# Evaluate getMass once per cluster and store mean, error and sample size.
mass_stats = [getMass(cluster) for _, cluster in OC_df.iterrows()]
mass = [stats[0] for stats in mass_stats]
err_mass = [stats[1] for stats in mass_stats]
n_mass = [stats[2] for stats in mass_stats]
cleanOC_df['mass[Msun]'] = mass
cleanOC_df['err_mass[Msun]'] = err_mass
cleanOC_df['N_mass[Msun]'] = n_mass

### Rgc

In [None]:
def getRgc(row):
    """Average the Rgc values available for one cluster, in pc.

    The Salaris value (kpc) is multiplied by 1000; the Cantat-Gaudin value
    (pc) is used as given.

    Returns
    -------
    tuple
        (mean, standard deviation / sqrt(N), N), or three NaNs (after
        printing a message) when neither catalogue has a value.
    """
    found = []
    salaris_kpc = row['Rgc[kpc]_salaris']
    if pd.notna(salaris_kpc):
        found.append(1000*salaris_kpc)

    cantat_pc = row['Rgc[pc]_cantat-gaudin']
    if pd.notna(cantat_pc):
        found.append(cantat_pc)

    if not found:
        print('NO Rgc', row['Simbad_ID'])
        return (np.nan, np.nan, np.nan)

    values = np.array(found)
    count = len(values)
    return (np.mean(values), np.std(values)/count**0.5, count)

In [None]:
# Evaluate getRgc once per cluster and store mean, error and sample size.
rgc_stats = [getRgc(cluster) for _, cluster in OC_df.iterrows()]
rgc = [stats[0] for stats in rgc_stats]
err_rgc = [stats[1] for stats in rgc_stats]
n_rgc = [stats[2] for stats in rgc_stats]
cleanOC_df['rgc[pc]'] = rgc
cleanOC_df['err_rgc[pc]'] = err_rgc
cleanOC_df['N_rgc[pc]'] = n_rgc

### Rhm

In [None]:
def getRhm(row):
    """Average the half-mass-radius estimates available for one cluster.

    Core radii are converted with the Plummer formulae used below, the
    van den Bergh diameter is halved, and angular sizes are turned into a
    physical size with the matching distance.

    Returns
    -------
    tuple
        (mean, standard deviation / sqrt(N), N), or three NaNs (after
        printing a message) when no estimate is available.
    """

    def plummer_rhm(core_radius):
        # convert using Plummer formulae
        a = 2.**0.5*core_radius
        return a/(2.**(2./3.) -1)**0.5

    Rhms = []
    raw_king = row['king_core_radius_mwsc']
    if (pd.notna(raw_king)):
        # This column may hold text.  str.isnumeric() rejects decimals such as
        # '1.05' and does not exist on floats, so parse with float() instead
        # and skip entries that are not numbers.
        try:
            Rhms.append(plummer_rhm(float(raw_king)))
        except (TypeError, ValueError):
            pass

    # TODO: rt[pc]_piskunov and rtA[pc]_piskunov are not used yet -- can they be converted?

    if (pd.notna(row['rc[pc]_kharchenko'])):
        Rhms.append(plummer_rhm(row['rc[pc]_kharchenko']))

    if (pd.notna(row['Diam[pc]_vandenbergh'])):
        Rhms.append(row['Diam[pc]_vandenbergh']/2.) #should I try to convert this?

    if (pd.notna(row['r50[deg]_cantat-gaudin']) and pd.notna(row['dmode[pc]_cantat-gaudin'])):
        d = row['dmode[pc]_cantat-gaudin']
        # NOTE(review): r50 is halved here as if it were a diameter -- confirm.
        theta = row['r50[deg]_cantat-gaudin']/2.*np.pi/180.
        Rhms.append(d*np.tan(theta))

    if (pd.notna(row['angular_diameter_lynga']) and pd.notna(row['distance_lynga'])):
        d = row['distance_lynga']
        # NOTE(review): this conversion treats angular_diameter_lynga as degrees -- confirm the catalogue unit.
        theta = row['angular_diameter_lynga']/2.*np.pi/180.
        Rhms.append(d*np.tan(theta))

    if (len(Rhms) > 0):
        Rhms = np.array(Rhms)
        return (np.mean(Rhms), np.std(Rhms)/(len(Rhms))**0.5, len(Rhms))
    else:
        print('NO Rhm', row['Simbad_ID'])
        return (np.nan, np.nan, np.nan)

In [None]:
rhm = []
err_rhm = []
n_rhm = []
for index, row in OC_df.iterrows():
    m,em,nm  = getRhm(row)
    rhm.append(m)    
    err_rhm.append(em)
    n_rhm.append(nm)
cleanOC_df['rhm[pc]'] = rhm
cleanOC_df['err_rhm[pc]'] = err_rhm
cleanOC_df['N_rhm[pc]'] = n_rhm

## Write the file

In [None]:
# Write the cleaned table to disk, without the DataFrame index.
cleanOC_df.to_csv('OCcompiled_clean.csv', index=False)

# Make some plots

In [None]:
# Read the cleaned table back from disk and display it.
cleanOC_df = pd.read_csv('OCcompiled_clean.csv')
cleanOC_df

### Make a few plots

In [None]:
# Histograms of log10(age) and log10(mass) from the cleaned table.
# logAge / logMass were not defined anywhere, so derive them here.
# The age column is located by prefix so that a stray trailing character in
# its header does not break the lookup ('err_age[Myr]' does not match).
age_col = next(col for col in cleanOC_df.columns if col.startswith('age[Myr]'))
age_myr = cleanOC_df[age_col]
mass_msun = cleanOC_df['mass[Msun]']
# Keeping only positive values also drops NaNs, which plt.hist cannot bin.
logAge = np.log10(age_myr[age_myr > 0]) + 6.  # Myr -> yr
logMass = np.log10(mass_msun[mass_msun > 0])

f,(ax1, ax2) = plt.subplots(1,2)

ax1.hist(logAge, bins=40, density=True)
ax1.set_xlabel('log(Age [yr?])')
ax1.set_yscale('log')

ax2.hist(logMass, bins=40, density=True)
ax2.set_xlabel('log(Mass [Msun])')
ax2.set_yscale('log')

In [None]:
# gaussian_kde is not imported anywhere else in the notebook, so import it here.
from scipy.stats import gaussian_kde

# Fit a 2-D Gaussian KDE to (logAge, logMass) and compare samples drawn from
# it against the data, one histogram per dimension.
df = pd.read_csv("OCcompiled_hasAgeMass.csv")

data = np.vstack((df['logAge'].values, df['logMass'].values))
KDE = gaussian_kde(data)
sample = KDE.resample(size=int(1e5))

nbins = 40

f,(ax1, ax2) = plt.subplots(1,2)
ax1.hist(df['logAge'].values, bins=nbins, density=True)
ax1.hist(sample[0,:], bins=nbins, density=True, histtype='step')
ax1.set_xlabel('log(Age [yr?])')
ax1.set_yscale('log')

ax2.hist(df['logMass'].values, bins=nbins, density=True)
ax2.hist(sample[1,:], bins=nbins, density=True, histtype='step')
ax2.set_xlabel('log(Mass [Msun])')
ax2.set_yscale('log')

# Evaluate the KDE density at a single (logAge, logMass) point.
lt = 5
lm = 2
values = np.vstack([lt, lm])
print(KDE(values))
#NOTE: the age KDE seems to be missing the edges.  Maybe I should set those to zero automatically?