# Read in all the files, and compile them into on big .csv file

### I will use Simbad names, where available, to match everything together

*Andrew Bowen provided some of the script to read in the files. [See his GitHub repo](https://github.com/andrewbowen19/CEB_Project)*

Some alterations to files:
- changed Pismis 6 to NGC 2645 in MWSC, Piskunov and Kharchenko
- Luginbuhl-Skiff_1 is Skiff_J0614+12.9 (?) changed in WEBDA
- Skiff_2 is Skiff_J0458+43.0 (?) changed in WEBDA
- PTB 9 is a planetary nebula (not an OC --though maybe in the OC NGC 7762?) -- changed label in MWSC, removed from Kharchenko
- Collinder_258 is Harvard_5 (same PM, distance, etc.), both are in Gaia table, removed Harvard_5
- FSR_1686 is Juchert_10, changed to Juchert 10 in MWSC and Kharchenko
- changed many BH or VBH, etc. to vdBergh-Hagen
- removed Berkeley 42 from vandenbergh and Salaris because it's a GC (and included in Harris)
- removed Mrk 38 from vandenbergh because it's a pair of galaxies
- removed Mrk 50 from vandenbergh because it's a Seyfert 1 galaxy
- vdBergh-Hagen_133 is Collinder_258 is Harvard 5,removed vdBergh-Hagen 133 in lynga
- Berkeley_30 is Biurakan_9, changed to Berkeley 30 in lynga
- Berkeley_32 Biurakan_8, changed to Berkeley 32 in lynga
- removed NGC 281 from lunga (HII region)
- not sure about NGC_2579 and AH03_J0822-36.4 (possible overlap)
- removed vdBergh-Hagen_1 from vandenberg = reflection nebula
- Cl VDBH 47 is IC 2395, remove vdBergh-Hagen 47 from WEBDA and lynga
- vdBergh-Hagen_218 might be NGC_6318 , removed vdBergh-Hagen_218 from lynga
- NAME_HD_80077_Group is Pismis_11 and vdBergh-Hagen_60, removed vdBergh-Hagen_60 from lynga
- from Simbad: "NGC 2239 is an obsolete designation for NGC 2244.", removed NGC 2239 from lynga
- NAME_Trapezium_Cluster M_42, should I change Trapezium in lynga to M42? I think yes



In [1]:
import pandas as pd
import numpy as np
import sys
import time
import re

from astropy.coordinates import SkyCoord
from astropy import units 

from astroquery.simbad import Simbad
Simbad.TIMEOUT = 300 # sets the timeout to 60s

import warnings
warnings.filterwarnings('ignore', category=UserWarning, append=True)

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 50)





In [2]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [3]:
def matchToSimbad(cat_ID, fillCat = True):
    #I'd like to use the Simbad names for matching (if possible) so I reduce the duplicates, 

    match_ID = np.array([str(x) for x in cat_ID], dtype='object')
    return_ID = np.full(len(match_ID), np.nan, dtype='object')

    #fix the names
    #is this correct? : .replace('vdBergh','CL VDBH')
    for i, x in enumerate(match_ID):
        match_ID[i] = x.replace('vdBergh-Hagen_','Cl_VDBH_')\
                        .replace('vdBergh_','Cl_VDB_')\
                        .replace('FSR_','[FSR2007]_')\
                        .replace('DBSB_','[DBS2003]_')\
                        .replace('BDSB_','[BDS2003]_')\
                        .replace('BDS_','[BDS2003]_')\
                        .replace('Alessi_','Cl_Alessi_')\
                        .replace('Bica_','Cl_Bica_')\
                        .replace('Sher_','Cl_Sher_')\
                        .replace('MWSC_','[KPS2012]_MWSC_')\
                        .replace('Schuster_1','NAME_SCHUSTER_CL')\
                        .replace('ASCC_','[KPR2005]_')\
                        .replace('Andrews-Lindsay_', '[AL67]_Cl*')\
                        .replace('Arp-Madore', 'AM')\
                        .replace('LDN_988e', 'NAME_[C86]_L988_e_Cluster')\
                        .replace('Juchert-Saloran_','Juchert-Saloranta_')\
                        .replace('Ivanov', '[IBP2002]_CC')\
                        .replace('DB2001_','[DB2001]_Cl ')\
                        .replace('Havlen-Moffat_','Cl_HM ')\
                        .replace('Loden_','Cl_Loden ')\
                        .replace('Cl_Cl','Cl')\
                        .replace('[KPS2012]_[KPS2012]','[KPS2012]')\

#     result_table = Simbad.query_objects(match_ID) #stupidly, this does not return blank rows for missing data!
# below I tried matching by RA and Dec, but I'm finding that (according to authors) there are quite a few 
# unique clusters, that are very close together, and it's not clear how to get avoid mismatches.
# If it returned blank lines, then this wouldn't be an issue!!
# So I'm trying to go one line at a time, but of course Simbad doesn't like that... it rejects my connection when
# I have too many requests (and what is too many?  and how long should I wait?)
    for i, ID in enumerate(match_ID):
        result_table = Simbad.query_object(ID)
        if (result_table):
            #print(result_table)
            if (len(result_table) > 0):
                row = result_table[0]
                newID =  re.sub('\s+', ' ', row['MAIN_ID'].decode("utf-8")).strip().replace(' ','_' )
                return_ID[i] = newID
        print(f'{i} {float(i)/len(match_ID):5.3f}, {ID} {return_ID[i]} -- ', end='')
        if (i>0 and i % 50 == 0):
            time.sleep(30)
            print('\n\n')
        time.sleep(0.1)

    #fill nans with the original catalog search input?
    if (fillCat):
        for i, ID in enumerate(return_ID):
            if (pd.isna(ID)):
                return_ID[i] = match_ID[i]
        
    return return_ID

In [4]:
fixMWSC = False
fixWEBDA = False
fixPiskunov = False
fixKharchenko = False
fixSalaris = False
fixVandenBergh = False
fixGaia = False
fixLynga = False

In [5]:
#MWSC - Milky Way Star Clusters Catalog
#https://heasarc.gsfc.nasa.gov/W3Browse/all/mwsc.html
if (fixMWSC):
    mwsc_df = pd.read_csv("MWSC.txt", header=3, delimiter='|')
    mwsc_df.columns = mwsc_df.columns.str.strip()

    
    #take only the open clusters
    mwsc_df = mwsc_df.loc[(~mwsc_df['class'].str.strip().str.contains('GLOBULAR')) &
                          (~mwsc_df['class'].str.strip().str.contains('NEBULA')) &
                          (~mwsc_df['class'].str.strip().str.contains('UNIDENTIFIED'))]

    #remove the "Unnamed" columns
    mwsc_df = mwsc_df.loc[:, ~mwsc_df.columns.str.contains('^Unnamed')]

    print(set(mwsc_df['class'].values))
    
    #fix the IDs
    mwsc_ID = mwsc_df['Name']
    mwsc_ID = mwsc_ID.str.strip().str.replace( ' ','_' )
    mwsc_df['Name'] = mwsc_ID

    #add Simbad names for matching (if possible) so I reduce the duplicates 
    ID = [x for x in mwsc_df['Name']]
    simbad_ID = matchToSimbad(mwsc_ID)
    
    mwsc_df.columns = [str(col) + '_mwsc' for col in mwsc_df.columns]
    mwsc_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)

    #check for duplicates
    for index, row in  mwsc_df.iterrows():
        check = mwsc_df.loc[mwsc_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('MWSC', row['Simbad_ID'], row['Name_mwsc'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    mwsc_df.to_csv('MWSC_clean.csv', index=False)
    

mwsc_df = pd.read_csv('MWSC_clean.csv')
mwsc_df

{'                          OB ASSOCIATION/HII REGION', '                                  OPEN STAR CLUSTER'}
0 0.000, [KPS2012]_MWSC_4688 [KPS2012]_MWSC_4688 -- 1 0.000, [KPS2012]_MWSC_5684 nan -- 2 0.001, [KPS2012]_MWSC_5692 nan -- 3 0.001, [KPS2012]_MWSC_4005 [KPS2012]_MWSC_4005 -- 4 0.001, [KPS2012]_MWSC_4176 [KPS2012]_MWSC_4176 -- 5 0.002, ESO_008-06 ESO_8-6 -- 6 0.002, [KPS2012]_MWSC_4219 [KPS2012]_MWSC_4219 -- 7 0.002, [KPS2012]_MWSC_5575 nan -- 8 0.003, [KPS2012]_MWSC_4682 [KPS2012]_MWSC_4682 -- 9 0.003, [KPS2012]_MWSC_5685 nan -- 10 0.003, [KPS2012]_MWSC_5681 nan -- 11 0.004, [FSR2007]_1626 [KPS2012]_MWSC_2014 -- 12 0.004, [FSR2007]_1631 [KPS2012]_MWSC_2064 -- 13 0.004, [FSR2007]_1629 [KPS2012]_MWSC_2049 -- 14 0.005, [KPS2012]_MWSC_5688 nan -- 15 0.005, ESO_026-02 ESO_26-2 -- 16 0.006, [KPS2012]_MWSC_4137 [KPS2012]_MWSC_4137 -- 17 0.006, Melotte_227 Cl_Melotte_227 -- 18 0.006, [KPS2012]_MWSC_5679 nan -- 19 0.007, [KPS2012]_MWSC_5572 nan -- 20 0.007, [KPS2012]_MWSC_5749 nan --

401 0.138, Collinder_220 Cl_Collinder_220 -- 402 0.138, ESO_141-47 ESO_141-47 -- 403 0.139, Cl_Loden 1177 C_1355-576 -- 404 0.139, ESO_131-09 ESO_131-9 -- 405 0.139, Cl_Loden 1289 C_1410-576 -- 406 0.140, NGC_1252 NGC_1252 -- 407 0.140, [DBS2003]_40 [DBS2003]_40 -- 408 0.140, [DBS2003]_92 [DBS2003]_92 -- 409 0.141, [KPR2005]_59 [KPR2005]_59 -- 410 0.141, [FSR2007]_1696 [KPS2012]_MWSC_2264 -- 411 0.141, IC_2581 IC_2581 -- 412 0.142, Collinder_292 C_1546-575 -- 413 0.142, [FSR2007]_1583 [KPS2012]_MWSC_1953 -- 414 0.142, NGC_5715 NGC_5715 -- 415 0.143, [FSR2007]_1524 [KPS2012]_MWSC_1741 -- 416 0.143, Trumpler_19 C_1112-573 -- 417 0.143, [KPS2012]_MWSC_4349 [KPS2012]_MWSC_4349 -- 418 0.144, Ruprecht_91 Cl_Ruprecht_91 -- 419 0.144, NGC_6005 NGC_6005 -- 420 0.144, [FSR2007]_1527 [FSR2007]_1527 -- 421 0.145, [FSR2007]_1621 [KPS2012]_MWSC_2058 -- 422 0.145, [FSR2007]_1530 [FSR2007]_1530 -- 423 0.145, [FSR2007]_1620 [KPS2012]_MWSC_2054 -- 424 0.146, [KPS2012]_MWSC_4119 [KPS2012]_MWSC_4119 -- 42

801 0.275, Cl_VDBH_217 C_1712-407 -- 802 0.276, ESO_332-11 ESO_332-11 -- 803 0.276, Ruprecht_124 C_1724-407 -- 804 0.276, ESO_332-08 ESO_332-8 -- 805 0.277, Cl_VDBH_205 C_1652-405 -- 806 0.277, [FSR2007]_1378 [KPS2012]_MWSC_1412 -- 807 0.278, NGC_6124 NGC_6124 -- 808 0.278, ESO_332-13 ESO_332-13 -- 809 0.278, NGC_2849 NGC_2849 -- 810 0.279, [KPS2012]_MWSC_4146 [KPS2012]_MWSC_4146 -- 811 0.279, Ruprecht_56 C_0810-403 -- 812 0.279, Ruprecht_125 Cl_Ruprecht_125 -- 813 0.280, [FSR2007]_1397 [KPS2012]_MWSC_1510 -- 814 0.280, ESO_313-12 ESO_313-12 -- 815 0.280, [FSR2007]_1737 [FSR2007]_1737 -- 816 0.281, Ruprecht_64 C_0835-399 -- 817 0.281, [DBS2003]_115 [DBS2003]_115 -- 818 0.281, Trumpler_29 C_1738-400 -- 819 0.282, [FSR2007]_1390 [KPS2012]_MWSC_1494 -- 820 0.282, [FSR2007]_1380 [KPS2012]_MWSC_1443 -- 821 0.282, [KPS2012]_MWSC_5704 nan -- 822 0.283, [FSR2007]_1387 [KPS2012]_MWSC_1486 -- 823 0.283, [FSR2007]_1758 [KPS2012]_MWSC_2617 -- 824 0.283, NGC_6268 NGC_6268 -- 825 0.284, Teutsch_85 T

1201 0.413, Dutra-Cl_Bica_60 nan -- 1202 0.413, ESO_525-08 ESO_525-8 -- 1203 0.414, Ruprecht_139 Cl_Ruprecht_139 -- 1204 0.414, [FSR2007]_1346 [KPS2012]_MWSC_1601 -- 1205 0.414, Ruprecht_23 C_0728-232 -- 1206 0.415, Ruprecht_25 C_0734-232 -- 1207 0.415, NGC_6546 NGC_6546 -- 1208 0.415, [FSR2007]_1235 [KPS2012]_MWSC_0860 -- 1209 0.416, [FSR2007]_1312 [KPS2012]_MWSC_1440 -- 1210 0.416, [KPS2012]_MWSC_5311 nan -- 1211 0.416, [FSR2007]_0022 [KPS2012]_MWSC_2751 -- 1212 0.417, Ruprecht_17 Cl_Ruprecht_17 -- 1213 0.417, [KPR2005]_39 [KPR2005]_39 -- 1214 0.417, [KPS2012]_MWSC_5717 nan -- 1215 0.418, [FSR2007]_0023 [FSR2007]_0023 -- 1216 0.418, ESO_561-05 ESO_561-5 -- 1217 0.419, Haffner_5 C_0716-225 -- 1218 0.419, NGC_6531 M_21 -- 1219 0.419, Ruprecht_39 Cl_Ruprecht_39 -- 1220 0.420, [DBS2003]_10 [DBS2003]_10 -- 1221 0.420, [KPS2012]_MWSC_5316 nan -- 1222 0.420, [DBS2003]_13 [DBS2003]_13 -- 1223 0.421, [FSR2007]_1298 [KPS2012]_MWSC_1377 -- 1224 0.421, NGC_6469 NGC_6469 -- 1225 0.421, [KPR2005]_

1501 0.516, [FSR2007]_1221 [KPS2012]_MWSC_1390 -- 1502 0.517, [FSR2007]_0050 [KPS2012]_MWSC_2659 -- 1503 0.517, [FSR2007]_1163 [KPS2012]_MWSC_1048 -- 1504 0.517, NGC_6728 NGC_6728 -- 1505 0.518, [FSR2007]_1110 [KPS2012]_MWSC_0748 -- 1506 0.518, [FSR2007]_1117 [KPS2012]_MWSC_0827 -- 1507 0.518, Dolidze_27 C_1633-088 -- 1508 0.519, [KPS2012]_MWSC_5321 nan -- 1509 0.519, [FSR2007]_1170 [KPS2012]_MWSC_1111 -- 1510 0.519, [IBP2002]_CC_4 [IBP2002]_CC04 -- 1511 0.520, [FSR2007]_0041 [FSR2007]_0041 -- 1512 0.520, [FSR2007]_1159 [KPS2012]_MWSC_1036 -- 1513 0.520, [FSR2007]_0062 [KPS2012]_MWSC_2846 -- 1514 0.521, NGC_2349 NGC_2349 -- 1515 0.521, [FSR2007]_0063 [KPS2012]_MWSC_2855 -- 1516 0.521, Trumpler_34 Cl_Trumpler_34 -- 1517 0.522, Dias_3 Cl_Dias_3 -- 1518 0.522, NGC_2323 M_50 -- 1519 0.522, NGC_6664 NGC_6664 -- 1520 0.523, [BDS2003]_93 NAME_IRAS_06548-0815_Cluster -- 1521 0.523, [FSR2007]_1203 [KPS2012]_MWSC_1319 -- 1522 0.523, [FSR2007]_1105 [KPS2012]_MWSC_0770 -- 1523 0.524, [FSR2007]_125

1801 0.619, NGC_2312 NGC_2312 -- 1802 0.620, NGC_6709 NGC_6709 -- 1803 0.620, NGC_6724 NGC_6724 -- 1804 0.620, [KPS2012]_MWSC_5645 nan -- 1805 0.621, NGC_2141 NGC_2141 -- 1806 0.621, [KPS2012]_MWSC_4602 [KPS2012]_MWSC_4602 -- 1807 0.621, Poole_J1856+10.8 NAME_Poole_J1856+10.8 -- 1808 0.622, Poole_J1855+10.8 NAME_Poole_J1855+10.8 -- 1809 0.622, NGC_2259 NGC_2259 -- 1810 0.622, NGC_1662 NGC_1662 -- 1811 0.623, NGC_6525 NGC_6525 -- 1812 0.623, [FSR2007]_0979 [FSR2007]_0979 -- 1813 0.623, [FSR2007]_0106 [KPS2012]_MWSC_2724 -- 1814 0.624, NGC_6858 NGC_6858 -- 1815 0.624, Berkeley_43 Cl_Berkeley_43 -- 1816 0.624, [FSR2007]_0930 [KPS2012]_MWSC_0626 -- 1817 0.625, Dol-Dzim_2 nan -- 1818 0.625, [FSR2007]_0913 [KPS2012]_MWSC_0535 -- 1819 0.626, NGC_6738 NGC_6738 -- 1820 0.626, Dias_8 Cl_Dias_8 -- 1821 0.626, Dolidze_35 C_1924+115 -- 1822 0.627, NGC_6837 NGC_6837 -- 1823 0.627, NGC_2682 NGC_2682 -- 1824 0.627, [FSR2007]_0977 [KPS2012]_MWSC_0895 -- 1825 0.628, [FSR2007]_0968 [KPS2012]_MWSC_0785 --

2101 0.722, [KPR2005]_17 [KPR2005]_17 -- 2102 0.723, [FSR2007]_0796 [KPS2012]_MWSC_0480 -- 2103 0.723, Basel_4 C_0545+302 -- 2104 0.724, [KPS2012]_MWSC_5800 nan -- 2105 0.724, [FSR2007]_0839 [FSR2007]_0839 -- 2106 0.724, [FSR2007]_0860 [KPS2012]_MWSC_1027 -- 2107 0.725, Latham_1 NAME_Latham_1 -- 2108 0.725, Kronberger_68 Kronberger_68 -- 2109 0.725, Berkeley_17 Cl_Berkeley_17 -- 2110 0.726, IRAS_05439+3035 HHL_31 -- 2111 0.726, [BDS2003]_74 [BDS2003]_74 -- 2112 0.726, [FSR2007]_0250 [KPS2012]_MWSC_3556 -- 2113 0.727, [FSR2007]_0833 [KPS2012]_MWSC_0726 -- 2114 0.727, Skiff_J0507+30.8 NAME_Skiff_J0507+30.8 -- 2115 0.727, Kronberger_52 Kronberger_52 -- 2116 0.728, [FSR2007]_0817 [KPS2012]_MWSC_0622 -- 2117 0.728, Czernik_18 C_0424+308 -- 2118 0.728, Teutsch_45 Teutsch_45 -- 2119 0.729, Koposov_36 [FSR2007]_0814 -- 2120 0.729, Teutsch_124 Teutsch_124 -- 2121 0.729, DC_8 BD+01_4828 -- 2122 0.730, [FSR2007]_0815 [FSR2007]_0815 -- 2123 0.730, [FSR2007]_0848 [KPS2012]_MWSC_0912 -- 2124 0.730, 

2401 0.826, NGC_7093 NGC_7093 -- 2402 0.826, Barkhatova_1 nan -- 2403 0.826, Berkeley_89 Cl_Berkeley_89 -- 2404 0.827, [FSR2007]_0686 [KPS2012]_MWSC_0378 -- 2405 0.827, [FSR2007]_0241 [KPS2012]_MWSC_3236 -- 2406 0.827, Basel_12 C_2108+460 -- 2407 0.828, [FSR2007]_0254 [KPS2012]_MWSC_3310 -- 2408 0.828, [KPR2005]_124 [KPR2005]_124 -- 2409 0.828, NGC_7062 NGC_7062 -- 2410 0.829, NGC_6811 NGC_6811 -- 2411 0.829, NGC_1883 NGC_1883 -- 2412 0.829, Basel_13 C_2110+463 -- 2413 0.830, NGC_7209 NGC_7209 -- 2414 0.830, [KPR2005]_23 [KPR2005]_23 -- 2415 0.830, [KPS2012]_MWSC_5370 nan -- 2416 0.831, Berkeley_90 Cl_Berkeley_90 -- 2417 0.831, Juchert_20 Juchert_20 -- 2418 0.831, [FSR2007]_0275 [KPS2012]_MWSC_3434 -- 2419 0.832, [KPR2005]_119 [KPR2005]_119 -- 2420 0.832, SAI_141 [KPS2012]_MWSC_3458 -- 2421 0.833, [FSR2007]_0305 [KPS2012]_MWSC_3565 -- 2422 0.833, NGC_7082 NGC_7082 -- 2423 0.833, [FSR2007]_0293 [KPS2012]_MWSC_3514 -- 2424 0.834, Teutsch_156 Teutsch_156 -- 2425 0.834, [KPS2012]_MWSC_5371

2751 0.946, Czernik_44 Cl_Czernik_44 -- 2752 0.946, King_12 Cl_King_12 -- 2753 0.947, Czernik_17 Cl_Czernik_17 -- 2754 0.947, Stock_24 Cl_Stock_24 -- 2755 0.947, Teutsch_162 TYC_4051-2533-1 -- 2756 0.948, IRAS_02232+6138 NAME_W_3_OH -- 2757 0.948, Teutsch_55 Teutsch_55 -- 2758 0.948, [FSR2007]_0404 [KPS2012]_MWSC_3670 -- 2759 0.949, [FSR2007]_0524 [KPS2012]_MWSC_0092 -- 2760 0.949, Stock_17 Cl_Stock_17 -- 2761 0.949, NGC_366 NGC_366 -- 2762 0.950, Czernik_7 Cl_Czernik_7 -- 2763 0.950, Stock_3 C_0109+620 -- 2764 0.950, SAI_16 [KPS2012]_MWSC_0165 -- 2765 0.951, [KPR2005]_117 [KPR2005]_117 -- 2766 0.951, IRAS_02541+6208 IRAS_02541+6208 -- 2767 0.952, Czernik_13 C_0240+621 -- 2768 0.952, Pfleiderer_4 NAME_PWM_4 -- 2769 0.952, NGC_1502 NGC_1502 -- 2770 0.953, [FSR2007]_0346 [KPS2012]_MWSC_3495 -- 2771 0.953, Berkeley_7 Cl_Berkeley_7 -- 2772 0.953, [FSR2007]_0498 [FSR2007]_0498 -- 2773 0.954, [KPR2005]_130 [KPR2005]_130 -- 2774 0.954, IRAS_02497+6217 IRAS_02497+6217 -- 2775 0.954, [FSR2007]_

Unnamed: 0,Simbad_ID,Name_mwsc,broad_type_mwsc,cluster_status_mwsc,ra_mwsc,dec_mwsc,lii_mwsc,bii_mwsc,core_radius_mwsc,central_radius_mwsc,cluster_radius_mwsc,pm_ra_mwsc,pm_dec_mwsc,pm_tot_error_mwsc,rad_vel_mwsc,rad_vel_error_mwsc,num_rad_vel_stars_mwsc,num_core_stars_mwsc,num_central_stars_mwsc,num_cluster_stars_mwsc,distance_mwsc,e_bv_mwsc,distance_modulus_mwsc,e_jk_mwsc,e_jh_mwsc,delta_h_mwsc,log_age_mwsc,log_age_error_mwsc,num_log_age_stars_mwsc,king_core_radius_mwsc,king_core_radius_error_mwsc,king_tidal_radius_mwsc,king_tidal_radius_error_mwsc,king_norm_factor_mwsc,king_norm_factor_error_mwsc,reference_code_mwsc,cluster_type_mwsc,metallicity_mwsc,metallicity_error_mwsc,num_metallicity_stars_mwsc,comments_mwsc,class_mwsc
0,[KPS2012]_MWSC_4688,MWSC_4688,,,23 51 54,-86 43.2,303.907,-30.295,0.020,0.100,0.185,3.20,-5.00,1.13,,,,2,22,57,1336,0.219,10.700,0.105,0.070,0.000,9.390,,,1.05,0.39,7.01,2.51,2.51,0.67,AIPk,,,,0,...,OPEN STAR CL...
1,[KPS2012]_MWSC_5684,MWSC_5684,,,12 53 43,-86 38.9,302.968,-23.776,0.020,0.080,0.155,-13.04,0.17,1.05,,,,3,19,52,1432,0.375,10.900,0.180,0.120,0.020,9.180,0.023,3,0.61,0.42,7.54,5.79,1.88,1.09,ARIs,,,,0,...,OPEN STAR CL...
2,[KPS2012]_MWSC_5692,MWSC_5692,,,17 47 20,-86 36.6,306.562,-26.146,0.025,0.095,0.135,-6.22,-9.02,1.38,,,,4,18,28,1555,0.437,11.100,0.210,0.140,0.020,8.930,,,0.76,0.56,5.18,3.82,1.51,0.77,ARIs,,,,0,...,OPEN STAR CL...
3,[KPS2012]_MWSC_4005,MWSC_4005,,,00 11 28,-85 28.8,303.852,-31.577,0.012,0.100,0.165,9.31,-1.47,0.95,,,,3,26,42,1159,0.250,10.400,0.120,0.080,-0.020,9.375,,,0.36,0.13,4.69,1.85,15.94,6.19,AIPk,,,,0,...,OPEN STAR CL...
4,[KPS2012]_MWSC_4176,MWSC_4176,,,14 27 18,-85 25.2,304.950,-22.929,0.025,0.150,0.280,-9.41,0.12,0.69,,,,4,53,155,1093,0.333,10.300,0.160,0.107,-0.030,9.315,,,0.97,0.24,6.84,1.58,6.46,1.31,AIPk,,,,0,...,OPEN STAR CL...
5,ESO_8-6,ESO_008-06,r,c,14 56 55,-83 26.7,306.593,-21.485,0.025,0.130,0.185,-5.26,-4.21,0.80,,,,5,46,78,1380,0.312,10.800,0.150,0.100,0.030,9.300,,,0.66,0.25,5.60,2.18,5.93,1.91,DIAS,,,,0,"Sparse; center is shifted to 14.9485h,-83.445d...",OPEN STAR CL...
6,[KPS2012]_MWSC_4219,MWSC_4219,,,15 42 14,-83 11.7,307.905,-22.041,0.015,0.115,0.200,-9.26,-3.14,0.74,,,,2,36,95,1606,0.375,11.150,0.180,0.120,0.020,9.100,,,2.81,0.76,9.78,2.13,1.71,0.41,AIPk,,,,0,...,OPEN STAR CL...
7,[KPS2012]_MWSC_5575,MWSC_5575,,,01 59 42,-83 03.0,300.484,-33.751,0.015,0.090,0.150,6.33,-2.50,1.96,,,,1,10,20,2191,0.302,11.800,0.145,0.097,0.015,9.200,,,1.95,1.00,12.13,6.13,0.79,0.18,ARIs,,,,0,Poor RDP. ...,OPEN STAR CL...
8,[KPS2012]_MWSC_4682,MWSC_4682,,,23 43 23,-82 57.6,305.407,-33.838,0.020,0.115,0.190,5.87,0.28,0.94,,,,2,22,58,1065,0.354,10.250,0.170,0.113,0.000,9.280,0.061,4,0.43,0.24,8.27,5.39,5.33,3.02,AIPk,,,,0,...,OPEN STAR CL...
9,[KPS2012]_MWSC_5685,MWSC_5685,,,13 05 24,-82 02.6,303.443,-19.185,0.020,0.090,0.160,-3.12,-2.73,1.14,,,,3,25,64,1581,0.406,11.125,0.195,0.130,0.000,9.150,,,0.65,0.21,7.54,2.68,9.02,2.40,ARIs,,,,0,Poor RDP. ...,OPEN STAR CL...


### Downloaded from WEBDA [here](https://webda.physics.muni.cz/cluster_selall.html)

With RA from 0 to 24 and 0 to 1e6 stars. I copied the table to WEBDA.html, and removed the $<$br$>$ entries, then converted to csv with 

https://codepen.io/malahovks/pen/gLxLWX

or

https://jsfiddle.net/gengns/j1jm2tjx/

Finally, I separated the RA and DEC column header into 2 entries.

*I also have a data file from David James, that has more clusters, but I'm not sure the providence of that, so I won't use it.*

In [6]:
# # WEBDA data file (2013)
# webda_df = pd.read_fwf("WEBDA-OC-table-June2013_DavidJames.txt", 
#                        widths = [18,14,15,11,9,8,8,8,9,6,9,9,9,7,7,9], header = 0)

if (fixWEBDA):
    webda_df = pd.read_csv('WEBDA.csv')
    #fix the IDs
    webda_ID = webda_df['Name']
    webda_ID = webda_ID.str.replace( 'NGC 0','NGC ' ).str.replace( ' ','_' )

    webda_df['Name'] = webda_ID

    #add Simbad names for matching (if possible) so I reduce the duplicates 
    simbad_ID = matchToSimbad(webda_ID)

    webda_df.columns = [str(col) + '_webda' for col in webda_df.columns]
    webda_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
        
    #check for duplicates  
    for index, row in  webda_df.iterrows():
        check = webda_df.loc[webda_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('WEBDA', row['Simbad_ID'], row['Name_webda'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    webda_df.to_csv('WEBDA_clean.csv', index=False)
    
webda_df = pd.read_csv('WEBDA_clean.csv')
webda_df

0 0.000, Berkeley_58 Cl_Berkeley_58 -- 1 0.001, Stock_18 Cl_Stock_18 -- 2 0.002, Berkeley_59 Cl_Berkeley_59 -- 3 0.003, Blanco_1 Cl_Blanco_1 -- 4 0.004, [KPR2005]_1 [KPR2005]_1 -- 5 0.005, Berkeley_1 Cl_Berkeley_1 -- 6 0.006, King_13 C_0007+609 -- 7 0.007, Cl_Alessi_20 [KPR2004b]_4 -- 8 0.009, [KPR2005]_2 [KPR2005]_2 -- 9 0.010, Mayer_1 NAME_Cl_Mayer_1 -- 10 0.011, King_1 C_0019+641 -- 11 0.012, Stock_20 Cl_Stock_20 -- 12 0.013, NGC_103 NGC_103 -- 13 0.014, Stock_21 C_0027+577 -- 14 0.015, NGC_129 NGC_129 -- 15 0.016, [KPR2005]_3 [KPR2005]_3 -- 16 0.017, NGC_133 NGC_133 -- 17 0.018, NGC_136 NGC_136 -- 18 0.019, King_14 C_0029+628 -- 19 0.020, NGC_146 NGC_146 -- 20 0.021, NGC_189 NGC_189 -- 21 0.022, Dias_1 Cl_Dias_1 -- 22 0.024, NGC_225 NGC_225 -- 23 0.025, Czernik_2 C_0040+598 -- 24 0.026, King_16 C_0040+639 -- 25 0.027, Berkeley_4 Cl_Berkeley_4 -- 26 0.028, NGC_188 NGC_188 -- 27 0.029, Berkeley_61 Cl_Berkeley_61 -- 28 0.030, IC_1590 IC_1590 -- 29 0.031, [KPR2005]_4 [KPR2005]_4 -- 30 

401 0.428, NGC_2669 NGC_2669 -- 402 0.429, NGC_2664 NGC_2664 -- 403 0.431, [KPR2005]_49 [KPR2005]_49 -- 404 0.432, Trumpler_10 C_0846-423 -- 405 0.433, Ruprecht_71 C_0847-465 -- 406 0.434, [KPR2005]_50 [KPR2005]_50 -- 407 0.435, NGC_2682 NGC_2682 -- 408 0.436, Ruprecht_72 C_0850-374 -- 409 0.437, Ruprecht_158 Cl_Ruprecht_158 -- 410 0.438, Cl_VDBH_56 C_0855-430 -- 411 0.439, Collinder_205 C_0858-487 -- 412 0.440, ESO_165-09 ESO_165-9 -- 413 0.441, ESO_166-04 ESO_166-4 -- 414 0.442, Pismis_11 NAME_HD_80077_Group -- 415 0.443, NGC_2818 NGC_2818 -- 416 0.444, Alicante_5 nan -- 417 0.446, [KPR2005]_51 [KPR2005]_51 -- 418 0.447, Pismis_12 ESO_261-5 -- 419 0.448, Cl_VDBH_63 ESO_212-2 -- 420 0.449, NGC_2866 NGC_2866 -- 421 0.450, Ruprecht_76 C_0922-515 -- 422 0.451, Cl_VDBH_66 NAME_UKS_2 -- 423 0.452, Cl_VDBH_67 C_0925-511 -- 424 0.453, IC_2488 IC_2488 -- 425 0.454, [KPR2005]_52 [KPR2005]_52 -- 426 0.455, NGC_2910 NGC_2910 -- 427 0.456, Turner_5 Cl_Turner_5 -- 428 0.457, NGC_2925 NGC_2925 -- 4

801 0.856, Roslund_2 C_1943+238 -- 802 0.857, Turner_1 NAME_S_Vul_Cluster -- 803 0.858, [KPR2005]_107 [KPR2005]_107 -- 804 0.859, Dias_7 Cl_Dias_7 -- 805 0.860, NGC_6828 NGC_6828 -- 806 0.861, NGC_6830 NGC_6830 -- 807 0.862, Czernik_41 Cl_Czernik_41 -- 808 0.863, Dias_8 Cl_Dias_8 -- 809 0.864, NGC_6834 NGC_6834 -- 810 0.865, Harvard_20 C_1950+182 -- 811 0.866, [KPR2005]_108 [KPR2005]_108 -- 812 0.868, [KPR2005]_109 [KPR2005]_109 -- 813 0.869, Loiano_1 NAME_Loiano_1 -- 814 0.870, Roslund_3 C_1956+203 -- 815 0.871, Berkeley_49 Cl_Berkeley_49 -- 816 0.872, Dolidze_36 [KPR2004b]_472 -- 817 0.873, [KPR2005]_110 [KPR2005]_110 -- 818 0.874, NGC_6866 NGC_6866 -- 819 0.875, Berkeley_84 Cl_Berkeley_84 -- 820 0.876, Cl_Alessi_10 [KPR2004b]_474 -- 821 0.877, Roslund_4 C_2002+290 -- 822 0.878, NGC_6863 NGC_6863 -- 823 0.879, Dolidze_38 C_2004+410 -- 824 0.880, NGC_6871 NGC_6871 -- 825 0.881, Biurakan_1 C_2005+355 -- 826 0.882, Biurakan_2 Cl_Biurakan_2 -- 827 0.884, Berkeley_50 IC_1310 -- 828 0.885,

Unnamed: 0,Simbad_ID,Name_webda,RA_2000_webda,Dec_2000_webda,l_webda,b_webda,Dist_webda,Mod_webda,EB-V_webda,Age_webda,ST_webda,Z_webda,Diam_webda,Fe/H_webda,MRV_webda,pm RA_webda,pm Dec_webda,Measures_webda,Stars_webda
0,Cl_Berkeley_58,Berkeley_58,00 00 12,+60 58 00,116.753,-1.289,3715.0,14.55,0.550,8.400,,-83.6,5.0,,,,,525,519
1,Cl_Stock_18,Stock_18,00 01 37,+64 37 30,117.624,2.268,2800.0,14.41,0.700,6.780,B0,110.8,6.0,,,,,2261,2261
2,Cl_Berkeley_59,Berkeley_59,00 02 13,+67 25 11,118.220,5.000,1000.0,13.78,1.220,6.800,,87.2,20.4,,-6.50,-4.40,0.73,27,21
3,Cl_Blanco_1,Blanco_1,00 04 07,-29 50 00,15.572,-79.261,269.0,7.18,0.010,7.796,B5,-264.3,70.0,0.23,,20.17,3.00,109,105
4,[KPR2005]_1,ASCC_1,00 09 35,+62 40 48,118.150,0.190,4000.0,13.51,0.160,8.250,,13.3,24.0,,-76.15,-2.07,0.46,32,32
5,Cl_Berkeley_1,Berkeley_1,00 09 36,+60 28 30,117.796,-1.979,2420.0,14.35,0.780,8.600,,-83.6,5.0,,,,,2800,2800
6,C_0007+609,King_13,00 10 06,+61 10 00,117.968,-1.306,3100.0,15.00,0.820,8.500,,-70.7,5.0,,,,,4253,3955
7,[KPR2004b]_4,Alessi_20,00 10 33,+58 45 35,117.640,-3.690,450.0,8.95,0.220,8.220,,-29.0,36.0,,,7.48,-2.61,42,42
8,[KPR2005]_2,ASCC_2,00 19 51,+55 42 35,118.460,-6.890,1200.0,10.71,0.100,8.830,,-144.0,36.0,,,-0.91,-3.94,57,57
9,NAME_Cl_Mayer_1,Mayer_1,00 21 54,+61 44 24,119.440,-0.930,1429.0,12.02,0.400,7.740,,-23.2,24.0,,-20.90,-5.27,-5.87,15146,15131


In [7]:
#Piskunov (2008)
if (fixPiskunov):
    piskunov_df = pd.read_fwf("Piskunov2008.table", 
                              widths = [6,18,7,7,8,6,6,6,6,6,9,9,6,6,9,9], header = None,
                              names = ['COCD','Name','GLON[deg]','GLAT[deg]','DistMod','E(B-V)','Dist[pc]',\
                                       'logt[yr]','rt[pc]','e_rt[pc]','logM[MSun]','e_logM[MSun]','rtA[pc]','e_rtA[pc]',
                                       'logMA[MSun]','e_logMA[MSun]'])

    piskunov_df.replace(-9.999,np.nan, inplace=True)
    piskunov_df.replace(-9.9,np.nan, inplace=True)

    #fix the IDs
    piskunov_ID = piskunov_df['Name']
    piskunov_ID = piskunov_ID.str.strip().str.replace(' ','_' )
    piskunov_df['Name'] = piskunov_ID
    
    #add Simbad names for matching (if possible) so I reduce the duplicates 
    simbad_ID = matchToSimbad(piskunov_ID)
    
    piskunov_df.columns = [str(col) + '_piskunov' for col in piskunov_df.columns]
    piskunov_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)

    #check for duplicates
    for index, row in piskunov_df.iterrows():
        check = piskunov_df.loc[piskunov_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('Piskunov', row['Simbad_ID'], row['Name_piskunov'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    piskunov_df.to_csv('Piskunov2008_clean.csv', index=False)

piskunov_df = pd.read_csv('Piskunov2008_clean.csv')
piskunov_df

0 0.000, Berkeley_58 Cl_Berkeley_58 -- 1 0.002, Berkeley_59 Cl_Berkeley_59 -- 2 0.003, Blanco_1 Cl_Blanco_1 -- 3 0.005, Cl_Alessi_20 [KPR2004b]_4 -- 4 0.006, Mayer_1 NAME_Cl_Mayer_1 -- 5 0.008, Stock_20 Cl_Stock_20 -- 6 0.009, Stock_21 C_0027+577 -- 7 0.011, NGC_129 NGC_129 -- 8 0.012, NGC_146 NGC_146 -- 9 0.014, NGC_225 NGC_225 -- 10 0.015, Berkeley_4 Cl_Berkeley_4 -- 11 0.017, IC_1590 IC_1590 -- 12 0.018, Cl_Alessi_1 NAME_Casado_Alessi_1 -- 13 0.020, NGC_381 NGC_381 -- 14 0.022, Platais_2 NAME_HIP_5671_Cluster -- 15 0.023, NGC_433 NGC_433 -- 16 0.025, NGC_457 NGC_457 -- 17 0.026, NGC_581 M_103 -- 18 0.028, NGC_637 NGC_637 -- 19 0.029, NGC_654 NGC_654 -- 20 0.031, NGC_659 NGC_659 -- 21 0.032, NGC_663 NGC_663 -- 22 0.034, Collinder_463 C_0144+717 -- 23 0.035, Stock_4 C_0149+568 -- 24 0.037, NGC_752 NGC_752 -- 25 0.038, NGC_744 NGC_744 -- 26 0.040, Stock_5 C_0200+642 -- 27 0.042, Stock_2 Cl_Stock_2 -- 28 0.043, NGC_869 NGC_869 -- 29 0.045, NGC_884 NGC_884 -- 30 0.046, NGC_886 NGC_886 --

401 0.617, NGC_6383 NGC_6383 -- 402 0.618, Trumpler_27 Cl_Trumpler_27 -- 403 0.620, Trumpler_28 Cl_Trumpler_28 -- 404 0.622, ESO_139-13 ESO_139-13 -- 405 0.623, Collinder_338 C_1734-375 -- 406 0.625, NGC_6400 NGC_6400 -- 407 0.626, NGC_6405 NGC_6405 -- 408 0.628, Trumpler_29 C_1738-400 -- 409 0.629, Cl_Alessi_9 Cl_Alessi_9 -- 410 0.631, NGC_6416 NGC_6416 -- 411 0.632, IC_4665 IC_4665 -- 412 0.634, NGC_6425 NGC_6425 -- 413 0.635, Collinder_350 C_1745+013 -- 414 0.637, Ruprecht_131 Cl_Ruprecht_131 -- 415 0.638, NGC_6444 NGC_6444 -- 416 0.640, Basel_5 C_1749-300 -- 417 0.642, Sco_OB5 Ass_Sco_OB_5 -- 418 0.643, NGC_6469 NGC_6469 -- 419 0.645, NGC_6475 NGC_6475 -- 420 0.646, Trumpler_30 C_1753-353 -- 421 0.648, NGC_6494 M_23 -- 422 0.649, Collinder_359 Cl_Melotte_186 -- 423 0.651, Ruprecht_139 Cl_Ruprecht_139 -- 424 0.652, NGC_6514 M_20 -- 425 0.654, NGC_6520 NGC_6520 -- 426 0.655, NGC_6531 M_21 -- 427 0.657, NGC_6530 NGC_6530 -- 428 0.658, NGC_6546 NGC_6546 -- 429 0.660, Cl_VDB_113 Cl_VDB_

Unnamed: 0,Simbad_ID,COCD_piskunov,Name_piskunov,GLON[deg]_piskunov,GLAT[deg]_piskunov,DistMod_piskunov,E(B-V)_piskunov,Dist[pc]_piskunov,logt[yr]_piskunov,rt[pc]_piskunov,e_rt[pc]_piskunov,logM[MSun]_piskunov,e_logM[MSun]_piskunov,rtA[pc]_piskunov,e_rtA[pc]_piskunov,logMA[MSun]_piskunov,e_logMA[MSun]_piskunov
0,Cl_Berkeley_58,1,Berkeley_58,116.73,-1.29,14.555,0.55,3715,8.20,,,,,22.9,10.9,3.380,0.623
1,Cl_Berkeley_59,2,Berkeley_59,118.22,5.00,13.782,1.22,1000,6.80,,,,,8.0,3.5,2.221,0.564
2,Cl_Blanco_1,3,Blanco_1,14.17,-79.02,7.180,0.01,269,8.32,22.8,3.8,3.646,0.219,20.0,2.4,3.480,0.160
3,[KPR2004b]_4,4,Alessi_20,117.64,-3.69,8.948,0.22,450,8.22,5.4,1.6,1.742,0.391,4.0,0.8,1.362,0.250
4,NAME_Cl_Mayer_1,5,Mayer_1,119.44,-0.93,12.015,0.40,1429,7.74,,,,,16.7,5.6,3.150,0.442
5,Cl_Stock_20,6,Stock_20,119.92,-0.10,10.413,0.20,909,8.53,,,,,6.6,1.3,1.974,0.260
6,C_0027+577,7,Stock_21,120.05,-4.83,11.447,0.40,1100,8.72,,,,,8.3,2.4,2.259,0.376
7,NGC_129,8,NGC_129,120.27,-2.54,12.759,0.55,1625,7.87,14.9,2.7,2.984,0.240,15.2,2.7,3.011,0.235
8,NGC_146,9,NGC_146,120.87,0.50,13.897,0.48,3032,7.37,,,,,20.8,7.1,3.298,0.445
9,NGC_225,10,NGC_225,122.01,-1.08,9.925,0.27,657,8.19,,,,,5.7,1.3,1.787,0.311


In [8]:
#Kharchenko (2013)
if (fixKharchenko):
    kharchenko_df = pd.read_fwf("Kharchenko2013.table", 
                              widths = [5,18,2,1,9,8,8,8,7,7,7,7,7,7,8,8,6,6,7,7,8,7,7,7,7,7,7,7,4,8,8,8,8,8,8,5,4,8,7,4], 
                              header = None,
                              names = ['MWSC','Name','Type','n_Type','RA[hr]','Dec[deg]','GLON[deg]','GLAT[deg]',
                                       'r0[deg]','r1[deg]','r2[deg]','pmRA[mas/yr]','pmDec[mas/yr]','e_pm[mas/yr]',
                                       'RV[km/s]','e_RV[km/s]','n_RV[km/s]','N1sr0','N1sr1','N1sr2','d[pc]','E(B-V)',
                                       'appDistMod[mag]','E(J-Ks)','E(J-H)','dH','logt[yr]','e_logt[yr]','Nt','rc[pc]',
                                       'e_rc[pc]','rt[pc]','e_rt[pc]','k[pc-2]','e_k[pc-2]','Src','SType','[Fe/H][Sun]',
                                       'e_[Fe/H][Sun]','n_[Fe/H]'])

    kharchenko_df['RV[km/s]'].replace(999.99,np.nan, inplace=True)
    kharchenko_df['e_RV[km/s]'].replace(99.99,np.nan, inplace=True)
    kharchenko_df['e_logt[yr]'].replace(0.000,np.nan, inplace=True)
    kharchenko_df['Nt'].replace(-1,np.nan, inplace=True)
    kharchenko_df['rc[pc]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['e_rc[pc]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['rt[pc]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['e_rt[pc]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['k[pc-2]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['e_k[pc-2]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['[Fe/H][Sun]'].replace(99.999,np.nan, inplace=True)
    kharchenko_df['e_[Fe/H][Sun]'].replace(9.99,np.nan, inplace=True)
    kharchenko_df['n_[Fe/H]'].replace(0.1,np.nan, inplace=True)

    #kharchenko_df.loc[kharchenko_df['Name'] == 'Skiff_J0458+43.0']
    
    #add Simbad names for matching (if possible) so I reduce the duplicates 
    kharchenko_ID = kharchenko_df['Name']
    simbad_ID = matchToSimbad(kharchenko_ID)

    kharchenko_df.columns = [str(col) + '_kharchenko' for col in kharchenko_df.columns]
    kharchenko_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
    
    #check for duplicates
    for index, row in kharchenko_df.iterrows():
        check = kharchenko_df.loc[kharchenko_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('Kharchenko', row['Simbad_ID'], row['Name_kharchenko'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
            
    kharchenko_df.to_csv('Kharchenko2013_clean.csv', index=False)

kharchenko_df = pd.read_csv('Kharchenko2013_clean.csv')
kharchenko_df

0 0.000, Berkeley_58 Cl_Berkeley_58 -- 1 0.000, NGC_7801 NGC_7801 -- 2 0.001, [FSR2007]_0459 [KPS2012]_MWSC_0003 -- 3 0.001, Stock_18 Cl_Stock_18 -- 4 0.001, Berkeley_59 Cl_Berkeley_59 -- 5 0.002, Cep_OB4 Ass_Cep_OB_4 -- 6 0.002, Blanco_1 Cl_Blanco_1 -- 7 0.002, Berkeley_104 Cl_Berkeley_104 -- 8 0.003, IRAS_00013+6817 IRAS_00013+6817 -- 9 0.003, [FSR2007]_0504 [KPS2012]_MWSC_0014 -- 10 0.003, BDB_94 nan -- 11 0.004, Czernik_1 C_0005+611 -- 12 0.004, SAI_1 [KPS2012]_MWSC_0018 -- 13 0.004, [KPR2005]_1 [KPR2005]_1 -- 14 0.005, Berkeley_1 Cl_Berkeley_1 -- 15 0.005, King_13 C_0007+609 -- 16 0.005, Cl_Alessi_20 [KPR2004b]_4 -- 17 0.006, [FSR2007]_0474 [KPS2012]_MWSC_0024 -- 18 0.006, [FSR2007]_0480 [KPS2012]_MWSC_0025 -- 19 0.006, PTB_7 PTB_7 -- 20 0.007, [BDS2003]_48 [BDS2003]_48 -- 21 0.007, Juchert-Saloranta_1 NAME_Juchert-Saloranta_1 -- 22 0.007, Berkeley_60 Cl_Berkeley_60 -- 23 0.008, [KPR2005]_2 [KPR2005]_2 -- 24 0.008, [FSR2007]_0486 [FSR2007]_0486 -- 25 0.008, [FSR2007]_0479 [KPS2012

401 0.133, Mamajek_3 nan -- 402 0.134, Dolidze_21 C_0524+070 -- 403 0.134, [FSR2007]_0777 [FSR2007]_0777 -- 404 0.134, [KPR2005]_19 [KPR2005]_19 -- 405 0.135, Dolidze_20 C_0525+337 -- 406 0.135, [FSR2007]_0882 [FSR2007]_0882 -- 407 0.135, Waterloo_2 Cl_Waterloo_2 -- 408 0.136, NGC_1907 NGC_1907 -- 409 0.136, Stock_8 C_0524+343 -- 410 0.136, Kronberger_1 Kronberger_1 -- 411 0.137, [KPR2005]_20 NAME_HR_1833_Cluster -- 412 0.137, NGC_1912 M_38 -- 413 0.137, [KPR2005]_21 [KPR2005]_21 -- 414 0.138, [FSR2007]_0736 [KPS2012]_MWSC_0559 -- 415 0.138, [FSR2007]_0907 [KPS2012]_MWSC_0560 -- 416 0.138, Teutsch_52 Teutsch_52 -- 417 0.139, NGC_1931 NGC_1931 -- 418 0.139, [FSR2007]_0749 [KPS2012]_MWSC_0566 -- 419 0.139, [FSR2007]_1017 [KPS2012]_MWSC_0567 -- 420 0.140, B_30 nan -- 421 0.140, NGC_1963 NGC_1963 -- 422 0.140, [FSR2007]_0884 [FSR2007]_0884 -- 423 0.141, [FSR2007]_0991 [KPS2012]_MWSC_0571 -- 424 0.141, Berkeley_20 Cl_Berkeley_20 -- 425 0.141, [FSR2007]_0761 [KPS2012]_MWSC_0574 -- 426 0.142,

801 0.267, [FSR2007]_1150 [KPS2012]_MWSC_1089 -- 802 0.267, [BDS2003]_96 [BDS2003]_96 -- 803 0.267, [FSR2007]_1178 [KPS2012]_MWSC_1091 -- 804 0.268, [FSR2007]_1192 [KPS2012]_MWSC_1092 -- 805 0.268, Ruprecht_150 C_0703-283 -- 806 0.268, Haffner_4 C_0703-149 -- 807 0.269, ESO_427-32 ESO_427-32 -- 808 0.269, [FSR2007]_1039 [KPS2012]_MWSC_1096 -- 809 0.269, [FSR2007]_1199 [KPS2012]_MWSC_1097 -- 810 0.270, Ruprecht_10 C_0704-200 -- 811 0.270, Berkeley_76 Cl_Berkeley_76 -- 812 0.270, NGC_2335 NGC_2335 -- 813 0.271, Collinder_465 C_0704-105 -- 814 0.271, Collinder_466 C_0704-107 -- 815 0.271, NGC_2331 NGC_2331 -- 816 0.272, [FSR2007]_1209 [KPS2012]_MWSC_1105 -- 817 0.272, Ruprecht_12 C_0705-281 -- 818 0.272, Ruprecht_11 C_0705-207 -- 819 0.273, [FSR2007]_1223 [KPS2012]_MWSC_1109 -- 820 0.273, [FSR2007]_1202 [KPS2012]_MWSC_1110 -- 821 0.273, [FSR2007]_1170 [KPS2012]_MWSC_1111 -- 822 0.274, [FSR2007]_1207 [KPS2012]_MWSC_1112 -- 823 0.274, [FSR2007]_0876 [KPS2012]_MWSC_1113 -- 824 0.274, NGC_233

1201 0.400, Ruprecht_67 C_0840-432 -- 1202 0.400, Mamajek_1 nan -- 1203 0.400, [DBS2003]_21 [DBS2003]_21 -- 1204 0.401, IC_2395 IC_2395 -- 1205 0.401, NGC_2659 NGC_2659 -- 1206 0.401, NGC_2660 NGC_2660 -- 1207 0.402, ESO_432-03 ESO_432-3 -- 1208 0.402, [FSR2007]_1382 [KPS2012]_MWSC_1541 -- 1209 0.402, NGC_2658 NGC_2658 -- 1210 0.403, [FSR2007]_1371 [KPS2012]_MWSC_1543 -- 1211 0.403, ESO_313-11 ESO_313-11 -- 1212 0.403, ESO_313-12 ESO_313-12 -- 1213 0.404, [FSR2007]_1399 [KPS2012]_MWSC_1546 -- 1214 0.404, Ruprecht_68 C_0842-357 -- 1215 0.404, Ruprecht_69 C_0843-474 -- 1216 0.405, SAI_94 SAI_94 -- 1217 0.405, Bochum_7 C_0843-458 -- 1218 0.405, Collinder_197 ESO_313-13 -- 1219 0.406, Collinder_196 C_0843-314 -- 1220 0.406, NGC_2670 NGC_2670 -- 1221 0.406, [FSR2007]_1434 [KPS2012]_MWSC_1555 -- 1222 0.407, NGC_2671 NGC_2671 -- 1223 0.407, ESO_260-06 ESO_260-6 -- 1224 0.407, NGC_2669 NGC_2669 -- 1225 0.408, [DBS2003]_27 [DBS2003]_27 -- 1226 0.408, NGC_2664 NGC_2664 -- 1227 0.408, ESO_211-01 

1551 0.516, [FSR2007]_1591 [KPS2012]_MWSC_1959 -- 1552 0.516, Cl_VDBH_121 [KPR2004b]_294 -- 1553 0.517, IC_2948 IC_2948 -- 1554 0.517, [KPR2005]_67 [KPR2005]_67 -- 1555 0.517, Cl_Bica_5 [KPS2012]_MWSC_1964 -- 1556 0.518, Lynga_15 C_1140-622 -- 1557 0.518, Ruprecht_95 C_1141-608 -- 1558 0.518, Stock_14 C_1141-622 -- 1559 0.519, [FSR2007]_1573 [KPS2012]_MWSC_1969 -- 1560 0.519, ESO_129-32 ESO_129-32 -- 1561 0.519, [FSR2007]_1597 [KPS2012]_MWSC_1971 -- 1562 0.520, [FSR2007]_1574 [KPS2012]_MWSC_1972 -- 1563 0.520, Cl_Loden 467 Cl_Loden_467 -- 1564 0.520, [DBS2003]_72 [DBS2003]_72 -- 1565 0.521, [FSR2007]_1595 [FSR2007]_1595 -- 1566 0.521, [DBS2003]_70 [DBS2003]_70 -- 1567 0.521, SAI_116 SAI_116 -- 1568 0.522, NGC_3909 NGC_3909 -- 1569 0.522, [DBS2003]_73 [DBS2003]_73 -- 1570 0.522, NGC_3960 NGC_3960 -- 1571 0.523, Ruprecht_96 C_1148-618 -- 1572 0.523, Cl_Loden 481 Cl_Loden_481 -- 1573 0.523, Teutsch_77 Teutsch_77 -- 1574 0.524, Cl_Loden 480 C_1153-581 -- 1575 0.524, Ruprecht_97 C_1154-623 

1901 0.633, [FSR2007]_1716 [FSR2007]_1716 -- 1902 0.633, Lynga_7 Cl_Lynga_7 -- 1903 0.633, [DBS2003]_158 [DBS2003]_158 -- 1904 0.634, [DBS2003]_144 [DBS2003]_144 -- 1905 0.634, [DBS2003]_156 [DBS2003]_156 -- 1906 0.634, [DBS2003]_157 [DBS2003]_157 -- 1907 0.635, Ruprecht_115 Cl_Ruprecht_115 -- 1908 0.635, [DBS2003]_99 [DBS2003]_99 -- 1909 0.635, NGC_6067 NGC_6067 -- 1910 0.636, Pismis_22 Cl_Pismis_22 -- 1911 0.636, Ruprecht_176 Cl_Ruprecht_176 -- 1912 0.636, [DBS2003]_102 [DBS2003]_102 -- 1913 0.637, NGC_6093 M_80 -- 1914 0.637, [DBS2003]_161 [DBS2003]_161 -- 1915 0.637, [FSR2007]_1749 [KPS2012]_MWSC_2378 -- 1916 0.638, [FSR2007]_1737 [FSR2007]_1737 -- 1917 0.638, [DBS2003]_163 [DBS2003]_163 -- 1918 0.638, Harvard_10 C_1615-548 -- 1919 0.639, NGC_6087 NGC_6087 -- 1920 0.639, [FSR2007]_1734 [KPS2012]_MWSC_2383 -- 1921 0.639, [DBS2003]_164 [DBS2003]_164 -- 1922 0.640, Lynga_8 Cl_Lynga_8 -- 1923 0.640, [DBS2003]_100 [DBS2003]_100 -- 1924 0.640, [DBS2003]_162 [DBS2003]_162 -- 1925 0.641, L

2301 0.766, [FSR2007]_0145 [KPS2012]_MWSC_2881 -- 2302 0.766, NGC_6605 NGC_6605 -- 2303 0.766, NGC_6603 NGC_6603 -- 2304 0.767, NGC_6584 NGC_6584 -- 2305 0.767, NGC_6611 M_16 -- 2306 0.767, Cl_Alessi_19 Cl_Alessi_19 -- 2307 0.768, [FSR2007]_0108 [KPS2012]_MWSC_2889 -- 2308 0.768, [FSR2007]_0076 [KPS2012]_MWSC_2891 -- 2309 0.768, NGC_6613 M_18 -- 2310 0.769, [KPR2005]_96 NAME_Cl_Ferrero_1 -- 2311 0.769, [FSR2007]_0077 [KPS2012]_MWSC_2894 -- 2312 0.769, NGC_6588 NGC_6588 -- 2313 0.770, NGC_6618 NGC_6618 -- 2314 0.770, Kronberger_2 Kronberger_2 -- 2315 0.770, Kharchenko_2 Cl_Kharchenko_2 -- 2316 0.771, NGC_6625 NGC_6625 -- 2317 0.771, [FSR2007]_0120 [KPS2012]_MWSC_2904 -- 2318 0.771, NGC_6624 NGC_6624 -- 2319 0.772, Dolidze_52 [KPS2012]_MWSC_2907 -- 2320 0.772, NGC_6626 M_28 -- 2321 0.772, [BDS2003]_8 [BDS2003]_8 -- 2322 0.773, Trumpler_33 Cl_Trumpler_33 -- 2323 0.773, Sct_OB3 Ass_Sct_OB_3 -- 2324 0.773, Ruprecht_170 Cl_Ruprecht_170 -- 2325 0.774, Dolidze_28 Cl_Dolidze_28 -- 2326 0.774, M

2651 0.882, [FSR2007]_0224 [KPS2012]_MWSC_3335 -- 2652 0.883, G_76.2-0.3 NAME_ECX6-12_IR_Cluster -- 2653 0.883, Dolidze_9 C_2023+417 -- 2654 0.883, Dolidze_11 C_2024+412 -- 2655 0.884, [FSR2007]_0210 [KPS2012]_MWSC_3340 -- 2656 0.884, ESO_026-02 ESO_26-2 -- 2657 0.884, [FSR2007]_0213 [KPS2012]_MWSC_3342 -- 2658 0.885, [DB2001]_Cl 7 NAME_DR_6_IR_Cluster -- 2659 0.885, Teutsch_30 Teutsch_30 -- 2660 0.885, IRAS_20264+4042 IRAS_20264+4042 -- 2661 0.886, Teutsch_28 Teutsch_28 -- 2662 0.886, Roslund_6 C_2027+392 -- 2663 0.886, [FSR2007]_0227 [KPS2012]_MWSC_3349 -- 2664 0.887, Dolidze_44 C_2027+415 -- 2665 0.887, IRAS_20286+4105 MITG_J203029+4115 -- 2666 0.887, Wit_4 nan -- 2667 0.888, [FSR2007]_0278 [KPS2012]_MWSC_3355 -- 2668 0.888, [FSR2007]_0128 [FSR2007]_0128 -- 2669 0.888, NGC_6939 NGC_6939 -- 2670 0.889, IRAS_20306+4005 [BNM96]_079.296+0.284 -- 2671 0.889, [FSR2007]_0236 [KPS2012]_MWSC_3362 -- 2672 0.889, Cyg_OB2 Ass_Cyg_OB_2 -- 2673 0.890, [FSR2007]_0260 [KPS2012]_MWSC_3366 -- 2674 0.

2951 0.982, Berkeley_99 Cl_Berkeley_99 -- 2952 0.982, [FSR2007]_0370 [KPS2012]_MWSC_3723 -- 2953 0.983, Dolidze_46 [KPS2012]_MWSC_3724 -- 2954 0.983, NGC_7654 M_52 -- 2955 0.983, Czernik_43 C_2323+610 -- 2956 0.984, Berkeley_100 Cl_Berkeley_100 -- 2957 0.984, [FSR2007]_0466 [KPS2012]_MWSC_3729 -- 2958 0.984, [KPR2005]_129 NAME_Alessi_J2327+55 -- 2959 0.985, [FSR2007]_0427 [KPS2012]_MWSC_3731 -- 2960 0.985, [FSR2007]_0442 [FSR2007]_0442 -- 2961 0.985, NGC_7686 NGC_7686 -- 2962 0.986, Skiff_J2330+60.2 nan -- 2963 0.986, [FSR2007]_0435 [KPS2012]_MWSC_3736 -- 2964 0.986, Berkeley_101 Cl_Berkeley_101 -- 2965 0.987, Stock_11 [KPS2012]_MWSC_3738 -- 2966 0.987, King_20 C_2331+582 -- 2967 0.987, Czernik_44 Cl_Czernik_44 -- 2968 0.988, [FSR2007]_0492 [KPS2012]_MWSC_3742 -- 2969 0.988, Stock_12 Cl_Stock_12 -- 2970 0.988, [FSR2007]_0472 [KPS2012]_MWSC_3744 -- 2971 0.989, NGC_7708 NGC_7708 -- 2972 0.989, [FSR2007]_0464 [KPS2012]_MWSC_3746 -- 2973 0.989, Aveni-Hunter_1 NAME_Aveni-Hunter_1 -- 2974 0.

Unnamed: 0,Simbad_ID,MWSC_kharchenko,Name_kharchenko,Type_kharchenko,n_Type_kharchenko,RA[hr]_kharchenko,Dec[deg]_kharchenko,GLON[deg]_kharchenko,GLAT[deg]_kharchenko,r0[deg]_kharchenko,r1[deg]_kharchenko,r2[deg]_kharchenko,pmRA[mas/yr]_kharchenko,pmDec[mas/yr]_kharchenko,e_pm[mas/yr]_kharchenko,RV[km/s]_kharchenko,e_RV[km/s]_kharchenko,n_RV[km/s]_kharchenko,N1sr0_kharchenko,N1sr1_kharchenko,N1sr2_kharchenko,d[pc]_kharchenko,E(B-V)_kharchenko,appDistMod[mag]_kharchenko,E(J-Ks)_kharchenko,E(J-H)_kharchenko,dH_kharchenko,logt[yr]_kharchenko,e_logt[yr]_kharchenko,Nt_kharchenko,rc[pc]_kharchenko,e_rc[pc]_kharchenko,rt[pc]_kharchenko,e_rt[pc]_kharchenko,k[pc-2]_kharchenko,e_k[pc-2]_kharchenko,Src_kharchenko,SType_kharchenko,[Fe/H][Sun]_kharchenko,e_[Fe/H][Sun]_kharchenko,n_[Fe/H]_kharchenko
0,Cl_Berkeley_58,1,Berkeley_58,,,0.0045,60.933,116.750,-1.326,0.025,0.087,0.155,0.56,1.56,0.26,,,0,16,88,197,2700.0,0.720,12.389,0.346,0.231,0.000,8.470,0.047,10.0,1.12,0.25,13.66,3.31,5.19,1.00,COCD,,,,0
1,NGC_7801,2,NGC_7801,,,0.0055,50.727,114.717,-11.331,0.015,0.070,0.156,-3.20,-3.47,0.71,,,0,2,14,65,1953.0,0.146,11.500,0.070,0.047,0.000,9.255,,1.0,0.61,0.33,9.93,6.14,2.67,1.44,DIAS,,,,0
2,[KPS2012]_MWSC_0003,3,FSR_0459,,,0.0085,59.242,116.446,-2.990,0.018,0.055,0.090,-1.66,-0.01,0.53,,,0,3,24,50,2926.0,1.145,12.700,0.550,0.367,0.000,7.800,,,0.39,0.21,7.65,4.63,5.62,3.53,DIAS,irc,,,0
3,Cl_Stock_18,4,Stock_18,,,0.0265,64.625,117.617,2.266,0.010,0.050,0.080,-3.59,-1.15,0.54,,,0,2,20,32,774.0,0.177,9.501,0.085,0.057,-0.030,8.680,,1.0,0.16,0.03,2.14,0.38,355.57,56.43,DIAS,,,,0
4,Cl_Berkeley_59,5,Berkeley_59,,,0.0373,67.425,118.219,5.001,0.035,0.115,0.220,-3.20,-1.11,0.38,-12.50,7.08,3,16,45,89,1000.0,1.241,10.399,0.596,0.398,-0.040,6.100,,,0.55,0.06,6.51,0.74,153.96,12.16,COCD,,,,0
5,Ass_Cep_OB_4,6,Cep_OB4,ao,,0.0490,67.500,118.299,5.062,0.120,0.345,0.760,-0.93,-2.58,0.21,0.00,,1,61,198,654,850.0,1.099,10.001,0.528,0.352,-0.030,6.100,,,9.54,2.64,13.56,1.39,16.73,10.54,MELN,ass,,,0
6,Cl_Blanco_1,7,Blanco_1,,,0.0590,-30.000,14.830,-79.098,0.500,1.400,2.350,19.71,2.28,0.18,5.48,2.04,24,82,214,266,250.0,0.012,6.994,0.006,0.004,-0.030,7.750,,,2.82,0.38,10.93,1.26,24.84,2.43,COCD,,-0.188,0.098,7
7,Cl_Berkeley_104,8,Berkeley_104,,,0.0568,63.580,117.615,1.202,0.018,0.055,0.135,-4.57,-4.37,0.44,,,0,8,34,136,3599.0,0.606,12.976,0.291,0.194,-0.005,8.805,0.031,15.0,1.13,0.27,14.92,3.92,4.51,0.99,DIAS,,0.070,,0
8,IRAS_00013+6817,9,IRAS_00013+6817,,,0.0664,68.565,118.595,6.091,0.010,0.055,0.090,-3.52,-1.55,0.87,,,0,2,9,23,1338.0,0.833,10.900,0.400,0.267,-0.020,7.150,,,0.73,0.33,5.69,2.54,1.69,0.61,BIEM,irc,,,0
9,[KPS2012]_MWSC_0014,14,FSR_0504,,,0.0970,81.840,121.227,19.116,0.010,0.045,0.080,-3.57,7.83,0.68,,,0,2,10,14,3744.0,0.104,12.900,0.050,0.033,-0.020,9.450,,,0.49,0.19,10.62,4.26,5.03,2.46,FPOS,,,,0


In [9]:
# Salaris (2004) -- no RA or Dec...
if (fixSalaris):
    #salaris_df = pd.read_csv('Salaris2004_viaWEBDA.txt', sep='\t', header = 15)
    salaris_df = pd.read_csv('Salaris2004_table1.txt', sep='\t')
    #print(salaris_df)

    #change the Hyades to Melotte 25 so that it matches with van den Bergh for position
    names = salaris_df['Name'].values
    xx = np.where(names == 'Hyades')
    names[xx] = 'Melotte_25'
    salaris_df['Name'] = [x.strip().replace(' ','_' ) for x in names]
    xx = np.where(names == 'Arp-Madore_2')
    names[xx] = 'AM_2'  

    simbad_ID = matchToSimbad(names)

    salaris_df.columns = [str(col) + '_salaris' for col in salaris_df.columns]
    salaris_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
    
    #check for duplicates
    for index, row in salaris_df.iterrows():
        check = salaris_df.loc[salaris_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('Salaris', row['Simbad_ID'], row['Name_salaris'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
            
    salaris_df.to_csv('Salaris2004_table1_clean.csv', index=False)

salaris_df = pd.read_csv('Salaris2004_table1_clean.csv')
salaris_df

0 0.000, King_2 Cl_King_2 -- 1 0.014, IC_166 IC_166 -- 2 0.029, NGC_752 NGC_752 -- 3 0.043, Berkeley_66 Cl_Berkeley_66 -- 4 0.057, NGC_1193 NGC_1193 -- 5 0.071, King_5 C_0311+525 -- 6 0.086, NGC_1245 NGC_1245 -- 7 0.100, NGC_1798 NGC_1798 -- 8 0.114, NGC_1817 NGC_1817 -- 9 0.129, Berkeley_17 Cl_Berkeley_17 -- 10 0.143, Berkeley_18 Cl_King_22 -- 11 0.157, Berkeley_20 Cl_Berkeley_20 -- 12 0.171, Berkeley_21 Cl_Berkeley_21 -- 13 0.186, Berkeley_22 Cl_Berkeley_22 -- 14 0.200, NGC_2141 NGC_2141 -- 15 0.214, NGC_2158 NGC_2158 -- 16 0.229, NGC_2194 NGC_2194 -- 17 0.243, NGC_2192 NGC_2192 -- 18 0.257, NGC_2236 NGC_2236 -- 19 0.271, NGC_2243 NGC_2243 -- 20 0.286, Trumpler_5 C_0634+094 -- 21 0.300, NGC_2266 NGC_2266 -- 22 0.314, Berkeley_29 Cl_Berkeley_29 -- 23 0.329, Berkeley_31 Cl_Berkeley_31 -- 24 0.343, Berkeley_30 Cl_Berkeley_30 -- 25 0.357, Berkeley_32 Cl_Berkeley_32 -- 26 0.371, Tombaugh_2 C_0701-207 -- 27 0.386, NGC_2324 NGC_2324 -- 28 0.400, NGC_2354 NGC_2354 -- 29 0.414, NGC_2355 NGC_2

Unnamed: 0,Simbad_ID,Name_salaris,dV_salaris,err_dV_salaris,[Fe/H]_salaris,err_[Fe/H]_salaris,t[Gyr]_salaris,err_t_salaris,Rgc[kpc]_salaris,z[pc]_salaris,flag_salaris,tJP94[Gyr]_salaris
0,Cl_King_2,King_2,2.2,0.15,0.00,0.20,5.03,1.31,12.98,-510,2,5.6
1,IC_166,IC_166,1.0,0.25,-0.27,0.15,1.32,0.43,10.74,-10,1,1.5
2,NGC_752,NGC_752,0.9,0.05,-0.09,0.06,1.24,0.20,8.75,-145,1,1.4
3,Cl_Berkeley_66,Berkeley_66,2.0,0.25,0.00,0.20,3.98,1.52,12.59,20,2,4.4
4,NGC_1193,NGC_1193,2.1,0.15,-0.35,0.11,4.23,1.08,12.00,-845,1,4.9
5,C_0311+525,King_5,0.4,0.15,-0.30,0.15,0.76,0.16,10.34,-163,2,0.9
6,NGC_1245,NGC_1245,0.7,0.15,0.10,0.15,1.06,0.23,11.09,-465,1,1.0
7,NGC_1798,NGC_1798,1.0,0.15,-0.47,0.15,1.28,0.29,11.79,290,2,1.5
8,NGC_1817,NGC_1817,0.8,0.05,-0.10,0.09,1.12,0.18,10.26,-410,1,1.3
9,Cl_Berkeley_17,Berkeley_17,2.8,0.15,-0.33,0.15,10.06,2.77,10.89,-155,1,12.6


In [10]:
#van den Bergh (2006)
#there were two rows for Berkeley 69, with slightly different values.  I kept the first one.
if (fixVandenBergh):
    vandenbergh_df = pd.read_csv('vandenbergh2006.tsv', sep='|', header = 49)

    #fix the names
    names = vandenbergh_df['SimbadName'].values
    def representsInt(s):
        try: 
            int(s)
            return True
        except ValueError:
            return False

    for i,x in enumerate(names):
        if (x[0:2] == 'Cl'):
            names[i] = x[2:]
        if (x[0:1] == 'N' and representsInt(x[1:1])):
            names[i] = 'NGC '+x[1:]
        names[i] = names[i].strip().replace('  ',' ').replace( ' ','_' )

    vandenbergh_df['Name'] = names
    
    #add Simbad names for matching (if possible) so I reduce the duplicates 
    vandenbergh_ID = vandenbergh_df['Name']
    simbad_ID = matchToSimbad(vandenbergh_ID)

    vandenbergh_df.columns = [str(col) + '_vandenbergh' for col in vandenbergh_df.columns]
    vandenbergh_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
    
    #check for duplicates
    for index, row in vandenbergh_df.iterrows():
        check = vandenbergh_df.loc[vandenbergh_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('van den Bergh', row['Simbad_ID'], row['Name_vandenbergh'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
        
    vandenbergh_df.to_csv('vandenbergh2006_clean.csv', index=False)

vandenbergh_df = pd.read_csv('vandenbergh2006_clean.csv')
vandenbergh_df

0 0.000, Trumpler_31 C_1756-281 -- 1 0.002, NGC_6520 NGC_6520 -- 2 0.003, NGC_6530 NGC_6530 -- 3 0.005, Bochum_14 Cl_Bochum_14 -- 4 0.007, NGC_6514 M_20 -- 5 0.008, NGC_6546 NGC_6546 -- 6 0.010, NGC_6531 M_21 -- 7 0.012, NGC_6494 M_23 -- 8 0.013, Trumpler_33 Cl_Trumpler_33 -- 9 0.015, Collinder_469 Cl_Collinder_469 -- 10 0.017, NGC_6603 NGC_6603 -- 11 0.018, IC_4725 IC_4725 -- 12 0.020, NGC_6613 M_18 -- 13 0.022, Collinder_394 C_1850-204 -- 14 0.024, NGC_6618 NGC_6618 -- 15 0.025, NGC_6716 NGC_6716 -- 16 0.027, Blanco_1 Cl_Blanco_1 -- 17 0.029, NGC_6611 M_16 -- 18 0.030, Trumpler_32 Cl_Trumpler_32 -- 19 0.032, NGC_6604 NGC_6604 -- 20 0.034, NGC_6631 NGC_6631 -- 21 0.035, Ruprecht_147 NGC_6774 -- 22 0.037, NGC_6649 NGC_6649 -- 23 0.039, NGC_6694 M_26 -- 24 0.040, NGC_6664 NGC_6664 -- 25 0.042, NGC_6683 NGC_6683 -- 26 0.044, NGC_6705 NGC_6705 -- 27 0.045, Basel_1 C_1845-059 -- 28 0.047, NGC_6704 NGC_6704 -- 29 0.049, Trumpler_35 Cl_Trumpler_35 -- 30 0.050, Collinder_359 Cl_Melotte_186 --

401 0.674, Bochum_7 C_0843-458 -- 402 0.676, NGC_2660 NGC_2660 -- 403 0.677, IC_2395 IC_2395 -- 404 0.679, Platais_9 NAME_HIP_45189_Cluster -- 405 0.681, NGC_2670 NGC_2670 -- 406 0.682, Collinder_205 C_0858-487 -- 407 0.684, IC_2391 IC_2391 -- 408 0.686, NGC_2669 NGC_2669 -- 409 0.687, Pismis_11 NAME_HD_80077_Group -- 410 0.689, NGC_2866 NGC_2866 -- 411 0.691, Ruprecht_76 C_0922-515 -- 412 0.692, NGC_2516 NGC_2516 -- 413 0.694, NGC_1252 NGC_1252 -- 414 0.696, NGC_2972 NGC_2972 -- 415 0.697, NGC_2910 NGC_2910 -- 416 0.699, Ruprecht_78 C_0927-534 -- 417 0.701, NGC_2925 NGC_2925 -- 418 0.703, Cl_VDBH_66 NAME_UKS_2 -- 419 0.704, Ruprecht_77 C_0925-549 -- 420 0.706, Ruprecht_79 C_0939-536 -- 421 0.708, Platais_8 Cl_Platais_8 -- 422 0.709, Ruprecht_82 C_0943-537 -- 423 0.711, Pismis_16 C_0949-529 -- 424 0.713, IC_2488 IC_2488 -- 425 0.714, Basel_20 C_0931-561 -- 426 0.716, Ruprecht_83 Cl_Ruprecht_83 -- 427 0.718, NGC_1901 NGC_1901 -- 428 0.719, NGC_3033 NGC_3033 -- 429 0.721, NGC_3105 NGC_31

Unnamed: 0,Simbad_ID,Seq_vandenbergh,Name_vandenbergh,l[deg]_vandenbergh,Diam[pc]_vandenbergh,R[pc]_vandenbergh,Z[pc]_vandenbergh,E(B-V)_vandenbergh,logT[yr]_vandenbergh,SimbadName_vandenbergh,_RA[deg]_vandenbergh,_Dec[deg]_vandenbergh
0,C_1756-281,1,Trumpler_31,2,1.43,986,-39,0.35,8.87,Trumpler_31,269.97500,-28.18333
1,NGC_6520,2,NGC_6520,2,2.29,1577,-78,0.43,7.72,NGC_6520,270.85000,-27.88333
2,NGC_6530,3,NGC_6530,6,5.42,1330,-31,0.33,6.87,NGC_6530,271.12500,-24.36667
3,Cl_Bochum_14,4,Bochum_14,6,0.34,578,-5,1.51,7.00,Bochum_14,270.50000,-23.70000
4,M_20,5,NGC_6514,7,6.65,816,-4,0.19,7.37,NGC_6514,270.59583,-23.03000
5,NGC_6546,6,NGC_6546,7,3.82,938,-23,0.49,7.85,NGC_6546,271.90000,-23.33333
6,M_21,7,NGC_6531,7,2.91,1205,-7,0.28,7.07,NGC_6531,271.05000,-22.48333
7,M_23,8,NGC_6494,9,5.30,628,31,0.36,8.48,NGC_6494,269.25000,-18.98333
8,Cl_Trumpler_33,10,Trumpler_33,12,2.55,1755,-99,0.36,7.68,Trumpler_33,276.17500,-19.71667
9,Cl_Collinder_469,11,Collinder_469,12,1.29,1481,-20,0.42,7.80,Collinder_469,274.10000,-18.21667


In [11]:
#Gaia DR2 from Cantat-Gaudin+, 2018
if (fixGaia):
    gaiaDR2_df = pd.read_csv('Cantat-Gaudin2018_GaiaDR2.tsv', sep='|', header = 56)

    names = gaiaDR2_df['Name']
    names = names.str.strip()

    for index, row in  gaiaDR2_df.iterrows():
    #for i,n in enumerate(names):
        if (names[index][0:3] == "ESO"):
            names[index] = row['SimbadName'].replace(' ','_')

    gaiaDR2_df['Name'] = names

    #this already contains the Simbad name, but they are not identical to what I'm finding 
    gaiaDR2_SID = gaiaDR2_df['SimbadName'].str.replace(' ','_').replace('Name','NAME').values
    idx = gaiaDR2_df.index[pd.isnull(gaiaDR2_df['SimbadName'])].tolist()
    gaiaDR2_SID[idx] = names[idx]
    simbad_ID = matchToSimbad(gaiaDR2_SID)
#     #because there was at least one bad match of Gulliver_15 to NGC 6561 (when Gulliver 15 is not in Simbad)
#     #also Gulliver 7 to Ruprecht 77
#     idx = gaiaDR2_df.index[pd.isnull(gaiaDR2_df['SimbadName'])].tolist()
#     simbad_ID[idx] = names[idx]


    gaiaDR2_df.columns = [str(col) + '_cantat-gaudin' for col in gaiaDR2_df.columns]
    gaiaDR2_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)

    #check for duplicates
    for index, row in  gaiaDR2_df.iterrows():
        check = gaiaDR2_df.loc[gaiaDR2_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('gaiaDR2', row['Simbad_ID'], row['Name_cantat-gaudin'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    gaiaDR2_df.to_csv('Cantat-Gaudin2018_GaiaDR2_clean.csv', index=False)

gaiaDR2_df = pd.read_csv('Cantat-Gaudin2018_GaiaDR2_clean.csv')
gaiaDR2_df#.loc[(gaiaDR2_df['Name'] == 'Collinder_258') | (gaiaDR2_df['Name'] == 'Harvard_5')]

0 0.000, [KPR2005]_10 NAME_Alessi_Teutsch_9 -- 1 0.001, [KPR2005]_101 [KPR2005]_101 -- 2 0.002, [KPR2005]_105 [KPR2005]_105 -- 3 0.002, [KPR2005]_107 [KPR2005]_107 -- 4 0.003, [KPR2005]_108 [KPR2005]_108 -- 5 0.004, [KPR2005]_11 [KPR2005]_11 -- 6 0.005, [KPR2005]_110 [KPR2005]_110 -- 7 0.006, [KPR2005]_111 [KPR2005]_111 -- 8 0.007, [KPR2005]_113 [KPR2005]_113 -- 9 0.007, [KPR2005]_114 [KPR2005]_114 -- 10 0.008, [KPR2005]_115 [KPR2005]_115 -- 11 0.009, [KPR2005]_12 [KPR2005]_12 -- 12 0.010, [KPR2005]_123 [KPR2005]_123 -- 13 0.011, [KPR2005]_127 [KPR2005]_127 -- 14 0.011, [KPR2005]_128 [KPR2005]_128 -- 15 0.012, [KPR2005]_13 [KPR2005]_13 -- 16 0.013, [KPR2005]_16 NAME_25_Ori_Group -- 17 0.014, [KPR2005]_19 [KPR2005]_19 -- 18 0.015, [KPR2005]_21 [KPR2005]_21 -- 19 0.015, [KPR2005]_23 [KPR2005]_23 -- 20 0.016, [KPR2005]_29 [KPR2005]_29 -- 21 0.017, [KPR2005]_30 [KPR2005]_30 -- 22 0.018, [KPR2005]_32 [KPR2005]_32 -- 23 0.019, [KPR2005]_41 NAME_Herschel_1 -- 24 0.020, [KPR2005]_58 [KPR2005]_

351 0.286, [FSR2007]__0442 [FSR2007]_0442 -- 352 0.287, [FSR2007]__0448 [KPS2012]_MWSC_3756 -- 353 0.287, [FSR2007]__0465 [KPS2012]_MWSC_3765 -- 354 0.288, [FSR2007]__0496 [KPS2012]_MWSC_0044 -- 355 0.289, [FSR2007]__0498 [FSR2007]_0498 -- 356 0.290, [FSR2007]__0524 [KPS2012]_MWSC_0092 -- 357 0.291, [FSR2007]__0534 [KPS2012]_MWSC_0112 -- 358 0.292, [FSR2007]__0536 [KPS2012]_MWSC_0116 -- 359 0.292, [FSR2007]__0537 [KPS2012]_MWSC_0113 -- 360 0.293, [FSR2007]__0542 [FSR2007]_0542 -- 361 0.294, [FSR2007]__0551 [KPS2012]_MWSC_0131 -- 362 0.295, [FSR2007]__0553 [KPS2012]_MWSC_0132 -- 363 0.296, [FSR2007]__0558 [KPS2012]_MWSC_0157 -- 364 0.296, [FSR2007]__0667 [KPS2012]_MWSC_0341 -- 365 0.297, [FSR2007]__0683 [KPS2012]_MWSC_0475 -- 366 0.298, [FSR2007]__0686 [KPS2012]_MWSC_0378 -- 367 0.299, [FSR2007]__0716 [KPS2012]_MWSC_0482 -- 368 0.300, [FSR2007]__0728 [KPS2012]_MWSC_0384 -- 369 0.300, [FSR2007]__0735 [FSR2007]_0735 -- 370 0.301, [FSR2007]__0771 [KPS2012]_MWSC_0460 -- 371 0.302, [FSR2007]

751 0.612, NGC_2423 NGC_2423 -- 752 0.612, NGC_2425 NGC_2425 -- 753 0.613, NGC_2428 NGC_2428 -- 754 0.614, NGC_2432 NGC_2432 -- 755 0.615, NGC_2437 NGC_2437 -- 756 0.616, NGC_2439 NGC_2439 -- 757 0.616, NGC_2447 NGC_2447 -- 758 0.617, NGC_2448 NGC_2448 -- 759 0.618, NGC_2451A NGC_2451A -- 760 0.619, NGC_2451B NAME_HIP_37742_Cluster -- 761 0.620, NGC_2453 NGC_2453 -- 762 0.621, NGC_2455 NGC_2455 -- 763 0.621, NGC_2477 NGC_2477 -- 764 0.622, NGC_2482 NGC_2482 -- 765 0.623, NGC_2489 NGC_2489 -- 766 0.624, NGC_2506 NGC_2506 -- 767 0.625, NGC_2509 NGC_2509 -- 768 0.625, NGC_2516 NGC_2516 -- 769 0.626, NGC_2527 NGC_2527 -- 770 0.627, NGC_2533 NGC_2533 -- 771 0.628, NGC_2539 NGC_2539 -- 772 0.629, NGC_2546 NGC_2546 -- 773 0.629, NGC_2547 NGC_2547 -- 774 0.630, NGC_2548 M_48 -- 775 0.631, NGC_2567 NGC_2567 -- 776 0.632, NGC_2571 NGC_2571 -- 777 0.633, NGC_2580 NGC_2580 -- 778 0.634, NGC_2587 NGC_2587 -- 779 0.634, NGC_2588 NGC_2588 -- 780 0.635, NGC_2627 NGC_2627 -- 781 0.636, NGC_2632 NGC_263

1201 0.978, Trumpler_22 Cl_Trumpler_22 -- 1202 0.979, Trumpler_23 Cl_Trumpler_23 -- 1203 0.980, Trumpler_25 C_1721-389 -- 1204 0.980, Trumpler_26 C_1725-294 -- 1205 0.981, Trumpler_28 Cl_Trumpler_28 -- 1206 0.982, Trumpler_29 C_1738-400 -- 1207 0.983, Trumpler_3 C_0307+630 -- 1208 0.984, Trumpler_30 C_1753-353 -- 1209 0.985, Trumpler_32 Cl_Trumpler_32 -- 1210 0.985, Trumpler_33 Cl_Trumpler_33 -- 1211 0.986, Trumpler_34 Cl_Trumpler_34 -- 1212 0.987, Trumpler_35 Cl_Trumpler_35 -- 1213 0.988, Trumpler_5 C_0634+094 -- 1214 0.989, Trumpler_7 Cl_Trumpler_7 -- 1215 0.989, Trumpler_9 C_0753-258 -- 1216 0.990, Turner_3 Cl_Turner_3 -- 1217 0.991, Turner_5 Cl_Turner_5 -- 1218 0.992, Turner_9 NAME_SU_Cygni_Cluster -- 1219 0.993, Waterloo_1 GSC_03719-00517 -- 1220 0.993, Waterloo_7 Cl_Waterloo_7 -- 1221 0.994, Westerlund_1 Cl_Westerlund_1 -- 1222 0.995, Westerlund_2 Cl_Westerlund_2 -- 1223 0.996, Cl_vdB_1 NAME_CV_Mon_Cluster -- 1224 0.997, Cl_vdB_80 C_0628-096 -- 1225 0.998, Cl_vdB_83 C_0638-272 --

Unnamed: 0,Simbad_ID,_RAJ2000_cantat-gaudin,_DEJ2000_cantat-gaudin,Name_cantat-gaudin,RAJ2000[deg]_cantat-gaudin,DEJ2000[deg]_cantat-gaudin,GLON[deg]_cantat-gaudin,GLAT[deg]_cantat-gaudin,r50[deg]_cantat-gaudin,Nstars_cantat-gaudin,pmRA[mas/yr]_cantat-gaudin,pmDE[mas/yr]_cantat-gaudin,plx[mas]_cantat-gaudin,dmode[pc]_cantat-gaudin,Rgc[pc]_cantat-gaudin,SimbadName_cantat-gaudin
0,NAME_Alessi_Teutsch_9,03 27 28.80,+34 58 51.6,ASCC_10,51.870,34.981,155.723,-17.770,0.558,71,-1.737,-1.368,1.459,672.0,8927.2,[KPR2005] 10
1,[KPR2005]_101,19 13 35.76,+36 22 08.4,ASCC_101,288.399,36.369,68.028,11.608,0.372,75,0.934,1.288,2.488,397.3,8202.3,[KPR2005] 101
2,[KPR2005]_105,19 42 11.52,+27 21 57.6,ASCC_105,295.548,27.366,62.825,2.063,0.648,127,1.464,-1.635,1.783,551.8,8103.0,[KPR2005] 105
3,[KPR2005]_107,19 48 39.36,+21 59 13.2,ASCC_107,297.164,21.987,58.904,-1.901,0.174,59,-0.155,-5.156,1.109,878.5,7922.3,[KPR2005] 107
4,[KPR2005]_108,19 53 13.44,+39 20 56.4,ASCC_108,298.306,39.349,74.378,6.074,0.537,230,-0.519,-1.690,0.838,1154.0,8106.7,[KPR2005] 108
5,[KPR2005]_11,03 32 13.44,+44 51 21.6,ASCC_11,53.056,44.856,150.546,-9.224,0.312,276,0.926,-3.030,1.141,854.5,9083.9,[KPR2005] 11
6,[KPR2005]_110,20 02 58.08,+33 31 40.8,ASCC_110,300.742,33.528,70.411,1.378,0.203,70,0.271,-3.132,0.497,1902.2,7908.1,[KPR2005] 110
7,[KPR2005]_111,20 11 33.84,+37 30 54.0,ASCC_111,302.891,37.515,74.714,2.056,0.537,156,-1.150,-1.524,1.166,836.9,8159.5,[KPR2005] 111
8,[KPR2005]_113,21 11 43.92,+38 38 16.8,ASCC_113,317.933,38.638,82.877,-6.589,0.529,196,0.800,-3.679,1.762,558.2,8289.5,[KPR2005] 113
9,[KPR2005]_114,21 39 57.60,+53 59 49.2,ASCC_114,324.990,53.997,97.082,1.028,0.216,150,-3.716,-3.421,1.066,913.2,8501.0,[KPR2005] 114


In [12]:
#lynga catalo: https://heasarc.gsfc.nasa.gov/W3Browse/star-catalog/lyngaclust.html
if (fixLynga):
    lynga_df = pd.read_csv('lyngaCat.txt',sep='|')

    #fix the column names
    lynga_df.rename(columns=lambda x: x.strip(), inplace=True)

    #fix all the cells
    lynga_df = lynga_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    lynga_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

    #take only the first entries for each name (some have multiple lines
    lynga_df = lynga_df.loc[pd.notna(lynga_df['Name'].str.strip())]

    #fix capitalization in the names!
    names = lynga_df['Name'].values
    for i,n in enumerate(names):
        if ((n[0:3] != 'NGC') and (n[0:2] != 'IC')):
            p1 = n.find(' ')
            n = n[0] + n[1:p1].lower() + n[p1:]

        if (n[0:9] == 'Vdb-hagen'):
            n = 'vdBergh-Hagen' + n[9:]

        if (n[0:10] == 'Hav-moffat'):
            n = 'Havlen-Moffat' + n[10:]

        if (n[0:7] == 'Vdbergh'):
            n = 'vdBergh' + n[7:]

        if (n[0:8] == 'Dol-dzim'):
            n = 'Dol-Dzim' + n[8:]

        if (n == 'Sigma ORI'):
            n = 'Sigma Ori'

        if (n == 'Trapezium'):
            n = 'M 42'
            
        names[i] = n.replace(' ','_')

    lynga_df['Name'] = names

    #print(lynga_df.loc[lynga_df['Name'] == 'NGC_2579'])
    
    #remove the "Unnamed" columns
    lynga_df = lynga_df.loc[:, ~lynga_df.columns.str.contains('^Unnamed')]
    
    #add Simbad names for matching (if possible) so I reduce the duplicates 
    lynga_ID = lynga_df['Name']
    simbad_ID = matchToSimbad(lynga_ID)

    lynga_df.columns = [str(col) + '_lynga' for col in lynga_df.columns]
    lynga_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
    
    #check for duplicates
    for index, row in  lynga_df.iterrows():
        check = lynga_df.loc[lynga_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('lynga', row['Simbad_ID'], row['Name_lynga'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    lynga_df.to_csv('lyngaCat_clean.csv', index=False)
    
lynga_df = pd.read_csv('lyngaCat_clean.csv')
lynga_df

0 0.000, Berkeley_59 Cl_Berkeley_59 -- 1 0.001, Berkeley_104 Cl_Berkeley_104 -- 2 0.002, Blanco_1 Cl_Blanco_1 -- 3 0.003, Stock_19 C_0001+557 -- 4 0.004, Czernik_1 C_0005+611 -- 5 0.005, Berkeley_1 Cl_Berkeley_1 -- 6 0.006, King_13 C_0007+609 -- 7 0.006, Berkeley_60 Cl_Berkeley_60 -- 8 0.007, King_1 C_0019+641 -- 9 0.008, Berkeley_2 Cl_Berkeley_2 -- 10 0.009, NGC_103 NGC_103 -- 11 0.010, NGC_110 NGC_110 -- 12 0.011, NGC_129 NGC_129 -- 13 0.012, Stock_21 C_0027+577 -- 14 0.013, NGC_133 NGC_133 -- 15 0.014, NGC_136 NGC_136 -- 16 0.015, King_14 C_0029+628 -- 17 0.016, King_15 Cl_King_15 -- 18 0.017, NGC_146 NGC_146 -- 19 0.018, Stock_24 Cl_Stock_24 -- 20 0.019, NGC_189 NGC_189 -- 21 0.019, NGC_225 NGC_225 -- 22 0.020, King_16 C_0040+639 -- 23 0.021, Czernik_2 C_0040+598 -- 24 0.022, NGC_188 NGC_188 -- 25 0.023, Berkeley_4 Cl_Berkeley_4 -- 26 0.024, Berkeley_61 Cl_Berkeley_61 -- 27 0.025, Dolidze_13 C_0047+638 -- 28 0.026, King_2 Cl_King_2 -- 29 0.027, Berkeley_62 Cl_Berkeley_62 -- 30 0.02

401 0.372, NGC_2587 NGC_2587 -- 402 0.373, Collinder_187 C_0822-289 -- 403 0.374, Ruprecht_60 C_0822-470 -- 404 0.375, Ruprecht_61 C_0823-340 -- 405 0.376, Ruprecht_157 C_0827-189 -- 406 0.377, Pismis_3 C_0829-385 -- 407 0.378, Ruprecht_63 C_0831-481 -- 408 0.378, Ruprecht_62 C_0830-194 -- 409 0.379, Pismis_4 C_0832-441 -- 410 0.380, NGC_2627 NGC_2627 -- 411 0.381, Ruprecht_64 C_0835-399 -- 412 0.382, Pismis_5 ESO_313-7 -- 413 0.383, NGC_2635 NGC_2635 -- 414 0.384, Ruprecht_65 C_0837-438 -- 415 0.385, Pismis_6 NGC_2645 -- 416 0.386, NGC_2632 NGC_2632 -- 417 0.387, IC_2391 IC_2391 -- 418 0.388, Ruprecht_66 Cl_Ruprecht_66 -- 419 0.389, Pismis_7 C_0839-385 -- 420 0.390, IC_2395 IC_2395 -- 421 0.391, Pismis_8 C_0839-461 -- 422 0.391, Ruprecht_67 C_0840-432 -- 423 0.392, NGC_2660 NGC_2660 -- 424 0.393, NGC_2659 NGC_2659 -- 425 0.394, NGC_2658 NGC_2658 -- 426 0.395, Ruprecht_69 C_0843-474 -- 427 0.396, Collinder_197 ESO_313-13 -- 428 0.397, Ruprecht_68 C_0842-357 -- 429 0.398, NGC_2669 NGC_2

801 0.743, Ruprecht_147 NGC_6774 -- 802 0.744, Berkeley_44 Cl_Berkeley_44 -- 803 0.745, Berkeley_45 Cl_Berkeley_45 -- 804 0.746, NGC_6791 NGC_6791 -- 805 0.747, NGC_6793 NGC_6793 -- 806 0.748, King_25 Cl_King_25 -- 807 0.749, Collinder_399 Cl_Collinder_399 -- 808 0.750, Dolidze_35 C_1924+115 -- 809 0.750, NGC_6800 NGC_6800 -- 810 0.751, Berkeley_47 Cl_Berkeley_47 -- 811 0.752, King_26 Cl_King_26 -- 812 0.753, NGC_6802 NGC_6802 -- 813 0.754, Stock_1 C_1933+251 -- 814 0.755, NGC_6811 NGC_6811 -- 815 0.756, Collinder_401 C_1935+002 -- 816 0.757, NGC_6819 NGC_6819 -- 817 0.758, Czernik_40 Cl_Czernik_40 -- 818 0.759, NGC_6823 NGC_6823 -- 819 0.760, Roslund_1 C_1942+174 -- 820 0.761, Roslund_2 C_1943+238 -- 821 0.762, Berkeley_48 Cl_Berkeley_48 -- 822 0.763, Czernik_41 Cl_Czernik_41 -- 823 0.763, NGC_6830 NGC_6830 -- 824 0.764, NGC_6834 NGC_6834 -- 825 0.765, Harvard_20 C_1950+182 -- 826 0.766, NGC_6846 NGC_6846 -- 827 0.767, Roslund_3 C_1956+203 -- 828 0.768, Berkeley_49 Cl_Berkeley_49 -- 8

Unnamed: 0,Simbad_ID,Name_lynga,ra_lynga,dec_lynga,distance_lynga,log_age_lynga,angular_diameter_lynga,alt_name_lynga,lii_lynga,bii_lynga,iau_num_lynga,seq_code_lynga,seq_num_lynga,prec_ra_lynga,prec_dec_lynga,lund_record_num_lynga,ocl_num_lynga,ref_angular_diameter_lynga,ref_distance_lynga,ref_log_age_lynga,metallicity_lynga,ref_metallicity_lynga,e_bv_lynga,ref_e_bv_lynga,type_flag_lynga,ref_type_flag_lynga,tr_concent_class_lynga,tr_range_class_lynga,tr_richness_class_lynga,tr_nebulosity_lynga,sb_bs_mag_lynga,sb_spect_code_lynga,sb_total_mag_lynga,sk_total_mag_lynga,sk_bv_color_lynga,sk_num_stars_lynga,ja_star_num_lynga,ja_class_lynga,ja_max_class_lynga,ja_richness_lynga,ja_e_bv_lynga,ref_ja_e_bv_lynga,ja_bv_turnoff_lynga,ref_ja_bv_turnoff_lynga,ly_tr_concent_class_lynga,ly_tr_range_class_lynga,ly_tr_richness_class_lynga,ly_tr_nebulosity_lynga,ly_member_stars_lynga,ly_angular_diameter_lynga,ly_refs_flag_lynga,radvel_weight_lynga,radvel_lynga,radvel_weight_class_lynga,ref_radvel1_lynga,ref_radvel2_lynga,ref_radvel3_lynga,ref_radvel4_lynga,ref_radvel5_lynga,basel_spect_code_lynga,basel_color_type_lynga,neg_ra_tracer_lynga,neg_lii_tracer_lynga,neg_seq_num_tracer_lynga,pos_ra_tracer_lynga,pos_lii_tracer_lynga,pos_seq_num_tracer_lynga,jdl_distance_lynga,jdl_distance_weight_lynga,jdl_turnoff_color_lynga,jdl_age_lynga,jdl_age_weight_lynga,jdl_reddening_lynga,jdl_reddening_flag_lynga,jdl_reddening_weight_lynga,ref_jdl1_lynga,ref_jdl2_lynga,ref_jdl3_lynga,ref_jdl4_lynga,ref_jdl5_lynga,ref_jdl6_lynga
0,Cl_Berkeley_59,Berkeley_59,0.64425,67.37840,,,10.0,0000+671,118.25,4.95,C0000+671,3,59,0.52,3.3,3,286.0,419.0,,,,,,,,,3,2,P,,11.0,,,,,,,,,,,,,,1.0,3.0,M,N,40.0,10.0,1,,,,,,,,,0,0,2,9,1,4,10,10,,,,,,,,,,,,,,
1,Cl_Berkeley_104,Berkeley_104,0.87085,63.59506,,,4.0,0000+633,117.63,1.22,C0000+633,3,104,0.52,3.3,4,282.0,419.0,,,,,,,,,4,2,P,,16.0,,,,,,,,,,,,,,2.0,1.0,P,-,15.0,3.0,1,,,,,,,,,0,0,3,2,1044,5,7,0,,,,,,,,,,,,,,
2,Cl_Blanco_1,Blanco_1,1.06343,-29.92162,190.0,7.70,89.0,0001-302,14.97,-79.26,C0001-302,24,1,0.51,3.3,5,43.0,49.0,170.0,374.0,0.03,322.0,0.09,64,,,3,2,M,,8.0,10105.0,,,,,20.0,1.0,1.0,1.0,0.00,380.0,-0.15,380.0,4.0,3.0,M,-,30.0,70.0,1,0.0,5.0,2.0,1.0,73.0,,,,0,0,4,869,0,6,827,0,240.0,4.0,-0.15,70.0,3.0,0.02,,3.0,138.0,123.0,,,,
3,C_0001+557,Stock_19,1.09607,56.02838,,,3.0,0001+557,116.35,-6.24,C0001+557,11,19,0.52,3.3,6,274.0,437.0,,,,,,,,,2,2,P,,8.0,10105.0,,,,,,,,,,,,,3.0,1.0,P,-,6.0,2.0,1,,,,,,,,,0,0,5,1051,2,7,1052,44,,,,,,,,,,,,,,
4,C_0005+611,Czernik_1,1.92972,61.41163,,,9.0,0005+611,117.73,-1.02,C0005+611,4,1,0.52,3.3,7,283.0,117.0,,,,,,,DO,19.0,4,2,M,,,,,,,,,,,,,,,,1.0,2.0,P,-,12.0,3.0,1,,,,,,,,,0,0,6,4,0,8,8,27,,,,,,,,,,,,,,
5,Cl_Berkeley_1,Berkeley_1,2.40822,60.42822,,,5.0,0007+601,117.79,-2.03,C0007+601,3,1,0.53,3.3,8,284.0,419.0,,,,,,,,,4,2,P,,,,,,,,,,,,,,,,3.0,1.0,P,-,10.0,5.0,1,,,,,,,,,0,0,7,7,0,9,9,12,,,,,,,,,,,,,,
6,C_0007+609,King_13,2.53490,61.21153,,,7.0,0007+609,117.98,-1.28,C0007+609,10,13,0.53,3.3,9,285.0,329.0,,,,,,,,,4,2,P,,12.0,,,,,,,,,,,,,,2.0,2.0,M,-,30.0,5.0,1,,,,,,,,,0,0,8,8,1050,10,3,19,,,,,,,,,,,,,,
7,Cl_Berkeley_60,Berkeley_60,4.42607,60.96103,,,4.0,0015+606,118.85,-1.64,C0015+606,3,60,0.54,3.3,10,288.0,419.0,,,,,,,,,4,2,P,,14.0,,,,,,,,,,,,,,3.0,1.0,P,-,20.0,3.0,1,,,,,,,,,0,0,9,3,3,11,1155,30,,,,,,,,,,,,,,
8,C_0019+641,King_1,5.49230,64.39395,,,7.0,0019+641,119.75,1.69,C0019+641,10,1,0.55,3.3,11,290.0,284.0,,,,,,,,,3,2,P,,13.0,,,,,,,,,,,,,,2.0,2.0,R,-,100.0,9.0,1,,,,,,,,,0,0,10,12,0,1155,13,32,,,,,,,,,,,,,,
9,Cl_Berkeley_2,Berkeley_2,6.31620,60.39356,,,4.0,0022+601,119.70,-2.31,C0022+601,3,2,0.55,3.3,12,289.0,419.0,,,,,,,,,1,3,M,,15.0,,,,,,,,,,,,,,1.0,1.0,M,-,30.0,2.0,1,,,,,,,,,0,0,44,1155,8,13,11,29,,,,,,,,,,,,,,


In [13]:
#mwsc + webda
mwsc_webda_df = mwsc_df.join(webda_df.set_index('Simbad_ID'),
                            on='Simbad_ID',how='outer', lsuffix='_mwsc', rsuffix='_webda')
      
# + piskunov
mwsc_webda_piskunov_df = mwsc_webda_df.join(piskunov_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_piskunov')

# + kharchenko
mwsc_webda_piskunov_kharchenko_df = mwsc_webda_piskunov_df.join(kharchenko_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_kharchenko')

# + Salaris
mwsc_webda_piskunov_kharchenko_salaris_df = mwsc_webda_piskunov_kharchenko_df.join(salaris_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_salaris')

# + vandenberg
mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_df = mwsc_webda_piskunov_kharchenko_salaris_df.join(
                            vandenbergh_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_vandenbergh')


# + Gaia
mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_df = mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_df.join(
                            gaiaDR2_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_cantat-gaudin')

# + Lynga
mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_lynga_df = mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_df.join(
                            lynga_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_lynga')

#rename
OCs = mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_lynga_df.copy()

print(len(OCs.loc[pd.isnull(OCs['Simbad_ID'])]))

#reindex
OCs = OCs.reset_index(drop=True)

#if there are NaN rows; drop them
idx = OCs.index[pd.isnull(OCs['Simbad_ID'])]
OCs.drop(idx, inplace=True)

#reindex
OCs = OCs.reset_index(drop=True)


print(len(mwsc_df), len(webda_df), len(piskunov_df), len(kharchenko_df), len(salaris_df), len(vandenbergh_df),
     len(gaiaDR2_df), len(lynga_df), len(mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_df),
      len(mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_lynga_df), len(OCs))

print(OCs.columns.values)

#dump to a file
OCs.to_csv('OCcompiled.csv', index=False)


0
2908 936 650 3005 70 595 1228 1078 3314 3353 3353
['Simbad_ID' 'Name_mwsc' 'broad_type_mwsc' 'cluster_status_mwsc' 'ra_mwsc'
 'dec_mwsc' 'lii_mwsc' 'bii_mwsc' 'core_radius_mwsc' 'central_radius_mwsc'
 'cluster_radius_mwsc' 'pm_ra_mwsc' 'pm_dec_mwsc' 'pm_tot_error_mwsc'
 'rad_vel_mwsc' 'rad_vel_error_mwsc' 'num_rad_vel_stars_mwsc'
 'num_core_stars_mwsc' 'num_central_stars_mwsc' 'num_cluster_stars_mwsc'
 'distance_mwsc' 'e_bv_mwsc' 'distance_modulus_mwsc' 'e_jk_mwsc'
 'e_jh_mwsc' 'delta_h_mwsc' 'log_age_mwsc' 'log_age_error_mwsc'
 'num_log_age_stars_mwsc' 'king_core_radius_mwsc'
 'king_core_radius_error_mwsc' 'king_tidal_radius_mwsc'
 'king_tidal_radius_error_mwsc' 'king_norm_factor_mwsc'
 'king_norm_factor_error_mwsc' 'reference_code_mwsc' 'cluster_type_mwsc'
 'metallicity_mwsc' 'metallicity_error_mwsc' 'num_metallicity_stars_mwsc'
 'comments_mwsc' 'class_mwsc' 'Name_webda' 'RA_2000_webda'
 'Dec_2000_webda' 'l_webda' 'b_webda' 'Dist_webda' 'Mod_webda'
 'EB-V_webda' 'Age_webda' 'ST_web

In [14]:
#a quick check to make sure that items matched up
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
#print(OCs.loc[OCs['Simbad_ID'] == 'C_1440+697'].iloc[0])
print(OCs.loc[OCs['Simbad_ID'] == 'NGC_2682'].iloc[0])

Simbad_ID                                                                NGC_2682
Name_mwsc                                                                NGC_2682
broad_type_mwsc                                                                  
cluster_status_mwsc                                                              
ra_mwsc                                                                  08 51 23
dec_mwsc                                                                 +11 48.9
lii_mwsc                                                                  215.691
bii_mwsc                                                                   31.923
core_radius_mwsc                                                              0.1
central_radius_mwsc                                                          0.55
cluster_radius_mwsc                                                          1.03
pm_ra_mwsc                                                                  -7.31
pm_dec_mwsc     

In [15]:
#check for duplicates (by definition there should be none)
for index, row in  OCs.iterrows():
    check = OCs.loc[OCs['Simbad_ID'] == row['Simbad_ID']]
    if (len(check['Simbad_ID']) != 1):
        print('OCS', row['Simbad_ID'])
        print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
        print('')

In [16]:
check = OCs.loc[OCs['Simbad_ID'].str.contains('Berkeley_69')]
print(len(check['Simbad_ID']))


1


In [17]:
check = OCs.loc[OCs['Simbad_ID'] == 'NAME_Trapezium_Cluster']
#check = mwsc_df.loc[mwsc_df['Simbad_ID'] == '[BDS2003]_73']
#check = mwsc_df.loc[mwsc_df['Name']== 'BDSB_73']
#print(len(check['Simbad_ID']))
check

Unnamed: 0,Simbad_ID,Name_mwsc,broad_type_mwsc,cluster_status_mwsc,ra_mwsc,dec_mwsc,lii_mwsc,bii_mwsc,core_radius_mwsc,central_radius_mwsc,cluster_radius_mwsc,pm_ra_mwsc,pm_dec_mwsc,pm_tot_error_mwsc,rad_vel_mwsc,rad_vel_error_mwsc,num_rad_vel_stars_mwsc,num_core_stars_mwsc,num_central_stars_mwsc,num_cluster_stars_mwsc,distance_mwsc,e_bv_mwsc,distance_modulus_mwsc,e_jk_mwsc,e_jh_mwsc,delta_h_mwsc,log_age_mwsc,log_age_error_mwsc,num_log_age_stars_mwsc,king_core_radius_mwsc,king_core_radius_error_mwsc,king_tidal_radius_mwsc,king_tidal_radius_error_mwsc,king_norm_factor_mwsc,king_norm_factor_error_mwsc,reference_code_mwsc,cluster_type_mwsc,metallicity_mwsc,metallicity_error_mwsc,num_metallicity_stars_mwsc,comments_mwsc,class_mwsc,Name_webda,RA_2000_webda,Dec_2000_webda,l_webda,b_webda,Dist_webda,Mod_webda,EB-V_webda,Age_webda,ST_webda,Z_webda,Diam_webda,Fe/H_webda,MRV_webda,pm RA_webda,pm Dec_webda,Measures_webda,Stars_webda,COCD_piskunov,Name_piskunov,GLON[deg]_piskunov,GLAT[deg]_piskunov,DistMod_piskunov,E(B-V)_piskunov,Dist[pc]_piskunov,logt[yr]_piskunov,rt[pc]_piskunov,e_rt[pc]_piskunov,logM[MSun]_piskunov,e_logM[MSun]_piskunov,rtA[pc]_piskunov,e_rtA[pc]_piskunov,logMA[MSun]_piskunov,e_logMA[MSun]_piskunov,MWSC_kharchenko,Name_kharchenko,Type_kharchenko,n_Type_kharchenko,RA[hr]_kharchenko,Dec[deg]_kharchenko,GLON[deg]_kharchenko,GLAT[deg]_kharchenko,r0[deg]_kharchenko,r1[deg]_kharchenko,r2[deg]_kharchenko,pmRA[mas/yr]_kharchenko,pmDec[mas/yr]_kharchenko,e_pm[mas/yr]_kharchenko,RV[km/s]_kharchenko,e_RV[km/s]_kharchenko,n_RV[km/s]_kharchenko,N1sr0_kharchenko,N1sr1_kharchenko,N1sr2_kharchenko,d[pc]_kharchenko,E(B-V)_kharchenko,appDistMod[mag]_kharchenko,E(J-Ks)_kharchenko,E(J-H)_kharchenko,dH_kharchenko,logt[yr]_kharchenko,e_logt[yr]_kharchenko,Nt_kharchenko,rc[pc]_kharchenko,e_rc[pc]_kharchenko,rt[pc]_kharchenko,e_rt[pc]_kharchenko,k[pc-2]_kharchenko,e_k[pc-2]_kharchenko,Src_kharchenko,SType_kharchenko,[Fe/H][Sun]_kharchenko,e_[Fe/H][Sun]_kharchenko,n_[Fe/H]_kharchenko,Name_salaris,dV_salaris,err_dV_salaris,[Fe/H]_salaris,err_[Fe/H]_salaris,t[Gyr]_salaris,err_t_salaris,Rgc[kpc]_salaris,z[pc]_salaris,flag_salaris,tJP94[Gyr]_salaris,Seq_vandenbergh,Name_vandenbergh,l[deg]_vandenbergh,Diam[pc]_vandenbergh,R[pc]_vandenbergh,Z[pc]_vandenbergh,E(B-V)_vandenbergh,logT[yr]_vandenbergh,SimbadName_vandenbergh,_RA[deg]_vandenbergh,_Dec[deg]_vandenbergh,_RAJ2000_cantat-gaudin,_DEJ2000_cantat-gaudin,Name_cantat-gaudin,RAJ2000[deg]_cantat-gaudin,DEJ2000[deg]_cantat-gaudin,GLON[deg]_cantat-gaudin,GLAT[deg]_cantat-gaudin,r50[deg]_cantat-gaudin,Nstars_cantat-gaudin,pmRA[mas/yr]_cantat-gaudin,pmDE[mas/yr]_cantat-gaudin,plx[mas]_cantat-gaudin,dmode[pc]_cantat-gaudin,Rgc[pc]_cantat-gaudin,SimbadName_cantat-gaudin,Name_lynga,ra_lynga,dec_lynga,distance_lynga,log_age_lynga,angular_diameter_lynga,alt_name_lynga,lii_lynga,bii_lynga,iau_num_lynga,seq_code_lynga,seq_num_lynga,prec_ra_lynga,prec_dec_lynga,lund_record_num_lynga,ocl_num_lynga,ref_angular_diameter_lynga,ref_distance_lynga,ref_log_age_lynga,metallicity_lynga,ref_metallicity_lynga,e_bv_lynga,ref_e_bv_lynga,type_flag_lynga,ref_type_flag_lynga,tr_concent_class_lynga,tr_range_class_lynga,tr_richness_class_lynga,tr_nebulosity_lynga,sb_bs_mag_lynga,sb_spect_code_lynga,sb_total_mag_lynga,sk_total_mag_lynga,sk_bv_color_lynga,sk_num_stars_lynga,ja_star_num_lynga,ja_class_lynga,ja_max_class_lynga,ja_richness_lynga,ja_e_bv_lynga,ref_ja_e_bv_lynga,ja_bv_turnoff_lynga,ref_ja_bv_turnoff_lynga,ly_tr_concent_class_lynga,ly_tr_range_class_lynga,ly_tr_richness_class_lynga,ly_tr_nebulosity_lynga,ly_member_stars_lynga,ly_angular_diameter_lynga,ly_refs_flag_lynga,radvel_weight_lynga,radvel_lynga,radvel_weight_class_lynga,ref_radvel1_lynga,ref_radvel2_lynga,ref_radvel3_lynga,ref_radvel4_lynga,ref_radvel5_lynga,basel_spect_code_lynga,basel_color_type_lynga,neg_ra_tracer_lynga,neg_lii_tracer_lynga,neg_seq_num_tracer_lynga,pos_ra_tracer_lynga,pos_lii_tracer_lynga,pos_seq_num_tracer_lynga,jdl_distance_lynga,jdl_distance_weight_lynga,jdl_turnoff_color_lynga,jdl_age_lynga,jdl_age_weight_lynga,jdl_reddening_lynga,jdl_reddening_flag_lynga,jdl_reddening_weight_lynga,ref_jdl1_lynga,ref_jdl2_lynga,ref_jdl3_lynga,ref_jdl4_lynga,ref_jdl5_lynga,ref_jdl6_lynga
3322,NAME_Trapezium_Cluster,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,TrapeziuM,83.83945,-5.36864,450.0,7.4,47.0,0532-054,209.01,-19.37,C0532-054,30.0,1.0,0.49,0.4,188.0,528.0,269.0,50.0,502.0,,,,,,,0.0,0.0,,N,,10006.0,,,,,15.0,1.0,1.0,1.0,0.06,210.0,-0.2,210.0,,,,,,,0.0,1.0,23.0,5.0,1.0,102.0,,,,0.0,0.0,190.0,238.0,0.0,1159.0,189.0,0.0,455.0,1.0,-0.2,41.0,1.0,0.06,V,1.0,209.0,,,,,


In [18]:
check = OCs.loc[pd.isna(OCs['Simbad_ID'])]
print(len(check['Simbad_ID']))
#check

0


## Check for duplicates in RA and Dec

In [19]:
def getCoord(row):

    if (pd.notna(row['_RAJ2000_cantat-gaudin']) and pd.notna(row['_DEJ2000_cantat-gaudin'])):
        return SkyCoord(ra=row['_RAJ2000_cantat-gaudin']+' hours', dec=row['_DEJ2000_cantat-gaudin']+' degree', frame='icrs')
    
    elif (pd.notna(row['ra_mwsc']) and pd.notna(row['dec_mwsc'])):
        return SkyCoord(ra=row['ra_mwsc']+' hours', dec=row['dec_mwsc']+' degree', frame='icrs')
        
    elif (pd.notna(row['RA[hr]_kharchenko']) and pd.notna(row['Dec[deg]_kharchenko'])):
        return SkyCoord(ra=row['RA[hr]_kharchenko']*units.hourangle, dec=row['Dec[deg]_kharchenko']*units.degree, frame='icrs')

    elif (pd.notna(row['GLON[deg]_piskunov']) and pd.notna(row['GLAT[deg]_piskunov'])):
        return SkyCoord(l=row['GLON[deg]_piskunov']*units.degree, b=row['GLAT[deg]_piskunov']*units.degree, frame='galactic').icrs

    elif (pd.notna(row['ra_lynga']) and pd.notna(row['dec_lynga'])):
        return SkyCoord(ra=row['ra_lynga']*units.degree, dec=row['dec_lynga']*units.degree, frame='icrs')
    
    elif (pd.notna(row['_RA[deg]_vandenbergh']) and pd.notna(row['_Dec[deg]_vandenbergh'])):
        return SkyCoord(ra=row['_Dec[deg]_vandenbergh']*units.degree, dec=row['_Dec[deg]_vandenbergh']*units.degree, frame='icrs')

    elif (pd.notna(row['RA_2000_webda']) and pd.notna(row['Dec_2000_webda'])):
        return SkyCoord(ra=row['RA_2000_webda']+' hours', dec=row['Dec_2000_webda']+' degree', frame='icrs')
            
    elif (pd.notna(row['GLON[deg]_cantat-gaudin']) and pd.notna(row['GLAT[deg]_cantat-gaudin'])):
        return SkyCoord(l=row['GLON[deg]_cantat-gaudin']*units.degree, b=row['GLAT[deg]_cantat-gaudin']*units.degree, frame='galactic').icrs

    elif (pd.notna(row['l_webda']) and pd.notna(row['b_webda'])):
        return SkyCoord(l=row['l_webda']*units.degree, b=row['b_webda']*units.degree, frame='galactic').icrs
    
    else:
        print('NO RA, Dec : ', row['Simbad_ID'])
        return False

In [20]:
#first get the coordinates in lists
RA = []
Dec = []
for index, row in OCs.iterrows():
    #RA and Dec
    c = getCoord(row)
    if (c):
        RA.append(c.ra.degree)
        Dec.append(c.dec.degree)

catalog = SkyCoord(ra = RA*units.degree, dec = Dec*units.degree)

In [21]:
#now match to the full catalog to see if there are duplicates
#I think these are OK overlaps (really different clusters), 
#and now using differnt order in RA and Dec, I don't see most of them below anymore
#!!! OVERLAP NGC_3590 Hogg_12 [0.46110687] 253 2967
#!!! OVERLAP C_0925-549 Ruprecht_77 [0.90177733] 473 3360
#!!! OVERLAP NGC_6997 NGC_6996 [0.91921481] 3238 3250
#!!! OVERLAP Cl_Platais_8 NAME_HIP_67014_Cluster [0.44914403] 3260 3266
#!!! OVERLAP AH03_J0822-36.4 NGC_2579 [0.97738812] 888 3046 #not sure about this; NGC 2579 is labelled as HII region in Simbad
#!!! OVERLAP Cl_Pismis_24 NGC_6357 [0.97546562] 3134 3347 #not sure about this; NGC 6357 is labelled as HII region in Simbad

#should I change Trapezium in lynga to M42? I think yes
#!!! OVERLAP NAME_Trapezium_Cluster M_42 [0.88223261] 3325 2918

max_sep = 1.0 * units.arcmin
nover = 0
for index, row in OCs.iterrows():
    c = getCoord(row)
    idx, d2d, d3d = c.match_to_catalog_sky(catalog, nthneighbor=2) #first neighbor is itself
    #print(index, row['Name'], OCs.iloc[int(idx)]['Name'], d2d.degree)
    if (d2d < max_sep and idx != index):
        print('!!! OVERLAP',row['Simbad_ID'], OCs.iloc[int(idx)]['Simbad_ID'], d2d.arcminute, index, idx)
        nover += 1

print(nover)
        


!!! OVERLAP M_42 NAME_Trapezium_Cluster [0.88223261] 2918 3322
!!! OVERLAP NGC_6997 NGC_6996 [0.91921481] 3213 3225
!!! OVERLAP NGC_6996 NGC_6997 [0.91921481] 3225 3213
!!! OVERLAP NAME_Trapezium_Cluster M_42 [0.88223261] 3322 2918
4
