# REDO AGE, MASS PLOTS

# Read in all the files, and make some plots

*Andrew Bowen provided some of the script to read in the file. [See his GitHub repo](https://github.com/andrewbowen19/CEB_Project)*

Some alterations to files:
- changed Pismis 6 to NGC 2645 in MWSC, Piskunov and Kharchenko
- Luginbuhl-Skiff_1 is Skiff_J0614+12.9 (?) changed in WEBDA
- Skiff_2 is Skiff_J0458+43.0 (?) changed in WEBDA
- PTB 9 is a planetary nebula (not an OC --though maybe in the OC NGC 7762?) -- changed label in MWSC, removed from Kharchenko
- Collinder_258 is Harvard_5 (same PM, distance, etc.), both are in Gaia table, removed Harvard_5
- FSR_1686 is Juchert_10, changed to Juchert 10 in MWSC and Kharchenko
- changed many BH or VBH, etc. to vdBergh-Hagen
- removed Berkeley 42 from vandenbergh and Salaris because it's a GC (and included in Harris)
- removed Mrk 38 from vandenbergh because it's a pair of galaxies
- removed Mrk 50 from vandenbergh because it's a Seyfert 1 galaxy
- vdBergh-Hagen_133 is Collinder_258 is Harvard 5,removed vdBergh-Hagen 133 in lynga
- Berkeley_30 is Biurakan_9, changed to Berkeley 30 in lynga
- Berkeley_32 Biurakan_8, changed to Berkeley 32 in lynga
- removed NGC 281 from lunga (HII region)
- not sure about NGC_2579 and AH03_J0822-36.4 (possible overlap)
- removed vdBergh-Hagen_1 from vandenberg = reflection nebula
- Cl VDBH 47 is IC 2395, remove vdBergh-Hagen 47 from WEBDA and lynga
- vdBergh-Hagen_218 might be NGC_6318 , removed vdBergh-Hagen_218 from lynga

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
import sys
import time
import re

from astropy.coordinates import SkyCoord
from astropy import units 

from astroquery.simbad import Simbad
Simbad.TIMEOUT = 300 # sets the timeout to 60s

import warnings
warnings.filterwarnings('ignore', category=UserWarning, append=True)

import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 50)

%matplotlib inline



In [4]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [17]:
def OLDmatchToSimbad(cat_ID, cat_RA, cat_Dec, gal=False, max_sep = 2.0 * units.arcmin, str_overlap = 0.8, fillCat = True):
    #I'd like to use the Simbad names for matching (if possible) so I reduce the duplicates, 
    #but I need to match to coords also !

    if (gal):
        catalog = SkyCoord(l = cat_RA, b = cat_Dec, frame='galactic').icrs
    else:
        catalog = SkyCoord(ra = cat_RA, dec = cat_Dec, frame='icrs')

    match_ID = np.array([str(x) for x in cat_ID], dtype='object')
    return_ID = np.full(len(match_ID), np.nan, dtype='object')

    #fix the names
    #is this correct? : .replace('vdBergh','CL VDBH')
    for i, x in enumerate(match_ID):
        match_ID[i] = x.replace('vdBergh-Hagen_','Cl_VDBH_')\
                        .replace('vdBergh_','Cl_VDB_')\
                        .replace('FSR_','[FSR2007]_')\
                        .replace('DBSB_','[DBS2003]_')\
                        .replace('BDSB_','[BDS2003]_')\
                        .replace('BDS_','[BDS2003]_')\
                        .replace('Alessi_','Cl_Alessi_')\
                        .replace('Bica_','Cl_Bica_')\
                        .replace('Sher_','Cl_Sher_')\
                        .replace('MWSC_','[KPS2012]_MWSC_')\
                        .replace('Schuster_1','NAME_SCHUSTER_CL')\
                        .replace('ASCC_','[KPR2005]_')\
                        .replace('Andrews-Lindsay_', '[AL67]_Cl*')\
                        .replace('Arp-Madore', 'AM')\
                        .replace('LDN_988e', 'NAME_[C86]_L988_e_Cluster')\
                        .replace('Juchert-Saloran_','Juchert-Saloranta_')\
                        .replace('Ivanov', '[IBP2002]_CC')\
                        .replace('DB2001_','[DB2001]_Cl ')\
                        .replace('Cl_Cl','Cl')\
                        .replace('[KPS2012]_[KPS2012]','[KPS2012]')\

#     result_table = Simbad.query_objects(match_ID) #stupidly, this does not return blank rows for missing data!
# below I tried matching by RA and Dec, but I'm finding that (according to authors) there are quite a few 
# unique clusters, that are very close together, and it's not clear how to get avoid mismatches.
# If it returned blank lines, then this wouldn't be an issue!!
# So I'm trying to go one line at a time, but of course Simbad doesn't like that... it rejects my connection when
# I have too many requests (and what is too many?  and how long should I wait?)
    for i, ID in enumerate(match_ID):
        result_table = Simbad.query_object(ID)
        if (result_table):
            #print(result_table)
            if (len(result_table) > 0):
                row = result_table[0]
                newID =  re.sub('\s+', ' ', row['MAIN_ID'].decode("utf-8")).strip().replace(' ','_' )
                return_ID[i] = newID
        print(f'{i} {float(i)/len(match_ID):5.3f}, {ID} {return_ID[i]} -- ', end='')
        if (i>0 and i % 50 == 0):
            time.sleep(30)
            print('\n\n')
        time.sleep(0.2)


#     #print(result_table)
    
#     dist = [max_sep for x in match_ID]
#     for row in result_table:
#         c = SkyCoord(ra = row['RA'] + ' hour', dec = row['DEC'] + ' degree', frame='icrs')
#         idx, d2d, d3d = c.match_to_catalog_sky(catalog, nthneighbor=1)
#         newID =  re.sub('\s+', ' ', row['MAIN_ID'].decode("utf-8")).strip().replace(' ','_' )
#         if(~pd.isna(match_ID[idx])): #take only the nearest one (in case there are multiple matches)
#             if (d2d < dist[idx] or similar(newID, match_ID[idx]) > str_overlap):
#                 dist[idx] = d2d
#                 return_ID[idx] = newID

    #search by coordinates? no because I can't get the type of object from this query
    for i, ID in enumerate(return_ID):
        #print(ID)
        #if (pd.isna(ID)):
        #    print('No Simbad match', match_ID[i], ID)
        #    return_table = Simbad.query_region(catalog[i], radius=max_sep)
        #    return_table.pprint_all()
            
        if (pd.isna(ID) and fillCat):
            return_ID[i] = match_ID[i]
        
    return return_ID

In [22]:
def matchToSimbad(cat_ID, fillCat = True):
    #I'd like to use the Simbad names for matching (if possible) so I reduce the duplicates, 

    match_ID = np.array([str(x) for x in cat_ID], dtype='object')
    return_ID = np.full(len(match_ID), np.nan, dtype='object')

    #fix the names
    #is this correct? : .replace('vdBergh','CL VDBH')
    for i, x in enumerate(match_ID):
        match_ID[i] = x.replace('vdBergh-Hagen_','Cl_VDBH_')\
                        .replace('vdBergh_','Cl_VDB_')\
                        .replace('FSR_','[FSR2007]_')\
                        .replace('DBSB_','[DBS2003]_')\
                        .replace('BDSB_','[BDS2003]_')\
                        .replace('BDS_','[BDS2003]_')\
                        .replace('Alessi_','Cl_Alessi_')\
                        .replace('Bica_','Cl_Bica_')\
                        .replace('Sher_','Cl_Sher_')\
                        .replace('MWSC_','[KPS2012]_MWSC_')\
                        .replace('Schuster_1','NAME_SCHUSTER_CL')\
                        .replace('ASCC_','[KPR2005]_')\
                        .replace('Andrews-Lindsay_', '[AL67]_Cl*')\
                        .replace('Arp-Madore', 'AM')\
                        .replace('LDN_988e', 'NAME_[C86]_L988_e_Cluster')\
                        .replace('Juchert-Saloran_','Juchert-Saloranta_')\
                        .replace('Ivanov', '[IBP2002]_CC')\
                        .replace('DB2001_','[DB2001]_Cl ')\
                        .replace('Cl_Cl','Cl')\
                        .replace('[KPS2012]_[KPS2012]','[KPS2012]')\

#     result_table = Simbad.query_objects(match_ID) #stupidly, this does not return blank rows for missing data!
# below I tried matching by RA and Dec, but I'm finding that (according to authors) there are quite a few 
# unique clusters, that are very close together, and it's not clear how to get avoid mismatches.
# If it returned blank lines, then this wouldn't be an issue!!
# So I'm trying to go one line at a time, but of course Simbad doesn't like that... it rejects my connection when
# I have too many requests (and what is too many?  and how long should I wait?)
    for i, ID in enumerate(match_ID):
        result_table = Simbad.query_object(ID)
        if (result_table):
            #print(result_table)
            if (len(result_table) > 0):
                row = result_table[0]
                newID =  re.sub('\s+', ' ', row['MAIN_ID'].decode("utf-8")).strip().replace(' ','_' )
                return_ID[i] = newID
        print(f'{i} {float(i)/len(match_ID):5.3f}, {ID} {return_ID[i]} -- ', end='')
        if (i>0 and i % 100 == 0):
            time.sleep(30)
            print('\n\n')
        time.sleep(0.2)

    #fill nans with the original catalog search input?
    if (fillCat):
        for i, ID in enumerate(return_ID):
            if (pd.isna(ID)):
                return_ID[i] = match_ID[i]
        
    return return_ID

In [18]:
fixMWSC = True
fixWEBDA = True
fixPiskunov = True
fixKharchenko = True
fixSalaris = True
fixVandenBergh = True
fixGaia = True
fixLynga = True

In [19]:
#MWSC - Milky Way Star Clusters Catalog
#https://heasarc.gsfc.nasa.gov/W3Browse/all/mwsc.html
if (fixMWSC):
    mwsc_df = pd.read_csv("MWSC.txt", header=3, delimiter='|')
    mwsc_df.columns = mwsc_df.columns.str.strip()

    
    #take only the open clusters
    mwsc_df = mwsc_df.loc[(~mwsc_df['class'].str.strip().str.contains('GLOBULAR')) &
                          (~mwsc_df['class'].str.strip().str.contains('NEBULA')) &
                          (~mwsc_df['class'].str.strip().str.contains('UNIDENTIFIED'))]

    #remove the "Unnamed" columns
    mwsc_df = mwsc_df.loc[:, ~mwsc_df.columns.str.contains('^Unnamed')]

    print(set(mwsc_df['class'].values))
    
    #fix the IDs
    mwsc_ID = mwsc_df['Name']
    mwsc_ID = mwsc_ID.str.strip().str.replace( ' ','_' )
    mwsc_df['Name'] = mwsc_ID

    #add Simbad names for matching (if possible) so I reduce the duplicates 
    RA = [x + ' hour' for x in mwsc_df['ra']]
    Dec = [x + 'degree' for x in mwsc_df['dec']]
    ID = [x for x in mwsc_df['Name']]
    simbad_ID = matchToSimbad(mwsc_ID, RA, Dec)
    
    mwsc_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)

    #check for duplicates
    for index, row in  mwsc_df.iterrows():
        check = mwsc_df.loc[mwsc_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('MWSC', row['Simbad_ID'], row['Name'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    mwsc_df.to_csv('MWSC_clean.csv', index=False)
    

mwsc_df = pd.read_csv('MWSC_clean.csv')
mwsc_df

{'                                  OPEN STAR CLUSTER', '                          OB ASSOCIATION/HII REGION'}


0 0.000, [KPS2012]_MWSC_4688 [KPS2012]_MWSC_4688 -- 1 0.000, [KPS2012]_MWSC_5684 nan -- 2 0.001, [KPS2012]_MWSC_5692 nan -- 3 0.001, [KPS2012]_MWSC_4005 [KPS2012]_MWSC_4005 -- 4 0.001, [KPS2012]_MWSC_4176 [KPS2012]_MWSC_4176 -- 5 0.002, ESO_008-06 ESO_8-6 -- 6 0.002, [KPS2012]_MWSC_4219 [KPS2012]_MWSC_4219 -- 7 0.002, [KPS2012]_MWSC_5575 nan -- 8 0.003, [KPS2012]_MWSC_4682 [KPS2012]_MWSC_4682 -- 9 0.003, [KPS2012]_MWSC_5685 nan -- 10 0.003, [KPS2012]_MWSC_5681 nan -- 11 0.004, [FSR2007]_1626 [KPS2012]_MWSC_2014 -- 12 0.004, [FSR2007]_1631 [KPS2012]_MWSC_2064 -- 13 0.004, [FSR2007]_1629 [KPS2012]_MWSC_2049 -- 14 0.005, [KPS2012]_MWSC_5688 nan -- 15 0.005, ESO_026-02 ESO_26-2 -- 16 0.006, [KPS2012]_MWSC_4137 [KPS2012]_MWSC_4137 -- 17 0.006, Melotte_227 Cl_Melotte_227 -- 18 0.006, [KPS2012]_MWSC_5679 nan -- 19 0.007, [KPS2012]_MWSC_5572 nan -- 20 0.007, [KPS2012]_MWSC_5749 nan -- 21 0.007, ESO_021-06 ESO_21-6 -- 22 0.008, [KPS2012]_MWSC_5672 nan -- 23 0.008, [KPS2012]_MWSC_4114 [KPS2012]_

209 0.072, Loden_1101 nan -- 210 0.072, ESO_137-43 ESO_137-43 -- 211 0.073, Kronberger_39 Kronberger_39 -- 212 0.073, ESO_137-23 ESO_137-23 -- 213 0.073, Bochum_12 C_1055-614 -- 214 0.074, Loden_1409 nan -- 215 0.074, Ruprecht_92 Cl_Ruprecht_92 -- 216 0.074, [DBS2003]_77 [DBS2003]_77 -- 217 0.075, NGC_3766 NGC_3766 -- 218 0.075, Trumpler_11 C_1003-613 -- 219 0.075, [DBS2003]_80 [DBS2003]_80 -- 220 0.076, Ruprecht_105 Cl_Ruprecht_105 -- 221 0.076, [FSR2007]_1668 [KPS2012]_MWSC_2180 -- 222 0.076, [DBS2003]_66 [DBS2003]_66 -- 223 0.077, Cl_VDBH_110 C_1105-612 -- 224 0.077, [FSR2007]_1666 [KPS2012]_MWSC_2167 -- 225 0.077, Loden_1256 nan -- 226 0.078, Ruprecht_93 Cl_Ruprecht_93 -- 227 0.078, Hogg_17 Cl_Hogg_17 -- 228 0.078, Collinder_272 Cl_Collinder_272 -- 229 0.079, Lynga_2 Cl_Lynga_2 -- 230 0.079, [DBS2003]_62 [DBS2003]_62 -- 231 0.079, Loden_481 nan -- 232 0.080, Ruprecht_161 C_1007-609 -- 233 0.080, Shorlin_2 nan -- 234 0.080, NGC_4103 NGC_4103 -- 235 0.081, Shorlin_1 NAME_Shorlin_1 --

428 0.147, [DBS2003]_125 [DBS2003]_125 -- 429 0.148, IC_2488 IC_2488 -- 430 0.148, [FSR2007]_1701 [KPS2012]_MWSC_2296 -- 431 0.148, Turner_7 Cl_Turner_7 -- 432 0.149, ESO_175-06 ESO_175-6 -- 433 0.149, Ruprecht_114 C_1602-567 -- 434 0.149, [KPR2005]_61 [KPR2005]_61 -- 435 0.150, [FSR2007]_1569 [KPS2012]_MWSC_1935 -- 436 0.150, [FSR2007]_1602 [KPS2012]_MWSC_2008 -- 437 0.150, Loden_89 nan -- 438 0.151, Loden_112 nan -- 439 0.151, [DBS2003]_93 [DBS2003]_93 -- 440 0.151, Lynga_5 Cl_Lynga_5 -- 441 0.152, [FSR2007]_1521 [FSR2007]_1521 -- 442 0.152, NGC_5662 NGC_5662 -- 443 0.152, Loden_27 nan -- 444 0.153, Cl_VDBH_78 C_0942-563 -- 445 0.153, [FSR2007]_1697 [KPS2012]_MWSC_2274 -- 446 0.153, NGC_5999 NGC_5999 -- 447 0.154, Basel_20 C_0931-561 -- 448 0.154, Ruprecht_165 C_1225-561 -- 449 0.154, NGC_3033 NGC_3033 -- 450 0.155, Cl_VDBH_92 Cl_VDBH_92 -- 451 0.155, Loden_189 nan -- 452 0.155, [DBS2003]_141 [DBS2003]_141 -- 453 0.156, [FSR2007]_1648 [KPS2012]_MWSC_2109 -- 454 0.156, Ruprecht_75 C_0

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

### Downloaded from WEBDA [here](https://webda.physics.muni.cz/cluster_selall.html)

With RA from 0 to 24 and 0 to 1e6 stars. I copied the table to WEBDA.html, and removed the $<$br$>$ entries, then converted to csv with 

https://codepen.io/malahovks/pen/gLxLWX

or

https://jsfiddle.net/gengns/j1jm2tjx/

Finally, I separated the RA and DEC column header into 2 entries.

*I also have a data file from David James, that has more clusters, but I'm not sure the providence of that, so I won't use it.*

In [27]:
# # WEBDA data file (2013)
# webda_df = pd.read_fwf("WEBDA-OC-table-June2013_DavidJames.txt", 
#                        widths = [18,14,15,11,9,8,8,8,9,6,9,9,9,7,7,9], header = 0)

if (fixWEBDA):
    webda_df = pd.read_csv('WEBDA.csv')
    #fix the IDs
    webda_ID = webda_df['Name']
    webda_ID = webda_ID.str.replace( 'NGC 0','NGC ' ).str.replace( ' ','_' )

    webda_df['Name'] = webda_ID

    #add Simbad names for matching (if possible) so I reduce the duplicates 
    RA = [x + ' hour' for x in webda_df['RA_2000']]
    Dec = [x + 'degree' for x in webda_df['Dec_2000']]
    simbad_ID = matchToSimbad(webda_ID, RA, Dec)
    webda_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
        
    #check for duplicates  
    for index, row in  webda_df.iterrows():
        check = webda_df.loc[webda_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('WEBDA', row['Simbad_ID'], row['Name'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    webda_df.to_csv('WEBDA_clean.csv', index=False)
    
webda_df = pd.read_csv('WEBDA_clean.csv')
webda_df

Unnamed: 0,Simbad_ID,Name,RA_2000,Dec_2000,l,b,Dist,Mod,EB-V,Age,ST,Z,Diam,Fe/H,MRV,pm RA,pm Dec,Measures,Stars
0,Cl_Berkeley_58,Berkeley_58,00 00 12,+60 58 00,116.753,-1.289,3715.0,14.55,0.550,8.400,,-83.6,5.0,,,,,525,519
1,Cl_Stock_18,Stock_18,00 01 37,+64 37 30,117.624,2.268,2800.0,14.41,0.700,6.780,B0,110.8,6.0,,,,,2261,2261
2,Cl_Berkeley_59,Berkeley_59,00 02 13,+67 25 11,118.220,5.000,1000.0,13.78,1.220,6.800,,87.2,20.4,,-6.50,-4.40,0.73,27,21
3,Cl_Blanco_1,Blanco_1,00 04 07,-29 50 00,15.572,-79.261,269.0,7.18,0.010,7.796,B5,-264.3,70.0,0.23,,20.17,3.00,109,105
4,[KPR2005]_1,ASCC_1,00 09 35,+62 40 48,118.150,0.190,4000.0,13.51,0.160,8.250,,13.3,24.0,,-76.15,-2.07,0.46,32,32
5,Cl_Berkeley_1,Berkeley_1,00 09 36,+60 28 30,117.796,-1.979,2420.0,14.35,0.780,8.600,,-83.6,5.0,,,,,2800,2800
6,C_0007+609,King_13,00 10 06,+61 10 00,117.968,-1.306,3100.0,15.00,0.820,8.500,,-70.7,5.0,,,,,4253,3955
7,[KPR2004b]_4,Alessi_20,00 10 33,+58 45 35,117.640,-3.690,450.0,8.95,0.220,8.220,,-29.0,36.0,,,7.48,-2.61,42,42
8,[KPR2005]_2,ASCC_2,00 19 51,+55 42 35,118.460,-6.890,1200.0,10.71,0.100,8.830,,-144.0,36.0,,,-0.91,-3.94,57,57
9,NAME_Cl_Mayer_1,Mayer_1,00 21 54,+61 44 24,119.440,-0.930,1429.0,12.02,0.400,7.740,,-23.2,24.0,,-20.90,-5.27,-5.87,15146,15131


In [28]:
#Piskunov (2008)
if (fixPiskunov):
    piskunov_df = pd.read_fwf("Piskunov2008.table", 
                              widths = [6,18,7,7,8,6,6,6,6,6,9,9,6,6,9,9], header = None,
                              names = ['COCD','Name','GLON[deg]','GLAT[deg]','DistMod','E(B-V)','Dist[pc]',\
                                       'logt[yr]','rt[pc]','e_rt[pc]','logM[MSun]','e_logM[MSun]','rtA[pc]','e_rtA[pc]',
                                       'logMA[MSun]','e_logMA[MSun]'])

    piskunov_df.replace(-9.999,np.nan, inplace=True)
    piskunov_df.replace(-9.9,np.nan, inplace=True)

    #fix the IDs
    piskunov_ID = piskunov_df['Name']
    piskunov_ID = piskunov_ID.str.strip().str.replace(' ','_' )
    piskunov_df['Name'] = piskunov_ID
    
    #add Simbad names for matching (if possible) so I reduce the duplicates 
    l = [x*units.degree for x in piskunov_df['GLON[deg]']]
    b = [x*units.degree for x in piskunov_df['GLAT[deg]']]
    simbad_ID = matchToSimbad(piskunov_ID, l, b, gal=True)
    piskunov_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)


    #check for duplicates
    for index, row in piskunov_df.iterrows():
        check = piskunov_df.loc[piskunov_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('Piskunov', row['Simbad_ID'], row['Name'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    piskunov_df.to_csv('Piskunov2008_clean.csv', index=False)

piskunov_df = pd.read_csv('Piskunov2008_clean.csv')
piskunov_df

Unnamed: 0,Simbad_ID,COCD,Name,GLON[deg],GLAT[deg],DistMod,E(B-V),Dist[pc],logt[yr],rt[pc],e_rt[pc],logM[MSun],e_logM[MSun],rtA[pc],e_rtA[pc],logMA[MSun],e_logMA[MSun]
0,Cl_Berkeley_58,1,Berkeley_58,116.73,-1.29,14.555,0.55,3715,8.20,,,,,22.9,10.9,3.380,0.623
1,Cl_Berkeley_59,2,Berkeley_59,118.22,5.00,13.782,1.22,1000,6.80,,,,,8.0,3.5,2.221,0.564
2,Cl_Blanco_1,3,Blanco_1,14.17,-79.02,7.180,0.01,269,8.32,22.8,3.8,3.646,0.219,20.0,2.4,3.480,0.160
3,[KPR2004b]_4,4,Alessi_20,117.64,-3.69,8.948,0.22,450,8.22,5.4,1.6,1.742,0.391,4.0,0.8,1.362,0.250
4,NAME_Cl_Mayer_1,5,Mayer_1,119.44,-0.93,12.015,0.40,1429,7.74,,,,,16.7,5.6,3.150,0.442
5,Cl_Stock_20,6,Stock_20,119.92,-0.10,10.413,0.20,909,8.53,,,,,6.6,1.3,1.974,0.260
6,C_0027+577,7,Stock_21,120.05,-4.83,11.447,0.40,1100,8.72,,,,,8.3,2.4,2.259,0.376
7,NGC_129,8,NGC_129,120.27,-2.54,12.759,0.55,1625,7.87,14.9,2.7,2.984,0.240,15.2,2.7,3.011,0.235
8,NGC_146,9,NGC_146,120.87,0.50,13.897,0.48,3032,7.37,,,,,20.8,7.1,3.298,0.445
9,NGC_225,10,NGC_225,122.01,-1.08,9.925,0.27,657,8.19,,,,,5.7,1.3,1.787,0.311


In [29]:
#Kharchenko (2013)
if (fixKharchenko):
    kharchenko_df = pd.read_fwf("Kharchenko2013.table", 
                              widths = [5,18,2,1,9,8,8,8,7,7,7,7,7,7,8,8,6,6,7,7,8,7,7,7,7,7,7,7,4,8,8,8,8,8,8,5,4,8,7,4], 
                              header = None,
                              names = ['MWSC','Name','Type','n_Type','RA[hr]','Dec[deg]','GLON[deg]','GLAT[deg]',
                                       'r0[deg]','r1[deg]','r2[deg]','pmRA[mas/yr]','pmDec[mas/yr]','e_pm[mas/yr]',
                                       'RV[km/s]','e_RV[km/s]','n_RV[km/s]','N1sr0','N1sr1','N1sr2','d[pc]','E(B-V)',
                                       'appDistMod[mag]','E(J-Ks)','E(J-H)','dH','logt[yr]','e_logt[yr]','Nt','rc[pc]',
                                       'e_rc[pc]','rt[pc]','e_rt[pc]','k[pc-2]','e_k[pc-2]','Src','SType','[Fe/H][Sun]',
                                       'e_[Fe/H][Sun]','n_[Fe/H]'])

    kharchenko_df['RV[km/s]'].replace(999.99,np.nan, inplace=True)
    kharchenko_df['e_RV[km/s]'].replace(99.99,np.nan, inplace=True)
    kharchenko_df['e_logt[yr]'].replace(0.000,np.nan, inplace=True)
    kharchenko_df['Nt'].replace(-1,np.nan, inplace=True)
    kharchenko_df['rc[pc]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['e_rc[pc]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['rt[pc]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['e_rt[pc]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['k[pc-2]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['e_k[pc-2]'].replace(0.00,np.nan, inplace=True)
    kharchenko_df['[Fe/H][Sun]'].replace(99.999,np.nan, inplace=True)
    kharchenko_df['e_[Fe/H][Sun]'].replace(9.99,np.nan, inplace=True)
    kharchenko_df['n_[Fe/H]'].replace(0.1,np.nan, inplace=True)

    #kharchenko_df.loc[kharchenko_df['Name'] == 'Skiff_J0458+43.0']
    
    #add Simbad names for matching (if possible) so I reduce the duplicates 
    kharchenko_ID = kharchenko_df['Name']
    RA = [x*units.hourangle for x in kharchenko_df['RA[hr]']]
    Dec = [x*units.degree for x in kharchenko_df['Dec[deg]']]
    simbad_ID = matchToSimbad(kharchenko_ID, RA, Dec)
    kharchenko_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
    
    kharchenko_df.to_csv('Kharchenko2013_clean.csv', index=False)

kharchenko_df = pd.read_csv('Kharchenko2013_clean.csv')
kharchenko_df

Unnamed: 0,Simbad_ID,MWSC,Name,Type,n_Type,RA[hr],Dec[deg],GLON[deg],GLAT[deg],r0[deg],r1[deg],r2[deg],pmRA[mas/yr],pmDec[mas/yr],e_pm[mas/yr],RV[km/s],e_RV[km/s],n_RV[km/s],N1sr0,N1sr1,N1sr2,d[pc],E(B-V),appDistMod[mag],E(J-Ks),E(J-H),dH,logt[yr],e_logt[yr],Nt,rc[pc],e_rc[pc],rt[pc],e_rt[pc],k[pc-2],e_k[pc-2],Src,SType,[Fe/H][Sun],e_[Fe/H][Sun],n_[Fe/H]
0,Cl_Berkeley_58,1,Berkeley_58,,,0.0045,60.933,116.750,-1.326,0.025,0.087,0.155,0.56,1.56,0.26,,,0,16,88,197,2700.0,0.720,12.389,0.346,0.231,0.000,8.470,0.047,10.0,1.12,0.25,13.66,3.31,5.19,1.00,COCD,,,,0
1,NGC_7801,2,NGC_7801,,,0.0055,50.727,114.717,-11.331,0.015,0.070,0.156,-3.20,-3.47,0.71,,,0,2,14,65,1953.0,0.146,11.500,0.070,0.047,0.000,9.255,,1.0,0.61,0.33,9.93,6.14,2.67,1.44,DIAS,,,,0
2,[KPS2012]_MWSC_0003,3,FSR_0459,,,0.0085,59.242,116.446,-2.990,0.018,0.055,0.090,-1.66,-0.01,0.53,,,0,3,24,50,2926.0,1.145,12.700,0.550,0.367,0.000,7.800,,,0.39,0.21,7.65,4.63,5.62,3.53,DIAS,irc,,,0
3,Cl_Stock_18,4,Stock_18,,,0.0265,64.625,117.617,2.266,0.010,0.050,0.080,-3.59,-1.15,0.54,,,0,2,20,32,774.0,0.177,9.501,0.085,0.057,-0.030,8.680,,1.0,0.16,0.03,2.14,0.38,355.57,56.43,DIAS,,,,0
4,Cl_Berkeley_59,5,Berkeley_59,,,0.0373,67.425,118.219,5.001,0.035,0.115,0.220,-3.20,-1.11,0.38,-12.50,7.08,3,16,45,89,1000.0,1.241,10.399,0.596,0.398,-0.040,6.100,,,0.55,0.06,6.51,0.74,153.96,12.16,COCD,,,,0
5,Cep_OB4,6,Cep_OB4,ao,,0.0490,67.500,118.299,5.062,0.120,0.345,0.760,-0.93,-2.58,0.21,0.00,,1,61,198,654,850.0,1.099,10.001,0.528,0.352,-0.030,6.100,,,9.54,2.64,13.56,1.39,16.73,10.54,MELN,ass,,,0
6,Cl_Blanco_1,7,Blanco_1,,,0.0590,-30.000,14.830,-79.098,0.500,1.400,2.350,19.71,2.28,0.18,5.48,2.04,24,82,214,266,250.0,0.012,6.994,0.006,0.004,-0.030,7.750,,,2.82,0.38,10.93,1.26,24.84,2.43,COCD,,-0.188,0.098,7
7,Cl_Berkeley_104,8,Berkeley_104,,,0.0568,63.580,117.615,1.202,0.018,0.055,0.135,-4.57,-4.37,0.44,,,0,8,34,136,3599.0,0.606,12.976,0.291,0.194,-0.005,8.805,0.031,15.0,1.13,0.27,14.92,3.92,4.51,0.99,DIAS,,0.070,,0
8,IRAS_00013+6817,9,IRAS_00013+6817,,,0.0664,68.565,118.595,6.091,0.010,0.055,0.090,-3.52,-1.55,0.87,,,0,2,9,23,1338.0,0.833,10.900,0.400,0.267,-0.020,7.150,,,0.73,0.33,5.69,2.54,1.69,0.61,BIEM,irc,,,0
9,[KPS2012]_MWSC_0014,14,FSR_0504,,,0.0970,81.840,121.227,19.116,0.010,0.045,0.080,-3.57,7.83,0.68,,,0,2,10,14,3744.0,0.104,12.900,0.050,0.033,-0.020,9.450,,,0.49,0.19,10.62,4.26,5.03,2.46,FPOS,,,,0


In [30]:
# Salaris (2004) -- no RA or Dec...
if (fixSalaris):
    #salaris_df = pd.read_csv('Salaris2004_viaWEBDA.txt', sep='\t', header = 15)
    salaris_df = pd.read_csv('Salaris2004_table1.txt', sep='\t')
    #print(salaris_df)

    #change the Hyades to Melotte 25 so that it matches with van den Bergh for position
    names = salaris_df['Name'].values
    xx = np.where(names == 'Hyades')
    names[xx] = 'Melotte_25'
    salaris_df['Name'] = [x.strip().replace(' ','_' ) for x in names]
    xx = np.where(names == 'Arp-Madore_2')
    names[xx] = 'AM_2'  

    #no RA and Dec, so can't use the function above
    #this is missing 1 name.  I will look manually... it's Arp-Madore_2', fixed above
    result_table = Simbad.query_objects(names)
    print(len(names), len(result_table))
    #print(names, '\n', result_table['MAIN_ID'].data)
    simbad_ID = names
    for i, row in enumerate(result_table):
        simbad_ID[i] =  re.sub('\s+', ' ', row['MAIN_ID'].decode("utf-8")).strip().replace(' ','_' )
    salaris_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
    
    #check for duplicates
    for index, row in salaris_df.iterrows():
        check = salaris_df.loc[salaris_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('Salaris', row['Simbad_ID'], row['Name'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
            
    salaris_df.to_csv('Salaris2004_table1_clean.csv', index=False)

salaris_df = pd.read_csv('Salaris2004_table1_clean.csv')
salaris_df

70 70


Unnamed: 0,Simbad_ID,Name,dV,err_dV,[Fe/H],err_[Fe/H],t[Gyr],err_t,Rgc[kpc],z[pc],flag,tJP94[Gyr]
0,Cl_King_2,Cl_King_2,2.2,0.15,0.0,0.2,5.03,1.31,12.98,-510,2,5.6
1,IC_166,IC_166,1.0,0.25,-0.27,0.15,1.32,0.43,10.74,-10,1,1.5
2,NGC_752,NGC_752,0.9,0.05,-0.09,0.06,1.24,0.2,8.75,-145,1,1.4
3,Cl_Berkeley_66,Cl_Berkeley_66,2.0,0.25,0.0,0.2,3.98,1.52,12.59,20,2,4.4
4,NGC_1193,NGC_1193,2.1,0.15,-0.35,0.11,4.23,1.08,12.0,-845,1,4.9
5,C_0311+525,C_0311+525,0.4,0.15,-0.3,0.15,0.76,0.16,10.34,-163,2,0.9
6,NGC_1245,NGC_1245,0.7,0.15,0.1,0.15,1.06,0.23,11.09,-465,1,1.0
7,NGC_1798,NGC_1798,1.0,0.15,-0.47,0.15,1.28,0.29,11.79,290,2,1.5
8,NGC_1817,NGC_1817,0.8,0.05,-0.1,0.09,1.12,0.18,10.26,-410,1,1.3
9,Cl_Berkeley_17,Cl_Berkeley_17,2.8,0.15,-0.33,0.15,10.06,2.77,10.89,-155,1,12.6


In [31]:
#van den Bergh (2006)
#there were two rows for Berkeley 69, with slightly different values.  I kept the first one.
if (fixVandenBergh):
    vandenbergh_df = pd.read_csv('vandenbergh2006.tsv', sep='|', header = 49)

    #fix the names
    names = vandenbergh_df['SimbadName'].values
    def representsInt(s):
        try: 
            int(s)
            return True
        except ValueError:
            return False

    for i,x in enumerate(names):
        if (x[0:2] == 'Cl'):
            names[i] = x[2:]
        if (x[0:1] == 'N' and representsInt(x[1:1])):
            names[i] = 'NGC '+x[1:]
        names[i] = names[i].strip().replace('  ',' ').replace( ' ','_' )

    vandenbergh_df['Name'] = names
    
    #add Simbad names for matching (if possible) so I reduce the duplicates 
    vandenbergh_ID = vandenbergh_df['Name']
    RA = [x*units.degree for x in vandenbergh_df['_RA[deg]']]
    Dec = [x*units.degree for x in vandenbergh_df['_Dec[deg]']]
    simbad_ID = matchToSimbad(vandenbergh_ID, RA, Dec)
    vandenbergh_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
    
    #check for duplicates
    for index, row in vandenbergh_df.iterrows():
        check = vandenbergh_df.loc[vandenbergh_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('van den Bergh', row['Simbad_ID'], row['Name'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
        
    vandenbergh_df.to_csv('vandenbergh2006_clean.csv', index=False)

vandenbergh_df = pd.read_csv('vandenbergh2006_clean.csv')
vandenbergh_df

Unnamed: 0,Simbad_ID,Seq,Name,l[deg],Diam[pc],R[pc],Z[pc],E(B-V),logT[yr],SimbadName,_RA[deg],_Dec[deg]
0,Trumpler_31,1,Trumpler_31,2,1.43,986,-39,0.35,8.87,Trumpler_31,269.97500,-28.18333
1,NGC_6520,2,NGC_6520,2,2.29,1577,-78,0.43,7.72,NGC_6520,270.85000,-27.88333
2,NGC_6530,3,NGC_6530,6,5.42,1330,-31,0.33,6.87,NGC_6530,271.12500,-24.36667
3,Cl_Bochum_14,4,Bochum_14,6,0.34,578,-5,1.51,7.00,Bochum_14,270.50000,-23.70000
4,NGC_6514,5,NGC_6514,7,6.65,816,-4,0.19,7.37,NGC_6514,270.59583,-23.03000
5,NGC_6546,6,NGC_6546,7,3.82,938,-23,0.49,7.85,NGC_6546,271.90000,-23.33333
6,M_21,7,NGC_6531,7,2.91,1205,-7,0.28,7.07,NGC_6531,271.05000,-22.48333
7,M_23,8,NGC_6494,9,5.30,628,31,0.36,8.48,NGC_6494,269.25000,-18.98333
8,Cl_Trumpler_33,10,Trumpler_33,12,2.55,1755,-99,0.36,7.68,Trumpler_33,276.17500,-19.71667
9,Cl_Collinder_469,11,Collinder_469,12,1.29,1481,-20,0.42,7.80,Collinder_469,274.10000,-18.21667


In [32]:
#Gaia DR2 from Cantat-Gaudin+, 2018
if (fixGaia):
    gaiaDR2_df = pd.read_csv('Cantat-Gaudin2018_GaiaDR2.tsv', sep='|', header = 56)

    names = gaiaDR2_df['Name']
    names = names.str.strip()

    for index, row in  gaiaDR2_df.iterrows():
    #for i,n in enumerate(names):
        if (names[index][0:3] == "ESO"):
            names[index] = row['SimbadName'].replace(' ','_')

    gaiaDR2_df['Name'] = names

    #this already contains the Simbad name, but they are not identical to what I'm finding 
    gaiaDR2_SID = gaiaDR2_df['SimbadName'].str.replace(' ','_').replace('Name','NAME').values
    #idx = gaiaDR2_df.index[pd.isnull(gaiaDR2_df['SimbadName'])].tolist()
    #gaiaDR2_SID[idx] = names[idx]
    RA = [x + ' hour' for x in gaiaDR2_df['_RAJ2000']]
    Dec = [x + ' degree' for x in gaiaDR2_df['_DEJ2000']]
    simbad_ID = matchToSimbad(gaiaDR2_SID, RA, Dec)
    #because there was at least one bad match of Gulliver_15 to NGC 6561 (when Gulliver 15 is not in Simbad)
    #also Gulliver 7 to Ruprecht 77
    idx = gaiaDR2_df.index[pd.isnull(gaiaDR2_df['SimbadName'])].tolist()
    simbad_ID[idx] = names[idx]
    gaiaDR2_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)

    #check for duplicates
    for index, row in  gaiaDR2_df.iterrows():
        check = gaiaDR2_df.loc[gaiaDR2_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('gaiaDR2', row['Simbad_ID'], row['Name'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    gaiaDR2_df.to_csv('Cantat-Gaudin2018_GaiaDR2_clean.csv', index=False)

gaiaDR2_df = pd.read_csv('Cantat-Gaudin2018_GaiaDR2_clean.csv')
gaiaDR2_df#.loc[(gaiaDR2_df['Name'] == 'Collinder_258') | (gaiaDR2_df['Name'] == 'Harvard_5')]

Unnamed: 0,Simbad_ID,_RAJ2000,_DEJ2000,Name,RAJ2000[deg],DEJ2000[deg],GLON[deg],GLAT[deg],r50[deg],Nstars,pmRA[mas/yr],pmDE[mas/yr],plx[mas],dmode[pc],Rgc[pc],SimbadName
0,[KPR2005]_10,03 27 28.80,+34 58 51.6,ASCC_10,51.870,34.981,155.723,-17.770,0.558,71,-1.737,-1.368,1.459,672.0,8927.2,[KPR2005] 10
1,[KPR2005]_101,19 13 35.76,+36 22 08.4,ASCC_101,288.399,36.369,68.028,11.608,0.372,75,0.934,1.288,2.488,397.3,8202.3,[KPR2005] 101
2,[KPR2005]_105,19 42 11.52,+27 21 57.6,ASCC_105,295.548,27.366,62.825,2.063,0.648,127,1.464,-1.635,1.783,551.8,8103.0,[KPR2005] 105
3,[KPR2005]_107,19 48 39.36,+21 59 13.2,ASCC_107,297.164,21.987,58.904,-1.901,0.174,59,-0.155,-5.156,1.109,878.5,7922.3,[KPR2005] 107
4,[KPR2005]_108,19 53 13.44,+39 20 56.4,ASCC_108,298.306,39.349,74.378,6.074,0.537,230,-0.519,-1.690,0.838,1154.0,8106.7,[KPR2005] 108
5,[KPR2005]_11,03 32 13.44,+44 51 21.6,ASCC_11,53.056,44.856,150.546,-9.224,0.312,276,0.926,-3.030,1.141,854.5,9083.9,[KPR2005] 11
6,[KPR2005]_110,20 02 58.08,+33 31 40.8,ASCC_110,300.742,33.528,70.411,1.378,0.203,70,0.271,-3.132,0.497,1902.2,7908.1,[KPR2005] 110
7,[KPR2005]_111,20 11 33.84,+37 30 54.0,ASCC_111,302.891,37.515,74.714,2.056,0.537,156,-1.150,-1.524,1.166,836.9,8159.5,[KPR2005] 111
8,[KPR2005]_113,21 11 43.92,+38 38 16.8,ASCC_113,317.933,38.638,82.877,-6.589,0.529,196,0.800,-3.679,1.762,558.2,8289.5,[KPR2005] 113
9,[KPR2005]_114,21 39 57.60,+53 59 49.2,ASCC_114,324.990,53.997,97.082,1.028,0.216,150,-3.716,-3.421,1.066,913.2,8501.0,[KPR2005] 114


In [33]:
#lynga catalo: https://heasarc.gsfc.nasa.gov/W3Browse/star-catalog/lyngaclust.html
if (fixLynga):
    lynga_df = pd.read_csv('lyngaCat.txt',sep='|')

    #fix the column names
    lynga_df.rename(columns=lambda x: x.strip(), inplace=True)

    #fix all the cells
    lynga_df = lynga_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
    lynga_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

    #take only the first entries for each name (some have multiple lines
    lynga_df = lynga_df.loc[pd.notna(lynga_df['Name'].str.strip())]

    #fix capitalization in the names!
    names = lynga_df['Name'].values
    for i,n in enumerate(names):
        if ((n[0:3] != 'NGC') and (n[0:2] != 'IC')):
            p1 = n.find(' ')
            n = n[0] + n[1:p1].lower() + n[p1:]

        if (n[0:9] == 'Vdb-hagen'):
            n = 'vdBergh-Hagen' + n[9:]

        if (n[0:10] == 'Hav-moffat'):
            n = 'Havlen-Moffat' + n[10:]

        if (n[0:7] == 'Vdbergh'):
            n = 'vdBergh' + n[7:]

        if (n[0:8] == 'Dol-dzim'):
            n = 'Dol-Dzim' + n[8:]

        if (n == 'Sigma ORI'):
            n = 'Sigma Ori'

        names[i] = n.replace(' ','_')

    lynga_df['Name'] = names

    #print(lynga_df.loc[lynga_df['Name'] == 'NGC_2579'])
    
    #remove the "Unnamed" columns
    lynga_df = lynga_df.loc[:, ~lynga_df.columns.str.contains('^Unnamed')]
    
    #add Simbad names for matching (if possible) so I reduce the duplicates 
    lynga_ID = lynga_df['Name']
    RA = [x*units.degree for x in lynga_df['ra']]
    Dec = [x*units.degree for x in lynga_df['dec']]
    simbad_ID = matchToSimbad(lynga_ID, RA, Dec)
    lynga_df.insert(loc=0, column='Simbad_ID', value=simbad_ID)
    
    #check for duplicates
    for index, row in  lynga_df.iterrows():
        check = lynga_df.loc[lynga_df['Simbad_ID'] == row['Simbad_ID']]
        if (len(check['Simbad_ID']) != 1):
            print('lynga', row['Simbad_ID'], row['Name'])
            print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
            print('')
        
    lynga_df.to_csv('lyngaCat_clean.csv', index=False)
    
lynga_df = pd.read_csv('lyngaCat_clean.csv')
lynga_df

Unnamed: 0,Simbad_ID,Name,ra,dec,distance,log_age,angular_diameter,alt_name,lii,bii,iau_num,seq_code,seq_num,prec_ra,prec_dec,lund_record_num,ocl_num,ref_angular_diameter,ref_distance,ref_log_age,metallicity,ref_metallicity,e_bv,ref_e_bv,type_flag,ref_type_flag,tr_concent_class,tr_range_class,tr_richness_class,tr_nebulosity,sb_bs_mag,sb_spect_code,sb_total_mag,sk_total_mag,sk_bv_color,sk_num_stars,ja_star_num,ja_class,ja_max_class,ja_richness,ja_e_bv,ref_ja_e_bv,ja_bv_turnoff,ref_ja_bv_turnoff,ly_tr_concent_class,ly_tr_range_class,ly_tr_richness_class,ly_tr_nebulosity,ly_member_stars,ly_angular_diameter,ly_refs_flag,radvel_weight,radvel,radvel_weight_class,ref_radvel1,ref_radvel2,ref_radvel3,ref_radvel4,ref_radvel5,basel_spect_code,basel_color_type,neg_ra_tracer,neg_lii_tracer,neg_seq_num_tracer,pos_ra_tracer,pos_lii_tracer,pos_seq_num_tracer,jdl_distance,jdl_distance_weight,jdl_turnoff_color,jdl_age,jdl_age_weight,jdl_reddening,jdl_reddening_flag,jdl_reddening_weight,ref_jdl1,ref_jdl2,ref_jdl3,ref_jdl4,ref_jdl5,ref_jdl6
0,Cl_Berkeley_59,Berkeley_59,0.64425,67.37840,,,10.0,0000+671,118.25,4.95,C0000+671,3,59,0.52,3.3,3,286.0,419.0,,,,,,,,,3,2,P,,11.0,,,,,,,,,,,,,,1.0,3.0,M,N,40.0,10.0,1,,,,,,,,,0,0,2,9,1,4,10,10,,,,,,,,,,,,,,
1,Cl_Berkeley_104,Berkeley_104,0.87085,63.59506,,,4.0,0000+633,117.63,1.22,C0000+633,3,104,0.52,3.3,4,282.0,419.0,,,,,,,,,4,2,P,,16.0,,,,,,,,,,,,,,2.0,1.0,P,-,15.0,3.0,1,,,,,,,,,0,0,3,2,1044,5,7,0,,,,,,,,,,,,,,
2,Cl_Blanco_1,Blanco_1,1.06343,-29.92162,190.0,7.70,89.0,0001-302,14.97,-79.26,C0001-302,24,1,0.51,3.3,5,43.0,49.0,170.0,374.0,0.03,322.0,0.09,64,,,3,2,M,,8.0,10105.0,,,,,20.0,1.0,1.0,1.0,0.00,380.0,-0.15,380.0,4.0,3.0,M,-,30.0,70.0,1,0.0,5.0,2.0,1.0,73.0,,,,0,0,4,869,0,6,827,0,240.0,4.0,-0.15,70.0,3.0,0.02,,3.0,138.0,123.0,,,,
3,Stock_19,Stock_19,1.09607,56.02838,,,3.0,0001+557,116.35,-6.24,C0001+557,11,19,0.52,3.3,6,274.0,437.0,,,,,,,,,2,2,P,,8.0,10105.0,,,,,,,,,,,,,3.0,1.0,P,-,6.0,2.0,1,,,,,,,,,0,0,5,1051,2,7,1052,44,,,,,,,,,,,,,,
4,Czernik_1,Czernik_1,1.92972,61.41163,,,9.0,0005+611,117.73,-1.02,C0005+611,4,1,0.52,3.3,7,283.0,117.0,,,,,,,DO,19.0,4,2,M,,,,,,,,,,,,,,,,1.0,2.0,P,-,12.0,3.0,1,,,,,,,,,0,0,6,4,0,8,8,27,,,,,,,,,,,,,,
5,Cl_Berkeley_1,Berkeley_1,2.40822,60.42822,,,5.0,0007+601,117.79,-2.03,C0007+601,3,1,0.53,3.3,8,284.0,419.0,,,,,,,,,4,2,P,,,,,,,,,,,,,,,,3.0,1.0,P,-,10.0,5.0,1,,,,,,,,,0,0,7,7,0,9,9,12,,,,,,,,,,,,,,
6,C_0007+609,King_13,2.53490,61.21153,,,7.0,0007+609,117.98,-1.28,C0007+609,10,13,0.53,3.3,9,285.0,329.0,,,,,,,,,4,2,P,,12.0,,,,,,,,,,,,,,2.0,2.0,M,-,30.0,5.0,1,,,,,,,,,0,0,8,8,1050,10,3,19,,,,,,,,,,,,,,
7,Cl_Berkeley_60,Berkeley_60,4.42607,60.96103,,,4.0,0015+606,118.85,-1.64,C0015+606,3,60,0.54,3.3,10,288.0,419.0,,,,,,,,,4,2,P,,14.0,,,,,,,,,,,,,,3.0,1.0,P,-,20.0,3.0,1,,,,,,,,,0,0,9,3,3,11,1155,30,,,,,,,,,,,,,,
8,C_0019+641,King_1,5.49230,64.39395,,,7.0,0019+641,119.75,1.69,C0019+641,10,1,0.55,3.3,11,290.0,284.0,,,,,,,,,3,2,P,,13.0,,,,,,,,,,,,,,2.0,2.0,R,-,100.0,9.0,1,,,,,,,,,0,0,10,12,0,1155,13,32,,,,,,,,,,,,,,
9,Cl_Berkeley_2,Berkeley_2,6.31620,60.39356,,,4.0,0022+601,119.70,-2.31,C0022+601,3,2,0.55,3.3,12,289.0,419.0,,,,,,,,,1,3,M,,15.0,,,,,,,,,,,,,,1.0,1.0,M,-,30.0,2.0,1,,,,,,,,,0,0,44,1155,8,13,11,29,,,,,,,,,,,,,,


In [34]:
#mwsc + webda
mwsc_webda_df = mwsc_df.join(webda_df.set_index('Simbad_ID'),
                            on='Simbad_ID',how='outer', lsuffix='_mwsc', rsuffix='_webda')
      
# + piskunov
mwsc_webda_piskunov_df = mwsc_webda_df.join(piskunov_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_piskunov')

# + kharchenko
mwsc_webda_piskunov_kharchenko_df = mwsc_webda_piskunov_df.join(kharchenko_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_kharchenko')

# + Salaris
mwsc_webda_piskunov_kharchenko_salaris_df = mwsc_webda_piskunov_kharchenko_df.join(salaris_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_salaris')

# + vandenberg
mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_df = mwsc_webda_piskunov_kharchenko_salaris_df.join(
                            vandenbergh_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_vandenbergh')


# + Gaia
mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_df = mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_df.join(
                            gaiaDR2_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_GaiaDR2')

# + Lynga
mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_lynga_df = mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_df.join(
                            lynga_df.set_index('Simbad_ID'), 
                            on='Simbad_ID', how='outer', rsuffix='_lynga')

#rename
OCs = mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_lynga_df.copy()

print(len(OCs.loc[pd.isnull(OCs['Simbad_ID'])]))

#reindex
OCs = OCs.reset_index(drop=True)

#if there are NaN rows; drop them
idx = OCs.index[pd.isnull(OCs['Simbad_ID'])]
OCs.drop(idx, inplace=True)

#reindex
OCs = OCs.reset_index(drop=True)


print(len(mwsc_df), len(webda_df), len(piskunov_df), len(kharchenko_df), len(salaris_df), len(vandenbergh_df),
     len(gaiaDR2_df), len(lynga_df), len(mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_df),
      len(mwsc_webda_piskunov_kharchenko_salaris_vandenbergh_gaia_lynga_df), len(OCs))

print(OCs.columns.values)

#dump to a file
OCs.to_csv('OCcompiled.csv', index=False)


0
2908 936 650 3005 70 595 1228 1080 3496 3689 3689
['Simbad_ID' 'Name_mwsc' 'broad_type' 'cluster_status' 'ra' 'dec' 'lii'
 'bii' 'core_radius' 'central_radius' 'cluster_radius' 'pm_ra' 'pm_dec'
 'pm_tot_error' 'rad_vel' 'rad_vel_error' 'num_rad_vel_stars'
 'num_core_stars' 'num_central_stars' 'num_cluster_stars' 'distance'
 'e_bv' 'distance_modulus' 'e_jk' 'e_jh' 'delta_h' 'log_age'
 'log_age_error' 'num_log_age_stars' 'king_core_radius'
 'king_core_radius_error' 'king_tidal_radius' 'king_tidal_radius_error'
 'king_norm_factor' 'king_norm_factor_error' 'reference_code'
 'cluster_type' 'metallicity' 'metallicity_error' 'num_metallicity_stars'
 'comments' 'class' 'Name_webda' 'RA_2000' 'Dec_2000' 'l' 'b' 'Dist' 'Mod'
 'EB-V' 'Age' 'ST' 'Z' 'Diam' 'Fe/H' 'MRV' 'pm RA' 'pm Dec' 'Measures'
 'Stars' 'COCD' 'Name' 'GLON[deg]' 'GLAT[deg]' 'DistMod' 'E(B-V)'
 'Dist[pc]' 'logt[yr]' 'rt[pc]' 'e_rt[pc]' 'logM[MSun]' 'e_logM[MSun]'
 'rtA[pc]' 'e_rtA[pc]' 'logMA[MSun]' 'e_logMA[MSun]' 'MWSC'
 'Nam

In [35]:
#a quick check to make sure that items matched up
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
#print(OCs.loc[OCs['Simbad_ID'] == 'C_1440+697'].iloc[0])
print(OCs.loc[OCs['Simbad_ID'] == 'NGC_2682'].iloc[0])

Simbad_ID                                                           NGC_2682
Name_mwsc                                                           NGC_2682
broad_type                                                                  
cluster_status                                                              
ra                                                                  08 51 23
dec                                                                 +11 48.9
lii                                                                  215.691
bii                                                                   31.923
core_radius                                                              0.1
central_radius                                                          0.55
cluster_radius                                                          1.03
pm_ra                                                                  -7.31
pm_dec                                                                 -5.92

In [36]:
#check for duplicates (by definition there should be none)
for index, row in  OCs.iterrows():
    check = OCs.loc[OCs['Simbad_ID'] == row['Simbad_ID']]
    if (len(check['Simbad_ID']) != 1):
        print('OCS', row['Simbad_ID'], row['Name'])
        print('Check', check['Simbad_ID'].values, len(check['Simbad_ID']))
        print('')

OCS [BDS2003]_73 nan
Check ['[BDS2003]_73' '[BDS2003]_73'] 2

OCS [BDS2003]_73 nan
Check ['[BDS2003]_73' '[BDS2003]_73'] 2



In [37]:
check = OCs.loc[OCs['Simbad_ID'].str.contains('Berkeley_69')]
print(len(check['Simbad_ID']))


1


In [38]:
check = OCs.loc[OCs['Simbad_ID'] == '[BDS2003]_73']
#check = mwsc_df.loc[mwsc_df['Simbad_ID'] == '[BDS2003]_73']
#check = mwsc_df.loc[mwsc_df['Name']== 'BDSB_73']
#print(len(check['Simbad_ID']))
check

Unnamed: 0,Simbad_ID,Name_mwsc,broad_type,cluster_status,ra,dec,lii,bii,core_radius,central_radius,cluster_radius,pm_ra,pm_dec,pm_tot_error,rad_vel,rad_vel_error,num_rad_vel_stars,num_core_stars,num_central_stars,num_cluster_stars,distance,e_bv,distance_modulus,e_jk,e_jh,delta_h,log_age,log_age_error,num_log_age_stars,king_core_radius,king_core_radius_error,king_tidal_radius,king_tidal_radius_error,king_norm_factor,king_norm_factor_error,reference_code,cluster_type,metallicity,metallicity_error,num_metallicity_stars,comments,class,Name_webda,RA_2000,Dec_2000,l,b,Dist,Mod,EB-V,Age,ST,Z,Diam,Fe/H,MRV,pm RA,pm Dec,Measures,Stars,COCD,Name,GLON[deg],GLAT[deg],DistMod,E(B-V),Dist[pc],logt[yr],rt[pc],e_rt[pc],logM[MSun],e_logM[MSun],rtA[pc],e_rtA[pc],logMA[MSun],e_logMA[MSun],MWSC,Name_kharchenko,Type,n_Type,RA[hr],Dec[deg],GLON[deg]_kharchenko,GLAT[deg]_kharchenko,r0[deg],r1[deg],r2[deg],pmRA[mas/yr],pmDec[mas/yr],e_pm[mas/yr],RV[km/s],e_RV[km/s],n_RV[km/s],N1sr0,N1sr1,N1sr2,d[pc],E(B-V)_kharchenko,appDistMod[mag],E(J-Ks),E(J-H),dH,logt[yr]_kharchenko,e_logt[yr],Nt,rc[pc],e_rc[pc],rt[pc]_kharchenko,e_rt[pc]_kharchenko,k[pc-2],e_k[pc-2],Src,SType,[Fe/H][Sun],e_[Fe/H][Sun],n_[Fe/H],Name_salaris,dV,err_dV,[Fe/H],err_[Fe/H],t[Gyr],err_t,Rgc[kpc],z[pc],flag,tJP94[Gyr],Seq,Name_vandenbergh,l[deg],Diam[pc],R[pc],Z[pc],E(B-V)_vandenbergh,logT[yr],SimbadName,_RA[deg],_Dec[deg],_RAJ2000,_DEJ2000,Name_GaiaDR2,RAJ2000[deg],DEJ2000[deg],GLON[deg]_GaiaDR2,GLAT[deg]_GaiaDR2,r50[deg],Nstars,pmRA[mas/yr]_GaiaDR2,pmDE[mas/yr],plx[mas],dmode[pc],Rgc[pc],SimbadName_GaiaDR2,Name_lynga,ra_lynga,dec_lynga,distance_lynga,log_age_lynga,angular_diameter,alt_name,lii_lynga,bii_lynga,iau_num,seq_code,seq_num,prec_ra,prec_dec,lund_record_num,ocl_num,ref_angular_diameter,ref_distance,ref_log_age,metallicity_lynga,ref_metallicity,e_bv_lynga,ref_e_bv,type_flag,ref_type_flag,tr_concent_class,tr_range_class,tr_richness_class,tr_nebulosity,sb_bs_mag,sb_spect_code,sb_total_mag,sk_total_mag,sk_bv_color,sk_num_stars,ja_star_num,ja_class,ja_max_class,ja_richness,ja_e_bv,ref_ja_e_bv,ja_bv_turnoff,ref_ja_bv_turnoff,ly_tr_concent_class,ly_tr_range_class,ly_tr_richness_class,ly_tr_nebulosity,ly_member_stars,ly_angular_diameter,ly_refs_flag,radvel_weight,radvel,radvel_weight_class,ref_radvel1,ref_radvel2,ref_radvel3,ref_radvel4,ref_radvel5,basel_spect_code,basel_color_type,neg_ra_tracer,neg_lii_tracer,neg_seq_num_tracer,pos_ra_tracer,pos_lii_tracer,pos_seq_num_tracer,jdl_distance,jdl_distance_weight,jdl_turnoff_color,jdl_age,jdl_age_weight,jdl_reddening,jdl_reddening_flag,jdl_reddening_weight,ref_jdl1,ref_jdl2,ref_jdl3,ref_jdl4,ref_jdl5,ref_jdl6
2202,[BDS2003]_73,BDSB_73,,,05 41 33,+35 51.7,173.65,2.899,0.025,0.1,0.185,2.52,-4.9,0.45,-13.41,,1,5.0,39.0,115.0,1110.0,0.541,10.401,0.26,0.173,0.0,8.875,,,1.92,0.68,7.12,2.02,4.32,1.14,FPRO,,,,0.0,"Overlaps with corona of Sh2_235, BDSB_72, and ...",OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,631.0,Sh2_235B,no,,5.6815,35.707,173.706,2.697,0.02,0.095,0.14,1.45,-4.66,0.56,-8.86,0.3,2.0,1.0,17.0,28.0,2152.0,1.353,12.1,0.65,0.434,0.0,6.63,,,3.86,0.98,8.45,1.29,5.22,2.04,LLEM,emb,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2203,[BDS2003]_73,BDSB_73,,,05 41 33,+35 51.7,173.65,2.899,0.025,0.1,0.185,2.52,-4.9,0.45,-13.41,,1,5.0,39.0,115.0,1110.0,0.541,10.401,0.26,0.173,0.0,8.875,,,1.92,0.68,7.12,2.02,4.32,1.14,FPRO,,,,0.0,"Overlaps with corona of Sh2_235, BDSB_72, and ...",OPEN STAR CL...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,639.0,BDSB_73,,,5.6925,35.861,173.645,2.892,0.025,0.1,0.185,2.52,-4.9,0.45,-13.41,,1.0,5.0,39.0,115.0,1110.0,0.541,10.401,0.26,0.173,0.0,8.875,,,1.92,0.68,7.12,2.02,4.32,1.14,FPRO,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [39]:
check = OCs.loc[pd.isna(OCs['Simbad_ID'])]
check

Unnamed: 0,Simbad_ID,Name_mwsc,broad_type,cluster_status,ra,dec,lii,bii,core_radius,central_radius,cluster_radius,pm_ra,pm_dec,pm_tot_error,rad_vel,rad_vel_error,num_rad_vel_stars,num_core_stars,num_central_stars,num_cluster_stars,distance,e_bv,distance_modulus,e_jk,e_jh,delta_h,log_age,log_age_error,num_log_age_stars,king_core_radius,king_core_radius_error,king_tidal_radius,king_tidal_radius_error,king_norm_factor,king_norm_factor_error,reference_code,cluster_type,metallicity,metallicity_error,num_metallicity_stars,comments,class,Name_webda,RA_2000,Dec_2000,l,b,Dist,Mod,EB-V,Age,ST,Z,Diam,Fe/H,MRV,pm RA,pm Dec,Measures,Stars,COCD,Name,GLON[deg],GLAT[deg],DistMod,E(B-V),Dist[pc],logt[yr],rt[pc],e_rt[pc],logM[MSun],e_logM[MSun],rtA[pc],e_rtA[pc],logMA[MSun],e_logMA[MSun],MWSC,Name_kharchenko,Type,n_Type,RA[hr],Dec[deg],GLON[deg]_kharchenko,GLAT[deg]_kharchenko,r0[deg],r1[deg],r2[deg],pmRA[mas/yr],pmDec[mas/yr],e_pm[mas/yr],RV[km/s],e_RV[km/s],n_RV[km/s],N1sr0,N1sr1,N1sr2,d[pc],E(B-V)_kharchenko,appDistMod[mag],E(J-Ks),E(J-H),dH,logt[yr]_kharchenko,e_logt[yr],Nt,rc[pc],e_rc[pc],rt[pc]_kharchenko,e_rt[pc]_kharchenko,k[pc-2],e_k[pc-2],Src,SType,[Fe/H][Sun],e_[Fe/H][Sun],n_[Fe/H],Name_salaris,dV,err_dV,[Fe/H],err_[Fe/H],t[Gyr],err_t,Rgc[kpc],z[pc],flag,tJP94[Gyr],Seq,Name_vandenbergh,l[deg],Diam[pc],R[pc],Z[pc],E(B-V)_vandenbergh,logT[yr],SimbadName,_RA[deg],_Dec[deg],_RAJ2000,_DEJ2000,Name_GaiaDR2,RAJ2000[deg],DEJ2000[deg],GLON[deg]_GaiaDR2,GLAT[deg]_GaiaDR2,r50[deg],Nstars,pmRA[mas/yr]_GaiaDR2,pmDE[mas/yr],plx[mas],dmode[pc],Rgc[pc],SimbadName_GaiaDR2,Name_lynga,ra_lynga,dec_lynga,distance_lynga,log_age_lynga,angular_diameter,alt_name,lii_lynga,bii_lynga,iau_num,seq_code,seq_num,prec_ra,prec_dec,lund_record_num,ocl_num,ref_angular_diameter,ref_distance,ref_log_age,metallicity_lynga,ref_metallicity,e_bv_lynga,ref_e_bv,type_flag,ref_type_flag,tr_concent_class,tr_range_class,tr_richness_class,tr_nebulosity,sb_bs_mag,sb_spect_code,sb_total_mag,sk_total_mag,sk_bv_color,sk_num_stars,ja_star_num,ja_class,ja_max_class,ja_richness,ja_e_bv,ref_ja_e_bv,ja_bv_turnoff,ref_ja_bv_turnoff,ly_tr_concent_class,ly_tr_range_class,ly_tr_richness_class,ly_tr_nebulosity,ly_member_stars,ly_angular_diameter,ly_refs_flag,radvel_weight,radvel,radvel_weight_class,ref_radvel1,ref_radvel2,ref_radvel3,ref_radvel4,ref_radvel5,basel_spect_code,basel_color_type,neg_ra_tracer,neg_lii_tracer,neg_seq_num_tracer,pos_ra_tracer,pos_lii_tracer,pos_seq_num_tracer,jdl_distance,jdl_distance_weight,jdl_turnoff_color,jdl_age,jdl_age_weight,jdl_reddening,jdl_reddening_flag,jdl_reddening_weight,ref_jdl1,ref_jdl2,ref_jdl3,ref_jdl4,ref_jdl5,ref_jdl6


## Check for duplicates in RA and Dec

In [40]:
def getCoord(row):

    if (pd.notna(row['_RAJ2000']) and pd.notna(row['_DEJ2000'])):
        return SkyCoord(ra=row['_RAJ2000']+' hours', dec=row['_DEJ2000']+' degree', frame='icrs')
    
    elif (pd.notna(row['ra']) and pd.notna(row['dec'])):
        return SkyCoord(ra=row['ra']+' hours', dec=row['dec']+' degree', frame='icrs')
        
    elif (pd.notna(row['ra_lynga']) and pd.notna(row['dec_lynga'])):
        return SkyCoord(ra=row['ra_lynga']*units.degree, dec=row['dec_lynga']*units.degree, frame='icrs')
    
    elif (pd.notna(row['RA_2000']) and pd.notna(row['Dec_2000'])):
        return SkyCoord(ra=row['RA_2000']+' hours', dec=row['Dec_2000']+' degree', frame='icrs')
        
    elif (pd.notna(row['RA[hr]']) and pd.notna(row['Dec[deg]'])):
        return SkyCoord(ra=row['RA[hr]']*units.hourangle, dec=row['Dec[deg]']*units.degree, frame='icrs')
        
    elif (pd.notna(row['_RA[deg]']) and pd.notna(row['_Dec[deg]'])):
        return SkyCoord(ra=row['_Dec[deg]']*units.degree, dec=row['_Dec[deg]']*units.degree, frame='icrs')
        
    elif (pd.notna(row['GLON[deg]']) and pd.notna(row['GLAT[deg]'])):
        return SkyCoord(l=row['GLON[deg]']*units.degree, b=row['GLAT[deg]']*units.degree, frame='galactic').icrs

    elif (pd.notna(row['l']) and pd.notna(row['b'])):
        return SkyCoord(l=row['l']*units.degree, b=row['b']*units.degree, frame='galactic').icrs
    
    else:
        print('NO RA, Dec : ', row['Simbad_ID'])
        return False

In [41]:
#first get the coordinates in lists
RA = []
Dec = []
for index, row in OCs.iterrows():
    #RA and Dec
    c = getCoord(row)
    if (c):
        RA.append(c.ra.degree)
        Dec.append(c.dec.degree)

catalog = SkyCoord(ra = RA*units.degree, dec = Dec*units.degree)

In [42]:
#now match to the full catalog to see if there are duplicates
#I think these are OK overlaps (really different clusters)
#!!! OVERLAP NGC_3590 Hogg_12 [0.46110687] 253 2967
#!!! OVERLAP C_0925-549 Ruprecht_77 [0.90177733] 473 3360
#!!! OVERLAP NGC_6997 NGC_6996 [0.91921481] 3238 3250
#!!! OVERLAP Cl_Platais_8 NAME_HIP_67014_Cluster [0.44914403] 3260 3266
#!!! OVERLAP AH03_J0822-36.4 NGC_2579 [0.97738812] 888 3046 #not sure about this; NGC 2579 is labelled as HII region in Simbad
#!!! OVERLAP Cl_Pismis_24 NGC_6357 [0.97546562] 3134 3347 #nto sure about this; NGC 6357 is labelled as HII region in Simbad


#need to fix:
#!!! OVERLAP [BDS2003]_73 [BDS2003]_73 [6.47828804e-13] 2202 2203

max_sep = 1.0 * units.arcmin
nover = 0
for index, row in OCs.iterrows():
    c = getCoord(row)
    idx, d2d, d3d = c.match_to_catalog_sky(catalog, nthneighbor=2) #first neighbor is itself
    #print(index, row['Name'], OCs.iloc[int(idx)]['Name'], d2d.degree)
    if (d2d < max_sep and idx != index):
        print('!!! OVERLAP',row['Simbad_ID'], OCs.iloc[int(idx)]['Simbad_ID'], d2d.arcminute, index, idx)
        nover += 1

#this should be 12
print(nover)
        


!!! OVERLAP NGC_3590 Hogg_12 [0.46110687] 253 2993
!!! OVERLAP C_0925-549 Ruprecht_77 [0.90177733] 473 3483
!!! OVERLAP C_0847-465 Ruprecht_71 [0.90420297] 676 3482
!!! OVERLAP AH03_J0822-36.4 NGC_2579 [0.97738812] 888 3094
!!! OVERLAP NGC_6494 M_23 [0.89772001] 1300 3007
!!! OVERLAP C_0747-171 Ruprecht_37 [0.96929315] 1331 3550
!!! OVERLAP NGC_2323 M_50 [0.41511329] 1518 3465
!!! OVERLAP Basel_8 Cl_Basel_8 [0.3] 1777 3365
!!! OVERLAP Czernik_18 C_0424+308 [0.59099504] 2117 3368
!!! OVERLAP [BDS2003]_73 [BDS2003]_73 [6.47828804e-13] 2203 2202
!!! OVERLAP NAME_Alessi_Teutsch_3 Name_Cl_Alessi_Teutsch_3 [0.9250396] 2970 3355
!!! OVERLAP Hogg_12 NGC_3590 [0.46110687] 2993 253
!!! OVERLAP M_23 NGC_6494 [0.89772001] 3007 1300
!!! OVERLAP NGC_2579 AH03_J0822-36.4 [0.97738812] 3094 888
!!! OVERLAP NGC_6997 NGC_6996 [0.91921481] 3289 3303
!!! OVERLAP NGC_6996 NGC_6997 [0.91921481] 3303 3289
!!! OVERLAP Name_Cl_Alessi_Teutsch_3 NAME_Alessi_Teutsch_3 [0.9250396] 3355 2970
!!! OVERLAP Cl_Basel_8 B

In [22]:
print(OCs.columns.values)

['Simbad_ID' 'Name_mwsc' 'broad_type' 'cluster_status' 'ra' 'dec' 'lii'
 'bii' 'core_radius' 'central_radius' 'cluster_radius' 'pm_ra' 'pm_dec'
 'pm_tot_error' 'rad_vel' 'rad_vel_error' 'num_rad_vel_stars'
 'num_core_stars' 'num_central_stars' 'num_cluster_stars' 'distance'
 'e_bv' 'distance_modulus' 'e_jk' 'e_jh' 'delta_h' 'log_age'
 'log_age_error' 'num_log_age_stars' 'king_core_radius'
 'king_core_radius_error' 'king_tidal_radius' 'king_tidal_radius_error'
 'king_norm_factor' 'king_norm_factor_error' 'reference_code'
 'cluster_type' 'metallicity' 'metallicity_error' 'num_metallicity_stars'
 'comments' 'class' 'Name_webda' 'RA_2000' 'Dec_2000' 'l' 'b' 'Dist' 'Mod'
 'EB-V' 'Age' 'ST' 'Z' 'Diam' 'Fe/H' 'MRV' 'pm RA' 'pm Dec' 'Measures'
 'Stars' 'COCD' 'Name' 'GLON[deg]' 'GLAT[deg]' 'DistMod' 'E(B-V)'
 'Dist[pc]' 'logt[yr]' 'rt[pc]' 'e_rt[pc]' 'logM[MSun]' 'e_logM[MSun]'
 'rtA[pc]' 'e_rtA[pc]' 'logMA[MSun]' 'e_logMA[MSun]' 'MWSC'
 'Name_kharchenko' 'Type' 'n_Type' 'RA[hr]' 'Dec[deg]'
 '

# Make a plot of the age distribution and mass distribution

### First check how many have both

In [23]:
def getMass(row, mm = 0.5):
    #take a mean if there are more than 1
    masses = []
    if (pd.notna(row['num_cluster_stars'])):
        masses.append(row['num_cluster_stars']*mm)
        
    if (pd.notna(row['Nstars'])):
        masses.append(row['Nstars']*mm)    

    if (pd.notna(row['Stars'])):
        masses.append(row['Stars']*mm)    
        
    if (pd.notna(row['ly_member_stars'])):
        if row['ly_member_stars'].isnumeric():
            masses.append(float(row['ly_member_stars'])*mm)
        
    if (pd.notna(row['N1sr2'])):
        masses.append(row['N1sr2']*mm)    
        
    if (pd.notna(row['logM[MSun]'])):
        masses.append(10.**row['logM[MSun]'])
        
    if (pd.notna(row['logMA[MSun]'])):
        masses.append(10.**row['logMA[MSun]'])
        

    if (len(masses) > 0):
        masses = np.array(masses)
        return (np.mean(masses), np.std(masses)/(len(masses))**0.5)
    else:
        print('NO MASS', row['Name'])
        return (np.nan, np.nan)

mass = []
for index, row in OCs.iterrows():
    m,em  = getMass(row)
    if (~np.isnan(m)):
        mass.append(m)
        

# hasAge = OCs.loc[(pd.notna(OCs['Age'])) | 
#                  (pd.notna(OCs['log_age'])) |
#                  (pd.notna(OCs['logt[yr]'])) |
#                  (pd.notna(OCs['logt[yr]_kharchenko'])) |
#                  (pd.notna(OCs['logt'])) |
#                  (pd.notna(OCs['logt_vandenbergh'])) |
#                  (pd.notna(OCs['t[Gyr]']))
#                  ]
# hasBoth = OCs.loc[( 
#          (pd.notna(OCs['logM[MSun]'])) |
#          (pd.notna(OCs['num_cluster_stars'])) |
#          (pd.notna(OCs['Stars'])) | 
#          (pd.notna(OCs['logMA[MSun]'])) |
#          (pd.notna(OCs['N1sr0'])) |
#          (pd.notna(OCs['N1sr1'])) |
#          (pd.notna(OCs['N1sr2'])) 
#     ) & (
#         (pd.notna(OCs['Age'])) | 
#         (pd.notna(OCs['log_age'])) |
#         (pd.notna(OCs['logt[yr]'])) |
#         (pd.notna(OCs['logt[yr]_kharchenko'])) |
#         (pd.notna(OCs['logt'])) |
#         (pd.notna(OCs['logt_vandenbergh'])) |
#         (pd.notna(OCs['t[Gyr]']))
#     )]
print(len(OCs), len(mass))
#print(mass)

AttributeError: 'float' object has no attribute 'isnumeric'

In [None]:
noAge = OCs.loc[(pd.isnull(OCs['Age'])) &
                 (pd.isnull(OCs['log_age'])) &
                 (pd.isnull(OCs['logt[yr]'])) &
                 (pd.isnull(OCs['logt[yr]_kharchenko'])) &
                 (pd.isnull(OCs['logt'])) &
                 (pd.isnull(OCs['logt_vandenbergh'])) &
                 (pd.isnull(OCs['t[Gyr]']))
                 ]
noAge

In [None]:
#Add a column to estimate the mass from the number of stars? (or vice versa) 
#This would require an estimate of the mean mass, which depends on age

#as a test, just assume <m>=0.5
meanM = 0.5

logMass = []
logAge = []
name = []
for index, row in  hasBoth.iterrows():
    name.append(row['name'])
    
    #age
#     (pd.notnull(OCs['Age'])) | 
#                  (pd.notnull(OCs['log_age'])) |
#                  (pd.notnull(OCs['logt[yr]'])) |
#                  (pd.notnull(OCs['logt[yr]_kharchenko'])) |
#                  (pd.notnull(OCs['logt'])) |
#                  (pd.notnull(OCs['logt_vandenbergh'])) |
#                  (pd.notnull(OCs['t[Gyr]']))
                 
                
    if (pd.notnull(row['log_age'])): #MWSC
        logAge.append(row['log_age'])
    elif (pd.notnull(row['logt'])): #Solaris
        logAge.append(row['logt'])
    elif (pd.notnull(row['log(t[yr])K'])): #Kharchenko
        logAge.append(row['log(t[yr])K'])
    elif (pd.notnull(row['Age'])): #WEBDA
        logAge.append(np.log10(row['Age']))

    #mass
#          (pd.notnull(OCs['logM[MSun]'])) |
#          (pd.notnull(OCs['num_cluster_stars'])) |
#          (pd.notnull(OCs['Stars'])) | 
#          (pd.notnull(OCs['logMA[MSun]'])) |
#          (pd.notnull(OCs['N1sr0'])) |
#          (pd.notnull(OCs['N1sr1'])) |
#          (pd.notnull(OCs['N1sr2'])) 
        
    if (pd.notnull(row['logM[Msun]'])): #Piskunov
        logMass.append(row['logM[Msun]'])    
    elif (pd.notnull(row['num_cluster_stars'])): #MWSC <-- NEED TO FIX THIS 
        logMass.append(np.log10(row['num_cluster_stars']*meanM))
        
print(len(name), len(logAge), len(logMass))

### Make a few plots

In [None]:
f,(ax1, ax2) = plt.subplots(1,2)

ax1.hist(logAge, bins=40, density=True)
ax1.set_xlabel('log(Age [yr?])')
ax1.set_yscale('log')

ax2.hist(logMass, bins=40, density=True)
ax2.set_xlabel('log(Mass [Msun])')
ax2.set_yscale('log')


### As if I'm only reading from the file

In [None]:
df = pd.read_csv("OCcompiled_hasAgeMass.csv")

data = np.vstack((df['logAge'].values, df['logMass'].values))
KDE = gaussian_kde(data)
sample = KDE.resample(size=int(1e5))

nbins = 40

f,(ax1, ax2) = plt.subplots(1,2)
ax1.hist(df['logAge'].values, bins=nbins, density=True)
ax1.hist(sample[0,:], bins=nbins, density=True, histtype='step')
ax1.set_xlabel('log(Age [yr?])')
ax1.set_yscale('log')

ax2.hist(df['logMass'].values, bins=nbins, density=True)
ax2.hist(sample[1,:], bins=nbins, density=True, histtype='step')
ax2.set_xlabel('log(Mass [Msun])')
ax2.set_yscale('log')

lt = 5
lm = 2
values = np.vstack([lt, lm])
print(KDE(values))
#NOTE: the age KDE seems to be missing the edges.  Maybe I should set those to zero automatically?

### Make a smaller file that has everything we need for the EBLSST code

Name, RA, Dec, dist[kpc], rh[pc], mass[Msun], Age[Myr], Z, sigma_v[km/s]

In [None]:
df = pd.read_csv("OCcompiled.csv")


name = []
RA = []
Dec = []

logMass = []
logAge = []
for index, row in df.iterrows():
    name.append(row['name'])
    
    #RA
    if (pd.notnull(row['ra'])):
        RA.append(row['ra'])
    elif (pd.notnull(row['RA_2000'])):
        RA.append(row['RA_2000'])
    else:
        print('NO RA', row['name'])
        #print('\nNO RA', row)
     
    #Dec
    Dec.append(row['dec'])
    
    #age
    if (pd.notnull(row['log_age'])): #MWSC
        logAge.append(row['log_age'])
    elif (pd.notnull(row['logt'])): #Solaris
        logAge.append(row['logt'])
    elif (pd.notnull(row['log(t[yr])K'])): #Kharchenko
        logAge.append(row['log(t[yr])K'])
    elif (pd.notnull(row['Age'])): #WEBDA
        logAge.append(np.log10(row['Age']))

print(len(name),len(RA))