# Process the data from Anna for the web

I will construct an SQLite database file for each cluster.

For now, I am only taking the cluster members, and I am planning to create a separate database file for each cluster.  (Currently, the sample here for NGC 6819 is ~1 GB in size.)

In [None]:
import pandas as pd
import numpy as np
import csv
import os
import sqlite3

from zipfile import ZipFile

In [None]:
def _read_zip_table(zip_ref, member, available, **read_csv_kwargs):
    """Read one delimited text member of an open zip archive into a DataFrame.

    Prints the member name when it is found, or an error message when it is not.

    Parameters
    ----------
    zip_ref : zipfile.ZipFile
        Archive opened for reading.
    member : str
        Name of the archive member to read.
    available : list of str
        Member names present in the archive (from ``zip_ref.namelist()``).
    **read_csv_kwargs
        Passed through to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or None when ``member`` is not in ``available``.
    """
    if member not in available:
        print('ERROR! did not find', member)
        return None

    print(member)
    with zip_ref.open(member) as f:
        return pd.read_csv(f, **read_csv_kwargs)


def _store_sample_mass_posteriors(zip_ref, file_list, memIDs, conn):
    """Write one 'posterior_for_id_<source_id>' table per member star.

    Every archive member whose name contains 'sampleMass.out' (excluding
    '__MACOSX' entries) is read.  Rows are grouped by star ID, and only IDs
    that appear in ``memIDs`` are written to the database.

    Parameters
    ----------
    zip_ref : zipfile.ZipFile
        Archive opened for reading.
    file_list : list of str
        Member names present in the archive.
    memIDs : numpy.ndarray
        source_id values of the member stars.
    conn : sqlite3.Connection
        Open connection that receives the tables.

    Returns
    -------
    numpy.ndarray of bool
        Same length as ``memIDs``; True where a posterior table was written.
    """
    have_sample_mass = np.zeros(len(memIDs), dtype = bool)
    sample_mass_files = [x for x in file_list if ('sampleMass.out' in x and '__MACOSX' not in x)]
    n_files = len(sample_mass_files)

    for i, smf in enumerate(sample_mass_files):
        print(f'{i} {(i+1)/n_files*100:.2f} {smf}' )
        with zip_ref.open(smf) as f:
            df = pd.read_csv(f, delimiter = r'\s+', quoting = csv.QUOTE_NONE)

        df.rename(columns = {'starId':'source_id'}, inplace = True)
        # QUOTE_NONE leaves literal double quotes around the IDs; strip them.
        # astype(str) keeps this working if the IDs were written unquoted and
        # therefore parsed as integers.
        df['source_id'] = df['source_id'].astype(str).str.replace('"', '', regex = False)

        # sort = False keeps the IDs in order of first appearance
        for iden, posterior in df.groupby('source_id', sort = False):
            # include only the members: one comparison yields both the
            # membership test and the positions to flag
            is_this_star = memIDs == np.int64(iden)
            if not is_this_star.any():
                continue
            have_sample_mass[is_this_star] = True
            posterior.drop(columns = 'source_id').to_sql(
                'posterior_for_id_' + iden, conn, if_exists = 'replace', index = False
            )

    return have_sample_mass


def process_cluster(cluster_name, run_sample_mass_files = True):
    """Convert the raw output files for one cluster into an sqlite database.

    Reads ``rawData/<cluster_name>.zip`` and writes ``sqlite/<cluster_name>.db``
    (both relative to the current working directory; the ``sqlite`` directory
    is created if needed).  Existing tables with the same names are replaced.

    Tables written, when the corresponding archive member exists:

    * ``cluster_posterior`` from ``<cluster_name>/<cluster_name>.res``, with a
      1-based ``iteration`` column added in front.
    * ``stars_summary`` from ``<cluster_name>/<cluster_name>.df``, restricted to
      rows whose ``member`` column is true.
    * ``parsec_isochrone`` from ``<cluster_name>/<cluster_name>_dir.ms``, with
      the u, g, r, i, z columns renamed to ``*_SDSS``.
    * ``posterior_for_id_<source_id>`` for each member star found in the
      ``sampleMass.out`` files (only when ``run_sample_mass_files`` is true).

    A missing archive member is reported with a printed error and skipped.

    Parameters
    ----------
    cluster_name : str
        Cluster identifier, used for the zip name, the folder inside the zip
        and the database name (e.g. 'NGC_6819').
    run_sample_mass_files : bool, optional
        Whether to also create the per-star posterior tables.
    """
    print('processing files for : ', cluster_name)

    # sqlite3.connect does not create missing parent directories
    os.makedirs('sqlite', exist_ok = True)

    # create an sqlite file (or connect to existing file)
    conn = sqlite3.connect(os.path.join('sqlite', cluster_name + '.db'))
    try:
        with ZipFile(os.path.join('rawData', cluster_name + '.zip'), 'r') as zip_ref:

            file_list = zip_ref.namelist()

            # Zip member names always use '/', whatever the OS, so build them
            # by hand instead of with os.path.join.
            prefix = cluster_name + '/' + cluster_name

            # read in and process the cluster .res file
            singlePop_res = _read_zip_table(zip_ref, prefix + '.res', file_list, delimiter = r'\s+')
            if singlePop_res is not None:
                singlePop_res.insert(loc = 0, column = 'iteration', value = singlePop_res.index + 1)
                singlePop_res.to_sql('cluster_posterior', conn, if_exists = 'replace', index = False)

            # read in and process the cluster summary file
            sum_file = prefix + '.df'
            stars_summary_members = None
            stars_summary = _read_zip_table(zip_ref, sum_file, file_list, delimiter = ' ')
            if stars_summary is not None:
                # drop the unnamed leading index column if it is there
                stars_summary = stars_summary.drop(columns = 'Unnamed: 0', errors = 'ignore')
                # take only the members to reduce the file size
                stars_summary_members = stars_summary.loc[stars_summary['member']]
                stars_summary_members.to_sql('stars_summary', conn, if_exists = 'replace', index = False)

            # read in and process the .ms file
            ms_df = _read_zip_table(zip_ref, prefix + '_dir.ms', file_list, delimiter = r'\s+')
            if ms_df is not None:
                # fix the column names because sqlite is case insensitive
                ms_df.rename(columns = {'u':'u_SDSS', 'g':'g_SDSS', 'r':'r_SDSS', 'i':'i_SDSS', 'z':'z_SDSS'}, inplace = True)
                ms_df.to_sql('parsec_isochrone', conn, if_exists = 'replace', index = False)

            if run_sample_mass_files:
                print('sampleMass output ... ')
                if stars_summary_members is None:
                    # the member list comes from the summary file; without it
                    # there is nothing to match the sampleMass IDs against
                    print('ERROR! cannot process sampleMass files without', sum_file)
                else:
                    memIDs = stars_summary_members['source_id'].to_numpy()
                    have_sample_mass = _store_sample_mass_posteriors(zip_ref, file_list, memIDs, conn)
                    n_found = int(np.count_nonzero(have_sample_mass))
                    print('# of found sampleMass files : ', n_found)
                    print('# of missing sampleMass files : ', len(have_sample_mass) - n_found)
    finally:
        # always release the database file, even if a step above failed
        conn.close()

In [None]:
# Process the summary file: copy rawData/cluster_params.csv into the
# 'cluster_parameters' table of sqlite/cluster_summary.db, replacing any
# existing table of that name.

# Read the CSV first so a missing or unreadable file does not leave an empty
# database behind.
cluster_params = pd.read_csv(os.path.join('rawData','cluster_params.csv'))

# sqlite3.connect does not create missing parent directories
os.makedirs('sqlite', exist_ok = True)
conn = sqlite3.connect(os.path.join('sqlite',  'cluster_summary.db'))
try:
    cluster_params.to_sql('cluster_parameters', conn, if_exists = 'replace', index = False)
finally:
    # always release the database file, even if the write failed
    conn.close()

# last expression of the cell: displays the table in the notebook
cluster_params

In [None]:
# Build sqlite/NGC_6791.db from rawData/NGC_6791.zip (per-star sampleMass tables included by default).
process_cluster('NGC_6791')

In [None]:
# Build sqlite/NGC_188.db from rawData/NGC_188.zip (per-star sampleMass tables included by default).
process_cluster('NGC_188')

In [None]:
# Build sqlite/NGC_2682.db from rawData/NGC_2682.zip (per-star sampleMass tables included by default).
process_cluster('NGC_2682')  # this cluster is also labelled M_67

In [None]:
# Build sqlite/NGC_6819.db from rawData/NGC_6819.zip (per-star sampleMass tables included by default).
process_cluster('NGC_6819')

In [None]:
# Build sqlite/NGC_7789.db from rawData/NGC_7789.zip (per-star sampleMass tables included by default).
process_cluster('NGC_7789')

In [None]:
# Build sqlite/NGC_2168.db from rawData/NGC_2168.zip (per-star sampleMass tables included by default).
process_cluster('NGC_2168')  # this cluster is also labelled M_35