# Process the data for the web

I will construct a separate SQLite database file for each cluster.

For now, I am only taking the cluster members, and I am planning to create a different database file for each cluster.  (Currently, the sample here for NGC 6819 is ~1 GB in size.)

In [None]:
import pandas as pd
import numpy as np
import csv
import os
import sqlite3

In [None]:
def process_cluster(cluster_name):
    """Build (or refresh) the SQLite database for one cluster.

    Reads the raw files under ``rawData/<cluster_name>/`` and writes them to
    ``sqlite/<cluster_name>.db``, replacing any tables that already exist:

    * ``<cluster_name>.res``  -> table ``cluster_posterior`` with an added
      1-based ``iteration`` column in front.
    * ``<cluster_name>.df``   -> table ``stars_summary``, restricted to rows
      whose ``member`` column is true.
    * every ``*sampleMass.out*`` file in ``<cluster_name>_sampleMass/`` ->
      one table ``posterior_for_id_<source_id>`` per member star found in
      the file (non-members are skipped to keep the database small).

    Progress and a found/missing summary are printed to stdout.

    Parameters
    ----------
    cluster_name : str
        Name of the cluster; used for the input directory, the input file
        names and the output database file name.

    Returns
    -------
    None
    """
    print('processing files for : ', cluster_name)

    raw_dir = os.path.join('rawData', cluster_name)

    # sqlite3.connect() creates the file but not a missing parent directory.
    os.makedirs('sqlite', exist_ok=True)

    # create an sqlite file (or connect to existing file)
    conn = sqlite3.connect(os.path.join('sqlite', cluster_name + '.db'))
    try:
        # read in and process the cluster .res file
        print(cluster_name + '.res')
        singlePopRes = pd.read_csv(
            os.path.join(raw_dir, cluster_name + '.res'), sep=r'\s+'
        )
        singlePopRes.insert(loc=0, column='iteration', value=singlePopRes.index + 1)
        singlePopRes.to_sql('cluster_posterior', conn, if_exists='replace', index=False)

        # read in and process the per-star summary
        print(cluster_name + '.df')
        starsSummary = pd.read_csv(
            os.path.join(raw_dir, cluster_name + '.df'), sep=' '
        )
        # The leading unnamed column is only a row index written by the
        # exporter; tolerate files that do not carry it.
        starsSummary = starsSummary.drop(columns='Unnamed: 0', errors='ignore')
        # take only the members to reduce the file size
        starsSummaryMembers = starsSummary.loc[starsSummary['member']]
        starsSummaryMembers.to_sql('stars_summary', conn, if_exists='replace', index=False)

        # parse through the sampleMass files and create one table per member star
        print('sampleMass output ... ')
        memIDs = starsSummaryMembers['source_id'].to_numpy()
        haveSampleMass = np.zeros(len(memIDs), dtype=bool)
        directory = os.path.join(raw_dir, cluster_name + '_sampleMass')
        files = os.listdir(directory)
        for i, filename in enumerate(files):
            if 'sampleMass.out' not in filename:
                continue

            f = os.path.join(directory, filename)
            print(f'{i} {(i+1)/len(files)*100:.2f} {filename}' )

            # skip directories (or anything else that is not a regular file)
            if not os.path.isfile(f):
                continue

            # QUOTE_NONE keeps the quoted IDs as text, so they are not
            # parsed as numbers; the quotes are stripped just below.
            df = pd.read_csv(f, sep=r'\s+', quoting=csv.QUOTE_NONE)
            df = df.rename(columns={'starId': 'source_id'})
            df['source_id'] = (
                df['source_id'].astype(str).str.replace('"', '', regex=False)
            )

            # one group per star in the file, in order of first appearance
            for iden, star_rows in df.groupby('source_id', sort=False):
                # include only the members; a single comparison both tests
                # membership and locates the star in memIDs
                is_this_star = memIDs == np.int64(iden)
                if not is_this_star.any():
                    continue
                haveSampleMass[is_this_star] = True
                star_rows.drop(columns='source_id').to_sql(
                    'posterior_for_id_' + iden, conn, if_exists='replace', index=False
                )

        n_found = int(haveSampleMass.sum())
        print('# of found sampleMass files : ', n_found)
        print('# of missing sampleMass files : ', len(haveSampleMass) - n_found)

        # there's probably some linking that I could do, but for now I don't think it is necessary
    finally:
        # always release the database file, even if a read or write failed
        conn.close()

In [None]:
# Build sqlite/NGC_6791.db from the raw files under rawData/NGC_6791.
process_cluster('NGC_6791')

In [None]:
# Build sqlite/NGC_188.db from the raw files under rawData/NGC_188.
process_cluster('NGC_188')

In [None]:
# Build sqlite/NGC_2682.db from the raw files under rawData/NGC_2682.
process_cluster('NGC_2682')  # NGC 2682 is also catalogued as M 67

In [None]:
# Build sqlite/NGC_6819.db from the raw files under rawData/NGC_6819.
process_cluster('NGC_6819')

In [None]:
# Build sqlite/NGC_7789.db from the raw files under rawData/NGC_7789.
process_cluster('NGC_7789')

In [None]:
# Build sqlite/NGC_2168.db from the raw files under rawData/NGC_2168.
process_cluster('NGC_2168')  # NGC 2168 is also catalogued as M 35

# Testing access to the data for the backend

In [None]:
import sqlite3
import os

In [None]:
# Directory holding the per-cluster SQLite files: the 'sqlite' folder under
# the current working directory.
data_dir = os.path.join(os.getcwd(), 'sqlite')
# Open the NGC_6819 database. `conn` and `cursor` are module-level objects
# that the test cells below pass to the accessor functions.
conn = sqlite3.connect(os.path.join(data_dir, 'NGC_6819.db'))
cursor = conn.cursor()

In [None]:
def get_available_clusters():
    """List the cluster databases present in ``data_dir``.

    Only regular files whose name ends in ``.db`` are reported, so SQLite
    side files such as ``NGC_6819.db-journal`` or ``NGC_6819.db-wal`` and
    directories are ignored.  Entries are returned in sorted order so the
    result does not depend on the file system's listing order.

    Returns
    -------
    files : list of str
        Full paths of the database files.
    clusters : list of str
        Matching cluster names (the file names without the ``.db`` suffix),
        in the same order as ``files``.
    """
    suffix = '.db'
    files = []
    clusters = []
    for item in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, item)
        if item.endswith(suffix) and os.path.isfile(path):
            files.append(path)
            # strip only the trailing suffix, not every '.db' in the name
            clusters.append(item[:-len(suffix)])

    return files, clusters
# Notebook check: display the database files and cluster names found in data_dir.
get_available_clusters()

In [None]:
def get_available_tables(cursor):
    """Return the names of all tables in the database behind ``cursor``.

    Parameters
    ----------
    cursor : sqlite3.Cursor
        Cursor on an open SQLite connection; it is used to run the query,
        so any pending result set on it is discarded.

    Returns
    -------
    list of str
        Table names as recorded in ``sqlite_master``.
    """
    # sqlite_master is SQLite's schema catalogue; keep only the table entries
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    name_rows = cursor.fetchall()
    return [row[0] for row in name_rows]
# Notebook check: display the tables in the database opened above.
get_available_tables(cursor)

In [None]:
def get_available_columns(cursor, table_name):
    """Return the column names of one table.

    Parameters
    ----------
    cursor : sqlite3.Cursor
        Cursor on an open SQLite connection.
    table_name : str
        Plain (unqualified) name of the table to inspect.

    Returns
    -------
    list of str
        Column names in the order reported by ``PRAGMA table_info``; an
        empty list if the table does not exist.
    """
    # Quote the name as an SQL identifier (doubling embedded quotes) so that
    # names which are not bare identifiers still work and the text cannot
    # escape into the statement.
    quoted_table = '"' + table_name.replace('"', '""') + '"'

    # Execute the PRAGMA to get table information
    cursor.execute(f"PRAGMA table_info({quoted_table})")

    # Each result row describes one column; its name is the second field.
    return [row[1] for row in cursor.fetchall()]

# Notebook check: display the columns of one table. The commented-out calls
# are alternative tables to inspect.
#get_available_columns(cursor, 'stars_summary')
get_available_columns(cursor, 'cluster_posterior')
#get_available_columns(cursor, 'posterior_for_id_2076299826416672896')

In [None]:
def get_column_data(cursor, table_name, column):
    """Return every value of one column of one table.

    Parameters
    ----------
    cursor : sqlite3.Cursor
        Cursor on an open SQLite connection.
    table_name : str
        Plain (unqualified) name of the table to read.
    column : str
        Name of a single column of that table.  It is treated strictly as a
        column name, not as an SQL expression.

    Returns
    -------
    list
        One entry per row, in the order returned by SQLite.

    Raises
    ------
    sqlite3.OperationalError
        If the table or the column does not exist.
    """
    def quote_identifier(name):
        # double any embedded quote so the text stays inside the identifier
        return '"' + name.replace('"', '""') + '"'

    quoted_table = quote_identifier(table_name)
    # Qualify the column with its table: SQLite can fall back to reading an
    # unknown bare "name" as a string literal, which would silently return
    # the name itself for every row instead of failing.
    quoted_column = quoted_table + '.' + quote_identifier(column)

    # select the data from the table
    cursor.execute(f"SELECT {quoted_column} FROM {quoted_table}")

    # each fetched row is a 1-tuple holding the requested value
    return [row[0] for row in cursor.fetchall()]
# Notebook check: display the 'mass' column of one star's posterior table.
get_column_data(cursor, 'posterior_for_id_2076299826416672896', 'mass')