In [1]:
import json
import time
import pandas as pd
import numpy as np
import sys
sys.path.append('./lib')

from EfficientNode import Node
from DBConnection import DBConnection
from ReadJsonConfig import read_json_config
from UsefulFunctions import flatten_list

# Schema description loaded once as a notebook-level global; query() reads it
# directly instead of taking it as an argument.
schema = read_json_config('outputs/schemas_localhost.json')

In [2]:
# Smoke test of Node: declare two values, attach one path to each, then search.
test_root = Node(['world', 'test'])
test_root.add_path("hello", "world")
test_root.add_path("'ello", "test")
# Both registered paths contain 'll'; the recorded output lists both values.
test_root.search('ll')

['test', 'world']

In [3]:
def build_value_string(schema_name: str, table_name: str, column_name: str):
    """Return the dotted identifier ``<schema>.<table>.<column>`` for a column."""
    parts = (schema_name, table_name, column_name)
    return "{}.{}.{}".format(*parts)

# Define Build State Machine

In [4]:
def build_state_machine(
    db: DBConnection,
    schema: dict,
    schema_name: str,
    table_responses: dict,
    filters: dict,
    max_distance: int = 6,
):
    """Build a ``Node`` state machine linking stored values to their columns.

    Every column listed in ``schema[schema_name]`` is registered with the root
    ``Node`` as ``<schema>.<table>.<column>``.  Then, for each fetched table,
    every distinct non-missing, non-empty cell value is added as a path
    (its string form truncated to ``max_distance`` characters) that points at
    the column it came from.

    Args:
        db: Connection object; its ``to_dict(result, column_names)`` output is
            passed straight to ``pd.DataFrame``.
        schema: Mapping ``{schema_name: {table_name: {'columns': {...}}}}``.
        schema_name: Name of the schema to index.
        table_responses: Mapping of table name to a response entry, i.e. a dict
            holding the fetched result under ``'query'``.  Entries that are
            ``None`` or whose ``'query'`` is ``None`` are skipped.
        filters: Filter configuration; a table is skipped when its lower-cased
            name contains any string in ``filters['keywords']``.
        max_distance: Number of leading characters of each value used as the
            path.  Defaults to 6, the value that was previously hard-coded.

    Returns:
        Tuple ``(root, elapsed_time_map)``: the populated ``Node`` and a dict
        mapping ``"<schema>.<table>"`` to the seconds spent inserting that
        table's values.
    """
    values = [
        build_value_string(schema_name, table_name, column_name)
        for table_name, table_data in schema[schema_name].items()
        for column_name in table_data['columns'].keys()
    ]
    root = Node(values)
    elapsed_time_map = {}
    keywords = filters.get('keywords', [])

    print(f"\nBuilding state machine for {schema_name}...")
    for table_name, response in table_responses.items():
        lowered_name = table_name.lower()
        if any(keyword in lowered_name for keyword in keywords):
            print(f"  --  Skipping {schema_name}.{table_name}")
            continue

        # A failed fetch may be recorded as None instead of a dict, so guard
        # before dereferencing the entry.
        query_result = response.get('query') if response is not None else None
        if query_result is None:
            print(f"  --  Skipping {schema_name}.{table_name}")
            continue

        column_names = list(schema[schema_name][table_name]['columns'].keys())
        table_df = pd.DataFrame(db.to_dict(query_result, column_names))
        print(f"{schema_name}.{table_name}")
        start_time = time.time()
        for col_name, col_values in table_df.items():
            # The target identifier is the same for every value of the column.
            target = build_value_string(schema_name, table_name, col_name)
            for col_value in col_values.unique():
                # Skip missing values: None, plus the NaN/NaT/NA markers pandas
                # substitutes for NULLs, which would otherwise be indexed as
                # the literal paths 'nan' / 'NaT'.
                if pd.api.types.is_scalar(col_value) and pd.isna(col_value):
                    continue
                if col_value != "":
                    root.add_path(str(col_value)[:max_distance], target)
        elapsed_time_map[f"{schema_name}.{table_name}"] = time.time() - start_time

    return root, elapsed_time_map

# Define Query Schema Function

In [5]:
def query_schema(db: DBConnection, schema: dict, schema_name: str, filters: dict):
    """Fetch every non-filtered table of ``schema_name`` with ``SELECT *``.

    Args:
        db: Connection object providing ``fetch_query(sql)`` and
            ``rollback_transaction()``.
        schema: Mapping whose ``schema[schema_name]`` keys are the table names.
        schema_name: Schema whose tables are fetched.
        filters: Filter configuration.  Tables listed in ``filters['tables']``
            (exact match) or whose lower-cased name contains any string in
            ``filters['keywords']`` are skipped and do not appear in the result.

    Returns:
        Dict mapping each fetched table name to ``{'query': result, 'time':
        seconds}``.  When the fetch raises, the error is printed, the
        transaction is rolled back, and the entry is ``{'query': None,
        'time': None}`` so consumers can always call ``.get('query')`` on it.
    """
    excluded_tables = filters.get('tables', [])
    keywords = filters.get('keywords', [])
    queries = {}
    print(f"Fetching schema {schema_name}...")
    for table_name in schema[schema_name].keys():
        if table_name in excluded_tables:
            print(f"    --  Skipping {table_name} : filter table match...")
            continue
        # Lower-case before matching so this agrees with the keyword filter in
        # build_state_machine; otherwise mixed-case tables are fetched here
        # only to be discarded there.
        lowered_name = table_name.lower()
        if any(keyword in lowered_name for keyword in keywords):
            print(f"    --  Skipping {table_name} : filter keyword match...")
            continue

        print(f"    >>  {table_name}")
        # Pre-populate the entry so a failed fetch keeps the same dict shape.
        entry = {'query': None, 'time': None}
        queries[table_name] = entry
        try:
            start_time = time.time()
            # NOTE(review): identifiers are interpolated into the SQL text;
            # this assumes the schema config is trusted — confirm.
            entry['query'] = db.fetch_query(f"SELECT * FROM {schema_name}.{table_name}")
            entry['time'] = time.time() - start_time
        except Exception as e:
            # Best effort: report, reset the transaction, and carry on with
            # the remaining tables.
            print(f"Error fetching table {table_name} from schema {schema_name}")
            print(e)
            db.rollback_transaction()
    return queries

# Query Data

In [6]:
def query(schema_name):
    """Fetch a schema's tables and build its state machine.

    Opens a ``DBConnection`` from ``secrets/db_config_localhost.json``, fetches
    the tables of ``schema_name`` via ``query_schema`` and indexes them via
    ``build_state_machine``, using the notebook-level ``schema`` global.  The
    connection is closed even if fetching or building raises.

    Args:
        schema_name: Name of the schema (a key of the global ``schema``).

    Returns:
        Tuple ``(state_machine, elapsed_time, responses)``: the built state
        machine, the per-table build timings, and the raw fetch responses.
    """
    db = DBConnection(read_json_config('secrets/db_config_localhost.json'))

    # NOTE(review): only 'keywords' and 'tables' are read by query_schema and
    # build_state_machine; 'schemas' is not consulted by either of them.
    filters = {
        'keywords': [
            "analytics",
            'market',
            'backup',
            'finance',
            'forecast',
            'customer',
            'call',
            'sales',
        ],
        'tables': [
            "used_make_model",
            "analytics_marketdata",
            "marketing_ss2021_11Desig",
            "date_dimension",
            # "evo_soldunitlabor",
        ],
        'schemas': [
            "nada",
            "analytics",
            "public",
            "inventory",
            "development",
            "service",
            "recipe",
            "bishfix_dev",
            "bishnet_dev",
            "inventory_dev",
        ]
    }

    try:
        responses = query_schema(db, schema, schema_name, filters)
        state_machine, elapsed_time = build_state_machine(db, schema, schema_name, responses, filters)
    finally:
        # Release the connection on both the success and the failure path.
        db.close()
    return state_machine, elapsed_time, responses


In [7]:
def save(schema_name, state_machine, elapsed_time):
    """Persist a state machine and its build timings.

    The state machine is stored through its own ``save`` method under the name
    ``state_machine_<schema_name>``; the timings are written as JSON to
    ``outputs/<schema_name>_elapsed_time.json``.
    """
    print(f"\nSaving outputs for {schema_name}...")
    state_machine.save(f'state_machine_{schema_name}')

    timings_path = f'outputs/{schema_name}_elapsed_time.json'
    with open(timings_path, 'w') as timings_file:
        json.dump(elapsed_time, timings_file)

In [8]:
# NOTE(review): the recorded output of this cell reports "Fetching schema
# bishnet...", so it was not produced with the schema_name set here — re-run
# the cell before relying on the output.
schema_name = 'public'
state_machine, elapsed_time, responses = query(schema_name)

Fetching schema bishnet...
    >>  check_for_approval
    >>  complaint_image
    >>  employee
    >>  appsheets_techtime
    --  Skipping appsheets_techtime_backup : filter keyword match...
    >>  bad_open_service_det_unit_job_cause
    >>  bfr
    >>  bishnet_employee_database
    >>  deal_note
    >>  deleted_rono_assign
    >>  delord
    >>  form_dealer_address
    >>  help_link
    >>  dof_images
    >>  dev_complaint_image
    >>  drop_off_form
    >>  invest_protect
    >>  part_status
    >>  new_job_request
    >>  note
    >>  open_service_det_unit_job_cause
    >>  open_work_order
    >>  part_image
    >>  part_status_changelog
    --  Skipping sales_snapshot : filter keyword match...
    >>  image_test
    >>  porter
    >>  job_request
    >>  porter_assign
    >>  image
    >>  retool_user
    >>  retool_user_staging
    >>  job_correction
    >>  job_task
    >>  rono_assign
    >>  rv_sys_check
    >>  temp_event_master_tech
    >>  service_update
    >>  setup
    >

In [9]:
# Persist the results of the run above.
# NOTE(review): the recorded output says "Saving outputs for bishfix...",
# which does not match schema_name = 'public' — the output is stale.
save(schema_name, state_machine, elapsed_time)


Saving outputs for bishfix...
[('parent', '<u8'), ('key', 'u1'), ('values', '<u8', (14,))]


In [10]:
# Short alias for the state machine used by the lookup cell.
sm = state_machine

In [11]:
# Look up two sample values; the recorded results are lists of
# <schema>.<table>.<column> identifiers returned by the state machine.
for s in ['105167', '106287']:
    print(sm.search(s))

['bishnet.complaint_image.rono', 'bishnet.job_request.rono', 'bishnet.job_task.rono', 'bishnet.open_service_det_unit_job_cause.rono', 'bishnet.part_status.rono', 'bishnet.part_status_changelog.changelogid']
['bishnet.appsheets_techtime.rono', 'bishnet.complaint_image.rono', 'bishnet.job_request.rono', 'bishnet.job_task.rono', 'bishnet.open_service_det_unit_job_cause.rono', 'bishnet.part_status.rono', 'bishnet.part_status_changelog.changelogid']
