In [1]:
import json
import pandas as pd
import sys
sys.path.append('./lib')

from Node import Node
from DBConnection import DBConnection
from ReadJsonConfig import read_json_config
from UsefulFunctions import flatten_list

In [2]:
# root = Node()
# root.min_search_distance = 1
# root.add_path("hello", "world")
# root.add_path("'ello", "test")

# root.to_small_file("test.txt")

In [6]:
# Load the schema description and count how many tables each schema declares.
schema = read_json_config('outputs/schemas.json')
table_counts = {
    schema_key: len(schema_tables)
    for schema_key, schema_tables in schema.items()
}
num_tables = sum(table_counts.values())
print(f"num tables: {num_tables}")

# Rough estimate of how many Python ints would be needed for one bit per table.
# NOTE(review): sys.getsizeof returns the size of the whole int *object* in bytes
# (object overhead included), not the width of a machine integer, so the
# "bits in int" figure below overstates the usable bits — confirm the intent.
x = 42
num_bytes = sys.getsizeof(x)
bits_in_int = 8 * num_bytes
print(f"bytes in int: {num_bytes}")
print(f"bits in int: {bits_in_int}")
print(f"num ints: {num_tables / bits_in_int}")

# Last expression: show the per-schema table counts.
table_counts

num tables: 640
bytes in int: 28
bits in int: 224
num ints: 2.857142857142857


{'nada': 6,
 'public': 318,
 'development': 0,
 'recipe': 40,
 'bishfix_dev': 0,
 'inventory': 37,
 'bishnet_dev': 0,
 'bishfix': 44,
 'analytics': 130,
 'service': 0,
 'inventory_dev': 0,
 'terminal': 19,
 'bishnet': 46}

# Query Data

In [4]:
# Open a connection using the production DB config, then fetch every table that
# survives the schema / table / keyword filters below into `queries`.
db = DBConnection(read_json_config('secrets/db_config_prod.json'))

# A table is skipped when its name contains any of these substrings.
filter_keywords = [
    "analytics",
    'market',
    'backup',
    'finance',
    'forecast',
    'customer',
    'call',
]
# Tables skipped by exact name.
filter_tables = [
    "used_make_model",
    "analytics_marketdata",
    "marketing_ss2021_11Desig",
    # "evo_soldunitlabor",
]
# Schemas skipped entirely.
filter_schemas = [
    "nada",
    "analytics",
    "public",
    "inventory",
    "development",
    "service",
    "recipe",
    "bishfix_dev",
    "bishnet_dev",
    "inventory_dev",
]

# queries[schema_name][table_name] holds the fetch result, or None when the fetch raised.
queries = {}
# schema_name = "bishfix"
for schema_name, tables in schema.items():
    if schema_name in filter_schemas:
        print(f"Skipping schema {schema_name} : filter schema match...")
        continue
    print(f"Fetching schema {schema_name}...")
    for table_name in tables.keys():
        if table_name in filter_tables:
            print(f"    --  Skipping {table_name} : filter table match...")
            continue
        if any(keyword in table_name for keyword in filter_keywords):
            print(f"    --  Skipping {table_name} : filter keyword match...")
            continue
        print(f"    >>  {table_name}")
        # Create the per-schema dict lazily, only once a table actually passes the filters.
        schema_queries = queries.setdefault(schema_name, {})
        try:
            # NOTE(review): identifiers are interpolated straight into the SQL text.
            # They come from the schema config file rather than user input, but
            # confirm they are trusted / never need quoting.
            schema_queries[table_name] = db.fetch_query(f"SELECT * FROM {schema_name}.{table_name}")
        except Exception as e:
            # Best effort: record the failure as None and roll back so that the
            # remaining tables can still be fetched on the same connection.
            print(f"Error fetching table {table_name} from schema {schema_name}")
            print(e)
            schema_queries[table_name] = None
            db.rollback_transaction()

Skipping schema nada : filter schema match...
Skipping schema public : filter schema match...
Skipping schema development : filter schema match...
Skipping schema recipe : filter schema match...
Skipping schema bishfix_dev : filter schema match...
Skipping schema inventory : filter schema match...
Skipping schema bishnet_dev : filter schema match...
Fetching schema bishfix...
    --  Skipping call_schedule : filter keyword match...
    >>  case_unit
    >>  part_request_detail
    >>  job_time_estimate
    >>  complaint
    >>  part_request
    >>  job_skill
    >>  images
    >>  image
    --  Skipping call_resolution : filter keyword match...
    >>  event_live_transfer
    >>  event_live_transfer_staging
    >>  action_case_manager
    --  Skipping customer_creation : filter keyword match...
    >>  case_manager_icon
    --  Skipping call_live_transfer : filter keyword match...
    --  Skipping filtered_evo_customer_db : filter keyword match...
    >>  case_resolution
    >>  app_ti

# Build State Machine

In [5]:
# Build the search structure: every distinct, non-empty column value of every
# fetched table is added as a path whose payload is "schema.table.column".
root = Node()
root.set_max_distance(50)
root.min_search_distance = 1

print('max_distance: ', root.max_distance)

# Tables whose lower-cased name contains one of these substrings are not indexed.
# Deliberately not called `filter_keywords`, so the list used by the fetch cell
# is not overwritten when this cell runs.
index_skip_keywords = [
    "customer",
    "call_log",
]

# Only the first PATH_PREFIX_LENGTH characters of each value are indexed.
PATH_PREFIX_LENGTH = 20

for schema_name, table in queries.items():
    for table_name, response in table.items():
        lowered_table_name = table_name.lower()
        if any(keyword in lowered_table_name for keyword in index_skip_keywords):
            print(f"  --  Skipping {schema_name}.{table_name}")
            continue
        if response is None:
            # The fetch cell stores None for tables whose query raised; there is
            # nothing to index for those.
            print(f"  --  Skipping {schema_name}.{table_name} : no data fetched")
            continue
        column_names = list(schema[schema_name][table_name]['columns'].keys())
        # Named `records` rather than `dict` so the builtin is not shadowed.
        records = db.to_dict(response, column_names)
        df = pd.DataFrame(records)
        print(f"{schema_name}.{table_name}")
        for col_name, col_values in df.items():
            # The payload is the same for every value of the column: build it once.
            column_path = f"{schema_name}.{table_name}.{col_name}"
            for col_value in col_values.unique():
                if col_value is not None and col_value != "":
                    root.add_path(str(col_value)[:PATH_PREFIX_LENGTH], column_path)


max_distance:  50
bishfix.case_unit
bishfix.part_request_detail
bishfix.job_time_estimate
bishfix.complaint
bishfix.part_request
bishfix.job_skill
bishfix.images
bishfix.image
bishfix.event_live_transfer
bishfix.event_live_transfer_staging
bishfix.action_case_manager
bishfix.case_manager_icon
bishfix.case_resolution
bishfix.app_time_dimension
bishfix.case_note
bishfix.terminal_help
bishfix.action_mastertech
bishfix.action_part_request
bishfix.event_bishfix
bishfix.event_master_tech
bishfix.case_archive
bishfix.event_other
bishfix.filter
bishfix.action_intake
bishfix.terminal_help_staging
bishfix.job
bishfix.active_tag
bishfix.filter_mastertech
bishfix.tag
bishfix.part_status
bishfix.job_archive
bishfix.date_dimension
bishfix.time_dimension
bishfix.case
bishfix.filter_case_manager
bishfix.case_work_log
terminal.technician_time_log_active
terminal.appointment
terminal.image
terminal.clockout_code
terminal.record_requirement
terminal.part_recomendation
terminal.announcement
terminal.compl

KeyboardInterrupt: 

# Save State Machine

In [None]:
# Write the tree built above out under the name "bishfix_smallfile" via Node.to_small_file
# (presumably a compact serialized form — TODO confirm the format in the Node module).
root.to_small_file("bishfix_smallfile")

# Test Search

In [18]:
def search_path(path: str):
    """Return the distinct values found on the nodes that `root` yields for `path`.

    Each node returned by ``root.search_path(path)`` contributes its
    ``get_values()`` result; the results are flattened with ``flatten_list``
    and de-duplicated through a set, so the order of the returned list is
    not defined.
    """
    values_per_node = [found.get_values() for found in root.search_path(path)]
    distinct_values = set(flatten_list(values_per_node))
    return list(distinct_values)

In [19]:
# Look up two sample values and keep the columns each one was found in.
matches = {
    label: search_path(sample_value)
    for label, sample_value in (("a", "105167"), ("b", "106287"))
}
matches

{'a': ['bishnet.open_service_det_unit_job_cause.rono',
  'bishnet.part_status.rono',
  'bishnet.deleted_rono_assign.roheader_id',
  'bishnet.job_task.rono',
  'bishnet.dev_complaint_image.roheader_id',
  'bishnet.service_outfitter_schedule.serviceoutfitterschedule_id',
  'bishnet.job_request.rono',
  'bishnet.dev_complaint_image.warrantytracker_id',
  'terminal.appointment.id',
  'bishnet.sales_snapshot.DealNoCMFDate'],
 'b': ['bishnet.open_service_det_unit_job_cause.rono',
  'bishnet.part_status.rono',
  'bishnet.job_task.rono',
  'bishnet.job_request.rono',
  'bishnet.sales_snapshot.DealNoCMFDate']}

In [20]:
# Columns in which both sample values were found.
intersection = set(matches["a"]) & set(matches["b"])
intersection

{'bishnet.job_request.rono',
 'bishnet.job_task.rono',
 'bishnet.open_service_det_unit_job_cause.rono',
 'bishnet.part_status.rono',
 'bishnet.sales_snapshot.DealNoCMFDate'}

# Find Possible Foreign Matches

In [None]:
# Load one fetched table into a DataFrame; the following cells search its values.
schema_name = "bishfix"
table_name = "job"
response = queries[schema_name][table_name]
column_names = list(schema[schema_name][table_name]['columns'].keys())
# Named `job_records` rather than `dict` so the builtin is not shadowed.
job_records = db.to_dict(response, column_names)
df = pd.DataFrame(job_records)
df.head()

In [None]:
# Ad-hoc inspection: look up the entry stored under the key '6' in the Node module's key_map.
from Node import key_map
key_map['6']

In [None]:
# Debug probe: values held by every node returned for the one-character path 't'.
[found.get_values() for found in root.search_path('t')]

In [None]:
from collections import Counter

# For every value in one column of `df`, search the tree and tally which other
# "schema.table.column" entries contain it.
column_name = "jobid"
own_column = f"{schema_name}.{table_name}.{column_name}"

# Convert to text first and truncate afterwards: slicing the raw value fails for
# non-string values such as ints. None is skipped because it has no text to search.
# NOTE(review): the build cell indexes only the first 20 characters of each value,
# while this search uses up to root.max_distance characters — confirm the two
# limits are meant to differ.
matches = flatten_list([
    node.get_values()
    for value in df[column_name]
    if value is not None
    for node in root.search_path(str(value)[:root.max_distance])
])

# A column trivially matches itself; drop it from the candidates.
matches = [match for match in matches if match != own_column]

counts = Counter(matches)

print(f"Possible foreign matches for {schema_name}.{table_name}.{column_name}:")
for match, count in counts.most_common(20):
    print(f"    ({count}) : {match}")

In [None]:
# Debug probe: per-node values for the sample value "105167" (not flattened or de-duplicated).
[found.get_values() for found in root.search_path("105167")]

In [None]:
# Debug probe: per-node values for the sample value "106287" (not flattened or de-duplicated).
[found.get_values() for found in root.search_path("106287")]