# Stage to Graph

This notebook extracts some data from the stage database and populates the graph database with it.

In [None]:
# Allow for the import of packages from the top project folder
import sys
sys.path.append('../..')

In [None]:
import shutil
import kuzu
import duckdb
from studymetricspoc import ProcessConfig

In [None]:
# Load the process configuration and resolve where each database lives.
# NOTE(review): get_path() presumably returns pathlib.Path objects (.parent is
# read here and .mkdir() is later called on KZDB_DIR) — confirm in ProcessConfig.
pc = ProcessConfig()
KZDB_PATH = pc.get_path('GRAPH_PATH')  # location of the graph (Kuzu) database
KZDB_DIR = KZDB_PATH.parent  # directory that gets wiped and recreated below
DDB_PATH = pc.get_path('STAGE_PATH')  # location of the stage (DuckDB) database
DDB_DIR = DDB_PATH.parent  # not referenced by the other cells visible here

## Remove and Clean Everything

In [None]:
# Start from a clean slate: delete the whole directory that holds the graph
# database (errors such as "directory does not exist" are ignored), then
# recreate it empty, including any missing parent directories.
# NOTE(review): this removes everything under KZDB_DIR, not just the graph
# database itself. If the stage database shares that directory
# (KZDB_DIR == DDB_DIR) it would be deleted too — confirm the configuration
# keeps the two in separate directories.
shutil.rmtree(KZDB_DIR, ignore_errors=True)
KZDB_DIR.mkdir(parents=True, exist_ok=True)

## Create Graph Database Schema

In [None]:
# Open the Kuzu database at KZDB_PATH and a connection on it; every graph
# statement in the rest of the notebook is executed through kzcon.
kzdb = kuzu.Database(KZDB_PATH)
kzcon = kuzu.Connection(kzdb)

### DDL

In [None]:
# Fetch the DDL script from the process configuration and run it against the
# graph database one statement at a time. The script is split on ';' and
# fragments that are empty or whitespace-only are skipped.
# NOTE(review): a plain split would also cut a statement at a ';' that appears
# inside a string literal or comment — confirm the DDL never contains one.
ddl_text = pc.get('BUILDER_DDL')
for statement in ddl_text.split(';'):
    if not statement.strip():
        continue
    kzcon.execute(statement)

In [None]:
# Sanity check: materialise the rows returned by show_tables() so the notebook
# displays the tables that now exist in the graph database.
list(kzcon.execute("CALL show_tables() RETURN *"))

In [None]:
# Show the raw DDL script that was executed, for reference.
print(ddl_text)

### SQL

In [None]:
# Open the stage DuckDB database in read-only mode.
ddbcon = duckdb.connect(DDB_PATH, read_only=True)

In [None]:
# Extraction queries from the process configuration.
# The ingest loop iterates this with .items(), treating each key as the name of
# the graph table to load and each value as the SQL query that produces its rows.
sql_queries = pc.get('BUILDER_SQL')

In [None]:
# For every configured (graph table, SQL query) pair: run the query against the
# stage database, bulk-load the resulting rows into the graph table of the same
# name, and print the final status message reported by the COPY.
for table_name, query in sql_queries.items():
    # Pull the source rows out of the stage database as a Polars DataFrame.
    stage_rows = ddbcon.execute(query).pl()

    # The DataFrame is handed over as the $dataframe parameter; the COPY is
    # issued with ignore_errors=true.
    copy_statement = f"COPY {table_name} FROM $dataframe (ignore_errors=true)"
    result = kzcon.execute(copy_statement, {'dataframe': stage_rows})

    # Report the last entry of the 'result' column returned by the COPY.
    copy_report = result.get_as_pl()
    print(f'{table_name}:', copy_report.get_column('result').last())

### Post-Ingest Object Creation

In [None]:
# Post-ingest statements from the process configuration.
# The loop that consumes this treats it as a two-level mapping:
# group name -> {statement name -> statement text}.
graph_statement_groups = pc.get('BUILDER_POSTINGEST')

In [None]:
# Execute the post-ingest statements group by group, in the order the
# configuration yields them, announcing each group and each statement so
# progress is visible in the cell output.
for group_name, group_statements in graph_statement_groups.items():
    print(f'Starting group: {group_name}')
    for statement_name, graph_statement in group_statements.items():
        print(f'> Running statement: {statement_name}')
        # The returned query result is bound but not inspected any further.
        result = kzcon.execute(graph_statement)

In [None]:
# Release all handles: the stage connection first, then the graph connection
# and finally the graph database it was opened on.
# NOTE(review): this cell only runs if it is reached; when an earlier cell
# fails, the handles stay open until it is run manually or the kernel restarts.
ddbcon.close()
kzcon.close()
kzdb.close()