In [1]:
# Imports and path setup; the data itself is read from parquet files in the next cell
import pandas as pd
import sqlite3
import os
import sys

# adds parent directory to sys.path
from pathlib import Path
import json
# Make the parent of the current working directory importable so that the
# project's `src.*` packages resolve. The membership check keeps re-running
# this cell from appending the same entry to sys.path again.
project_root = Path(os.getcwd()).resolve().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Directory containing the parquet files read by this notebook. Set the
# SEARCHGOV_DATA_DIR environment variable to point somewhere else; the original
# hard-coded location is kept as the fallback so existing setups keep working.
data_dir = os.environ.get('SEARCHGOV_DATA_DIR', '/Users/syamil/Projects/searchgov/data')


In [2]:
# Read the tenure and orgs parquet files found under data_dir.
tenure_path = os.path.join(data_dir, 'tenure.parquet')
orgs_path = os.path.join(data_dir, 'orgs.parquet')

tenure_df = pd.read_parquet(tenure_path)
orgs_df = pd.read_parquet(orgs_path)

In [3]:
from src.app.temporal_graph import TemporalGraph

In [4]:
# Connection settings are taken from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD) when they are set, so credentials do
# not have to be written into the notebook. The previous local-development
# values remain as defaults.
age_graph = TemporalGraph(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "searchgov"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
)

[32m2025-06-15 17:39:30.717[0m | [1mINFO    [0m | [36msrc.database.postgres.connection[0m:[36mconnect[0m:[36m34[0m - [1mConnected to PostgreSQL successfully[0m


In [5]:
from src.database.postgres.schema import SchemaManager
from src.database.postgres.connection import DatabaseConnection

# Connection settings are taken from the standard libpq environment variables
# (PGHOST, PGDATABASE, PGUSER, PGPASSWORD) when they are set, falling back to
# the previous local-development values.
schema_db = DatabaseConnection(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "searchgov"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "postgres"),
)
schema_manager = SchemaManager(schema_db)

# WARNING: destructive. The recorded log output of this cell shows
# reset_schema() dropping the existing materialized view and tables before
# setting the schema up again, so any data already in the database is lost.
schema_manager.reset_schema()

[32m2025-06-15 17:39:35.707[0m | [1mINFO    [0m | [36msrc.database.postgres.connection[0m:[36mconnect[0m:[36m34[0m - [1mConnected to PostgreSQL successfully[0m
[32m2025-06-15 17:39:35.721[0m | [34m[1mDEBUG   [0m | [36msrc.database.postgres.schema[0m:[36m_drop_tables[0m:[36m44[0m - [34m[1mDropped materialized view: colleague_pairs[0m


[32m2025-06-15 17:39:35.734[0m | [34m[1mDEBUG   [0m | [36msrc.database.postgres.schema[0m:[36m_drop_tables[0m:[36m50[0m - [34m[1mDropped table: employment[0m
[32m2025-06-15 17:39:35.738[0m | [34m[1mDEBUG   [0m | [36msrc.database.postgres.schema[0m:[36m_drop_tables[0m:[36m50[0m - [34m[1mDropped table: people[0m
[32m2025-06-15 17:39:35.819[0m | [34m[1mDEBUG   [0m | [36msrc.database.postgres.schema[0m:[36m_drop_tables[0m:[36m50[0m - [34m[1mDropped table: organizations[0m
[32m2025-06-15 17:39:35.824[0m | [1mINFO    [0m | [36msrc.database.postgres.schema[0m:[36m_drop_tables[0m:[36m52[0m - [1mAll tables and materialized views dropped[0m
[32m2025-06-15 17:39:35.833[0m | [34m[1mDEBUG   [0m | [36msrc.database.postgres.schema[0m:[36m_create_extensions[0m:[36m67[0m - [34m[1mCreated extension: CREATE EXTENSION IF NOT EXISTS btree_gist;[0m
[32m2025-06-15 17:39:35.834[0m | [34m[1mDEBUG   [0m | [36msrc.database.postgres.schema

## Pre-seed orgs for efficiency

In [6]:
import numpy as np


def _to_plain_python(value):
    """Return ``value`` converted to plain Python objects where applicable.

    Conversion rules:
      * ``pd.Timestamp``            -> ISO-format string (``isoformat()``)
      * ``pd.Series``/``np.ndarray`` -> list (``tolist()``)
      * ``list``/``tuple``          -> a new ``list``
      * anything else               -> returned unchanged
    """
    # NOTE(review): missing datetimes (pd.NaT) do not appear to match the
    # pd.Timestamp check and would pass through unchanged — confirm that
    # preseed_orgs tolerates them.
    if isinstance(value, pd.Timestamp):
        return value.isoformat()
    if isinstance(value, (pd.Series, np.ndarray)):
        return value.tolist()
    if isinstance(value, (list, tuple)):
        return list(value)
    return value


# One dict per orgs_df row, with every value passed through the converter
# above. New dicts are built rather than mutating the rows while iterating.
orgs_records = [
    {key: _to_plain_python(value) for key, value in record.items()}
    for record in orgs_df.to_dict(orient='records')
]

# Last expression of the cell, so the notebook shows what preseed_orgs returns.
age_graph.preseed_orgs(orgs_records)



{'created': 23361, 'updated': 1772, 'failed': 0}

In [7]:
import html 

def _as_date(value):
    """Normalise a start/end date value.

    * ``None`` and ``pd.NaT`` (a missing date) become ``None``; without this
      check ``pd.NaT`` would satisfy ``hasattr(value, 'date')`` and stay NaT.
    * Values exposing a ``date()`` method (e.g. ``pd.Timestamp``,
      ``datetime.datetime``) are reduced to what that method returns.
    * Everything else (e.g. ``datetime.date``, strings) is returned unchanged.
    """
    if value is None or value is pd.NaT:
        return None
    if hasattr(value, 'date'):
        return value.date()
    return value


def transform_record(record):
    """Build the dict passed to the bulk insert from one tenure row.

    Args:
        record: Mapping for a single row. ``clean_name``, ``org``, ``rank``,
            ``start_date``, ``end_date`` and ``tenure_days`` are required;
            all other keys are optional.

    Returns:
        A new dict with a fixed set of keys. Dates are normalised with
        ``_as_date``; optional keys that are absent become ``None``, except
        ``type`` which defaults to ``'person'``.

    Raises:
        KeyError: If one of the required keys is missing from ``record``.
    """
    return {
        'clean_name': record['clean_name'],
        'org': record['org'],
        'rank': record['rank'],
        'start_date': _as_date(record['start_date']),
        'end_date': _as_date(record['end_date']),
        'embedding': record.get('embedding'),
        'tenure_days': record['tenure_days'],
        'tel': record.get('tel'),
        'email': record.get('email'),
        'url': record.get('url'),
        'raw_name': record.get('raw_name'),
        'lower_name': record.get('lower_name'),
        'type': record.get('type', 'person'),
        'parent_org_name': record.get('parent_org_name'),
        'parent_org_url': record.get('parent_org_url'),
        'sgdi_entity_type': record.get('sgdi_entity_type'),
    }

# Turn each tenure_df row into a dict, then pass every row through
# transform_record to get the list handed to the bulk insert.
your_data = tenure_df.to_dict(orient='records')
transformed_data = list(map(transform_record, your_data))

In [8]:
# Hand the transformed rows to the graph's bulk insert, passing batch_size=1000.
result = age_graph.bulk_insert_records(
    transformed_data,
    batch_size=1000,
)

[32m2025-06-15 17:39:51.873[0m | [1mINFO    [0m | [36msrc.services.employment[0m:[36mbulk_insert_records[0m:[36m48[0m - [1mGrouping 136487 records by name...[0m
[32m2025-06-15 17:39:51.968[0m | [1mINFO    [0m | [36msrc.services.employment[0m:[36mbulk_insert_records[0m:[36m52[0m - [1mGrouped into 61028 unique names.[0m
[32m2025-06-15 17:42:04.015[0m | [1mINFO    [0m | [36msrc.services.employment[0m:[36mbulk_insert_records[0m:[36m85[0m - [1mBulk insert process finished. Refreshing materialized views...[0m
[32m2025-06-15 17:42:12.212[0m | [1mINFO    [0m | [36msrc.database.postgres.schema[0m:[36mrefresh_materialized_views[0m:[36m443[0m - [1mMaterialized views refreshed[0m
[32m2025-06-15 17:42:12.212[0m | [1mINFO    [0m | [36msrc.services.employment[0m:[36mbulk_insert_records[0m:[36m90[0m - [1mMaterialized views refreshed successfully.[0m


[{'colleague_name': 'Andrea Liang',
  'organization': 'POLICY AND PLANNING  : PLANNING & RESEARCH',
  'colleague_rank': 'Senior Assistant Director',
  'colleague_start_date': datetime.date(2021, 7, 22),
  'colleague_end_date': datetime.date(2022, 4, 10),
  'person_start_date': datetime.date(2021, 7, 22),
  'person_end_date': datetime.date(2022, 4, 10),
  'overlap_start_date': datetime.date(2021, 7, 22),
  'overlap_end_date': datetime.date(2022, 4, 10),
  'overlap_days': 263},
 {'colleague_name': 'Haikal Yeo',
  'organization': 'POLICY AND PLANNING  : PLANNING & RESEARCH',
  'colleague_rank': 'Senior Analyst',
  'colleague_start_date': datetime.date(2021, 7, 22),
  'colleague_end_date': datetime.date(2022, 4, 10),
  'person_start_date': datetime.date(2021, 7, 22),
  'person_end_date': datetime.date(2022, 4, 10),
  'overlap_start_date': datetime.date(2021, 7, 22),
  'overlap_end_date': datetime.date(2022, 4, 10),
  'overlap_days': 263},
 {'colleague_name': 'He Weixuan',
  'organization':

In [None]:
# Display tenure_df as the cell output for a visual check of the loaded data.
tenure_df