In [1]:
import duckdb

# Location of the DuckDB database file that the rest of the notebook works with.
db_file_path = "clinvar.duckdb"

# Open a connection and close it again right away; no statements are executed.
# NOTE(review): presumably this exists only so the database file is present
# before later cells check for it -- confirm.
db_conn = duckdb.connect(db_file_path)
db_conn.close()

In [2]:
from pathlib import Path
import random

from anyvar.storage.duckdb import DuckdbObjectStore


# Re-bind the path as a pathlib.Path and stop early when the file is missing.
db_file_path = Path(db_file_path)
database_found = db_file_path.exists()
if not database_found:
    raise FileNotFoundError(
        f"Database file {db_file_path} does not exist"
    )

* 'schema_extra' has been renamed to 'json_schema_extra'


In [3]:
import json
import time

# Imports needed by this loading step, kept together at the top of the block.
from anyvar.storage.sql_storage import SqlStorageBatchManager
from anyvar.utils.types import VrsVariation, variation_class_map

# SQLAlchemy-style URL pointing at the DuckDB file chosen earlier in the notebook.
sqlalchemy_url = f"duckdb:///{db_file_path}"

# NOTE(review): absolute, machine-specific input path -- change it before
# running this notebook on another machine.
filename = "/Users/kferrite/dev/clinvar-ingest/buckets/clinvar-gk-pilot/2024-09-17/stage/output-vi.json"

# Progress bookkeeping: a status line is printed every `log_every` stored records.
written_count = 0
last_logged_count = 0
last_logged_time = time.time()
log_every = 10000

# The input is parsed one JSON document per line; read it explicitly as UTF-8
# rather than relying on the platform's default encoding.
with open(filename, encoding="utf-8") as f_in:
    # Build the store *before* entering `try`: if construction fails there is
    # nothing to close, and `finally` must not reference an unbound (or stale,
    # in a reused kernel) `object_store`.
    object_store = DuckdbObjectStore(db_url=sqlalchemy_url, batch_limit=10000)
    try:
        with SqlStorageBatchManager(object_store) as batch_manager:
            for line in f_in:
                # Lines read from a file keep their trailing newline, so a
                # blank line is "\n" (length 1); strip before testing emptiness.
                if not line.strip():
                    continue
                record = json.loads(line)

                # Only records with a non-empty "out" object carrying an "id"
                # are stored; everything else is skipped.
                out = record["out"] if "out" in record else None
                if not out or "id" not in out:
                    continue

                record_id = out["id"]
                record_type = out["type"]
                # Look up the model class by the record's "type" value and
                # build it from the "out" fields.
                vrs_model = variation_class_map[record_type](**out)
                object_store[record_id] = vrs_model
                written_count += 1

                if written_count - last_logged_count >= log_every:
                    elapsed = time.time() - last_logged_time
                    print(f"written_count: {written_count}"
                          f", {written_count - last_logged_count} in {elapsed:.2f} seconds")
                    last_logged_count = written_count
                    last_logged_time = time.time()
    finally:
        # Always release the store, whether the load finished or raised.
        object_store.close()

written_count: 10000, 10000 in 0.25 seconds
written_count: 20000, 10000 in 0.27 seconds
written_count: 30000, 10000 in 0.24 seconds
written_count: 40000, 10000 in 0.25 seconds
written_count: 50000, 10000 in 0.27 seconds
written_count: 60000, 10000 in 0.30 seconds
written_count: 70000, 10000 in 0.14 seconds
written_count: 80000, 10000 in 0.33 seconds
written_count: 90000, 10000 in 0.15 seconds
written_count: 100000, 10000 in 0.37 seconds
written_count: 110000, 10000 in 0.14 seconds
written_count: 120000, 10000 in 0.15 seconds
written_count: 130000, 10000 in 0.42 seconds
written_count: 140000, 10000 in 0.15 seconds
written_count: 150000, 10000 in 0.15 seconds
written_count: 160000, 10000 in 0.49 seconds
written_count: 170000, 10000 in 0.15 seconds
written_count: 180000, 10000 in 0.14 seconds
written_count: 190000, 10000 in 0.14 seconds
written_count: 200000, 10000 in 0.14 seconds
written_count: 210000, 10000 in 0.59 seconds
written_count: 220000, 10000 in 0.14 seconds
written_count: 2300