In [None]:
import logging
import os
import sys
from pathlib import Path
from urllib.parse import quote

import polars as pl
from dotenv import load_dotenv

# hack for importing modules
sys.path.append(os.path.abspath(".."))

from utils import (
    extract_articulations_lazy,
    load_full_schema,
    to_dnf,
    timer
)

In [None]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("agreements_to_db")

# Project layout.
# NOTE(review): this absolute path is machine-specific; adjust it when running
# the notebook anywhere else.
PROJECTDIR = Path("/home/akash/Main/projects/CACourses")
DATA_DIR = PROJECTDIR/"data"
ETL_DIR = PROJECTDIR/"etl_pipeline"
schema_prefix_fp = ETL_DIR / "schemas/schema_prefix.pickle"
schema_major_fp = ETL_DIR / "schemas/schema_major.pickle"

# PostgreSQL connection settings are read from the environment after loading
# the .env file that sits in the ETL directory.
load_dotenv(dotenv_path=ETL_DIR/".env")
psql_user =   os.getenv("POSTGRES_USER")
psql_pwd =    os.getenv("POSTGRES_PWD")
psql_host =   os.getenv("POSTGRES_HOSTNAME")
psql_port =   os.getenv("POSTGRES_PORT")
psql_dbname = os.getenv("POSTGRES_DBNAME")

# Fail here, with the variable names, instead of silently formatting the
# literal text "None" into the URL and failing later at connection time.
_missing_env = [
    env_name
    for env_name, env_value in (
        ("POSTGRES_USER", psql_user),
        ("POSTGRES_PWD", psql_pwd),
        ("POSTGRES_HOSTNAME", psql_host),
        ("POSTGRES_PORT", psql_port),
        ("POSTGRES_DBNAME", psql_dbname),
    )
    if env_value is None
]
if _missing_env:
    raise RuntimeError(
        "Missing PostgreSQL settings (checked the environment and "
        f"{ETL_DIR/'.env'}): {', '.join(_missing_env)}"
    )

# Percent-encode the credentials so reserved characters such as '@', ':' or
# '/' in the user name or password cannot break the connection URI.
psql_url = (
    f"postgresql://{quote(psql_user, safe='')}:{quote(psql_pwd, safe='')}"
    f"@{psql_host}:{psql_port}/{psql_dbname}"
)

In [None]:
# Load the schema for prefix-based data and the schema for major-based data.
# Both go through the same loader; only the schema file path and the glob that
# selects the JSON inputs differ. Prefix schema is loaded first, then major.
schema_prefix, schema_major = (
    load_full_schema(
        schema_fp=schema_file,
        data_dir=DATA_DIR,
        data_glob=json_glob,
        logger=logger,
    )
    for schema_file, json_glob in (
        (schema_prefix_fp, "*/*prefixes.json"),
        (schema_major_fp, "*/*majors.json"),
    )
)

def _lazy_articulations(data_glob: str, schema) -> pl.LazyFrame:
    """Build one lazy frame of articulations from every file matching a glob.

    Each file under ``DATA_DIR`` that matches ``data_glob`` is passed to
    ``extract_articulations_lazy`` with ``schema``; the per-file frames are
    concatenated and the ``articulation`` column is rewritten by applying
    ``to_dnf`` to each element (result typed as ``pl.String``).

    Args:
        data_glob: Glob pattern, relative to ``DATA_DIR``, selecting the inputs.
        schema: Schema object forwarded unchanged to ``extract_articulations_lazy``.

    Returns:
        The concatenated lazy frame; nothing is collected here.

    Raises:
        FileNotFoundError: If no file matches ``data_glob``.
    """
    # Sorted so the concatenation order does not depend on directory listing order.
    files = sorted(DATA_DIR.glob(data_glob))
    if not files:
        # pl.concat would otherwise fail with an opaque "empty list" error.
        raise FileNotFoundError(f"No files matching {data_glob!r} under {DATA_DIR}")

    return (
        pl.concat([extract_articulations_lazy(fp=fp, schema=schema) for fp in files])
        .with_columns(pl.col("articulation").map_elements(to_dnf, return_dtype=pl.String))
    )


# Only query plans are built inside this timer; the data is read on collect().
with timer(label="LF Extraction", logger=logger, level=logging.INFO):
    prefixes_lazy = _lazy_articulations("*/*prefixes.json", schema_prefix)
    majors_lazy = _lazy_articulations("*/*majors.json", schema_major)

# Collect articulations: combine the prefix and major lazy frames, drop rows
# that are exact duplicates, and materialize the result.
with timer(label="LF Collection", logger=logger, level=logging.INFO):
    articulations = pl.concat((prefixes_lazy, majors_lazy)).unique().collect()
    # %-style logging arguments produce the same message as before, while
    # avoiding double quotes nested inside a double-quoted f-string (a
    # SyntaxError on Python versions before 3.12).
    logger.info(
        " articulations DF estimated size: %.2f megabytes, %d rows",
        articulations.estimated_size("mb"),
        len(articulations),
    )

    # The lazy plans are no longer needed once the data is materialized.
    del prefixes_lazy, majors_lazy

# Port database function from SQLAlchemy to ADBC

In [None]:
from adbc_driver_postgresql import dbapi

def write_articulations_to_psql(agreements: pl.DataFrame, db_url: str) -> None:
    """Replace the ``articulations`` table in PostgreSQL with ``agreements``.

    The frame is cast to the table's integer widths and checked against the
    table's NOT NULL and PRIMARY KEY constraints *before* the existing table is
    dropped, so invalid input leaves the current table untouched. The table is
    then dropped, recreated, and filled through polars' ADBC engine.

    Args:
        agreements: Frame with ``course_id``, ``cc``, ``uni`` and
            ``articulation`` columns.
        db_url: PostgreSQL connection URI, used both for the DDL connection and
            for the polars write.

    Raises:
        ValueError: If a required column is missing, a required column holds
            nulls, or ``(course_id, cc, uni)`` is not unique.

    NOTE(review): the DDL and the bulk write use separate connections, so a
    failure during the write still leaves an empty, freshly created table.
    """
    tablename = "articulations"
    key_cols = ["course_id", "cc", "uni"]
    required_cols = [*key_cols, "articulation"]

    absent = [name for name in required_cols if name not in agreements.columns]
    if absent:
        raise ValueError(f"agreements is missing required columns: {absent}")

    # Match the INT4 / INT2 column types declared in CREATE TABLE below.
    agreements = agreements.cast({
        "course_id": pl.Int32,
        "cc": pl.Int16,
        "uni": pl.Int16
    })

    # Every table column is NOT NULL.
    null_counts = agreements.select(pl.col(required_cols).null_count()).row(0)
    with_nulls = [name for name, count in zip(required_cols, null_counts) if count]
    if with_nulls:
        raise ValueError(f"agreements has null values in NOT NULL columns: {with_nulls}")

    # The table's primary key is (course_id, cc, uni).
    if agreements.n_unique(subset=key_cols) != agreements.height:
        raise ValueError(f"agreements has duplicate primary keys on {key_cols}")

    with dbapi.connect(db_url) as conn:
        with conn.cursor() as cur:
            
            cur.execute(f"DROP TABLE IF EXISTS {tablename};")
            cur.execute(f"""
                CREATE TABLE {tablename} (
                    course_id INT4 NOT NULL,
                    cc INT2 NOT NULL,
                    uni INT2 NOT NULL,
                    articulation TEXT NOT NULL,
                    PRIMARY KEY (course_id, cc, uni)
                );
            """)
        # A single commit covers both the DROP and the CREATE.
        conn.commit()
    
    # "append" because the empty table was just created with the exact schema.
    agreements.write_database(
        table_name=tablename,
        connection=db_url,
        if_table_exists="append",
        engine="adbc"
    )

# Show the connection target with the password masked: printing psql_url
# itself would store the database password in the saved notebook output.
print(f"postgresql://{psql_user}:***@{psql_host}:{psql_port}/{psql_dbname}")
write_articulations_to_psql(
    agreements=articulations,
    db_url=psql_url
)

