In [6]:
import orjson
import json
import os
import pandas as pd
import polars as pl

from collections import defaultdict
from dotenv import load_dotenv
from sqlalchemy import create_engine, text

In [None]:
def get_query(cc_id: int, uni_id: int) -> list[dict]:
    """
    fetch local copy of agreement
    
    - cc_id: internal ASSIST id for california community college
    - uni_id: internal ASSIST id for california public university

    returns:
    - python representation of JSON file (list of dicts)
    """
    with open(f"./data/{uni_id}/{cc_id}to{uni_id}.json", "r") as fp:
        out = orjson.loads(fp.read())
    return out


def extract_articulations(cc: int, uni: int) -> pl.DataFrame:
    """
    Performs a series of polars dataframe transformations on query to
    assemble dataframe of articulations mapping uni courses to json relations
    of cc courses.

    Initially reads in df as pandas dataframe due to looser data structuring
    requirements, casts to polars dataframe, then applies series of vectorized
    transformations.

    - cc_id: internal ASSIST id for california community college
    - uni_id: internal ASSIST id for california public university

    returns:
    - polars dataframe with (uni course id: int, cc relation: json string) relation
    """

    # extract data from query & construct polars dataframe
    articulations: pl.DataFrame = pl.from_pandas(pd.DataFrame(get_query(cc, uni)))
    
    colname = "articulation"
    if "articulations" in articulations.columns:
        articulations = articulations.explode("articulations")
        colname = "articulations"
    articulations_struct = articulations.get_column(colname).struct

    articulations = pl.concat((
        articulations_struct.field("course").struct.field("courseIdentifierParentId").rename("course_id").to_frame(),
        articulations_struct.field("series").to_frame(),
        articulations_struct.field("sendingArticulation").to_frame()
    ), how="horizontal")
    
    articulations = articulations.with_columns(
        pl.lit(cc).alias("cc"),
        pl.lit(uni).alias("uni")
    )
    
    
    # transform into mapping of course id : relationship to course ids that articulate to it
    articulations = (
        articulations
        # extract courses from uni series objects and treat them as individual courses
        # premise: A and B articulates to C and D => A and B articulates to C and A and B articulates D
        .with_columns(pl.col("series").struct.field("courses"))
        .explode("courses")
        # extract sendingArticulation field (contains A and B) and id of C/D courses
        .with_columns(
            course_id=pl.coalesce(
                "course_id",
                pl.col("courses").struct.field("courseIdentifierParentId")
            ),
            items=(
                pl.when(pl.col("sendingArticulation").struct.field("items").list.len() > 0)
                .then(pl.col("sendingArticulation").struct.field("items"))
                .otherwise(None)
            )
        )
        # transform sendingArticulation to only keep course ids from whole course structs for C/D
        .with_columns(
            pl.col("items").list.eval(
                pl.struct([
                    pl.element().struct.field("courseConjunction").alias("conj"),
                    pl.element().struct.field("items").list.eval(
                        pl.element().struct.field("courseIdentifierParentId")
                    ).alias("items")
                ])#.struct.json_encode()  # uncomment to convert struct to json string
            )
        )
        .drop_nulls("items")
        # ensure AND groupings are grouped together by the proper group conjunction
        # with OR as the default if articulation exists else null
        .with_columns(
            pl.when(pl.col("items").is_not_null())
            .then(
                pl.struct([
                    pl.when(
                        pl.col("sendingArticulation")
                        .struct.field("courseGroupConjunctions")
                        .list.len() > 0
                    )
                    .then(
                        pl.col("sendingArticulation")
                        .struct.field("courseGroupConjunctions")
                        .list.first()
                        .struct.field("groupConjunction")
                    )
                    .otherwise(pl.lit("Or"))
                    .alias("groupConj"),            
                    pl.col("items")
                ])
            )
            .otherwise(None)
        )
        .drop_nulls()
        .unique()
        # drop intermediary columns
        .drop(["series", "courses", "sendingArticulation", "items"])
    ).rename({"groupConj": "articulation"})
    
    return articulations


def aggregate_agreements(errored: defaultdict) -> pl.DataFrame:
    with open("./data/institutions_cc.json", "r") as cc_fp, open("./data/institutions_state.json", "r") as uni_fp:
        cc_ids = orjson.loads(cc_fp.read()).keys()
        uni_ids = orjson.loads(uni_fp.read()).keys()
    
    articulations = []
    for uni in uni_ids:
        for cc in cc_ids:
            if not os.path.exists(f"./data/{uni}/{cc}to{uni}.json"):
                continue
            try:
                articulations.append(extract_articulations(int(cc), int(uni)))
            except Exception:
                errored[uni].append(cc)
        print(f"Extracted articulations for {uni=}")
    with open("data/known_errors_agreements.json", "w") as fp:
        json.dump(errored, fp, indent=2)
    return pl.concat(articulations)


def process_agreements(agreements: pl.DataFrame) -> pl.DataFrame:
    filtered = agreements.drop_nulls().unique()
    return (
        filtered
        .with_columns(gc=pl.col("articulation").struct.field("groupConj"))  # get group conjunction
        .group_by(['course_id', 'cc', 'uni'])
        .all()  # collapse multiple articulations per agreement into 1 entry
        .with_columns(articulation=pl.struct(superConj=pl.lit("Or"), items=pl.col("articulation")))
        # recreate articulation entry
        .drop("gc")
    )

In [None]:
errdict = defaultdict(list)
aggregate_agreements(errored=errdict)


Extracted articulations for uni='1'
Extracted articulations for uni='7'
Extracted articulations for uni='11'
Extracted articulations for uni='12'
Extracted articulations for uni='21'
Extracted articulations for uni='23'
Extracted articulations for uni='24'
Extracted articulations for uni='26'
Extracted articulations for uni='29'
Extracted articulations for uni='39'
Extracted articulations for uni='42'
Extracted articulations for uni='46'
Extracted articulations for uni='50'
Extracted articulations for uni='60'


In [18]:
try:
    extract_articulations(cc=6, uni=7)
except pl.exceptions.StructFieldNotFoundError as err:
    print(err.with_traceback())

TypeError: BaseException.with_traceback() takes exactly one argument (0 given)