In [11]:
import polars as pl
from functools import lru_cache

from pathlib import Path

from utils.dnf_converter import to_dnf
from utils.generate_schema import merge_schemas

In [9]:
# Root of the project checkout; the glob patterns and test paths in later cells
# are resolved relative to this directory.
# NOTE(review): this is a machine-specific absolute path — adjust it before
# running the notebook elsewhere.
PROJECTDIR = Path("/home/akash/Main/projects/CACourses")

In [10]:
# functions

def extract_articulations(fp: Path, schema: pl.Schema | None) -> pl.DataFrame:
    """Flatten one articulation JSON file into one row per source course.

    Args:
        fp: Path to the JSON file. The parent directory name is parsed as the
            integer ``uni`` and the part of the file name before the first
            ``"to"`` as the integer ``cc`` (e.g. ``.../129/101to129-prefixes.json``
            gives ``uni=129``, ``cc=101``). Presumably these are university and
            community-college ids — confirm against the data source.
        schema: Schema handed to ``pl.read_json``; ``None`` lets Polars infer it.

    Returns:
        A DataFrame with columns ``course_id``, ``cc``, ``uni`` and
        ``articulation``. ``articulation`` is a struct
        ``{conj: "Or", items: [{conj, items: [{conj, items: [ids]}]}]}``:
        every articulation found for the same course is collected under a
        top-level "Or". Row order is not guaranteed (plain ``group_by``).

    Raises:
        ValueError: If the directory or file name does not start with an integer.
    """
    uni = int(fp.parts[-2])
    cc = int(fp.parts[-1].split('to')[0])

    lf = pl.read_json(source=fp, schema=schema).lazy()

    # Prefix files store a list under "articulations"; explode it so each row
    # holds a single "articulation" struct, the layout the pipeline below expects.
    # Only the file name is inspected, so a directory that happens to contain
    # "prefixes" in its name cannot misclassify a majors file.
    if "prefixes" in fp.name:
        lf = lf.explode("articulations").rename({"articulations": "articulation"})

    return (
        lf
        # 1. Drop articulations that have no sending items at all.
        .filter(
            pl.col("articulation")
            .struct.field("sendingArticulation")
            .struct.field("items")
            .list.len() > 0
        )
        # 2. Pull out the receiving-side ids and the sending-side payload.
        .select(
            # Ids of every course in a series (list) ...
            series_ids=pl.col("articulation").struct.field("series").struct.field("courses")
                     .list.eval(pl.element().struct.field("courseIdentifierParentId")),

            # ... or the id of a single course (scalar).
            root_id=pl.col("articulation").struct.field("course").struct.field("courseIdentifierParentId"),

            # Sending-side course groups.
            sending_items=pl.col("articulation").struct.field("sendingArticulation").struct.field("items"),

            # Conjunction joining the groups; falls back to "Or" when absent.
            global_conj=(
                pl.col("articulation")
                .struct.field("sendingArticulation")
                .struct.field("courseGroupConjunctions")
                .list.first()
                .struct.field("groupConjunction")
                .fill_null("Or")
            )
        )
        .unique()
        # 3. One row per source course id.
        # Coalesce the series list with root_id wrapped in a list so that rows
        # without a series still yield one id when exploded.
        .with_columns(
            source_id_list=pl.coalesce(
                pl.col("series_ids"),
                pl.concat_list(pl.col("root_id"))
            )
        )
        .explode("source_id_list")
        .drop_nulls("source_id_list")
        # 4. Reshape each articulation into nested {conj, items} structs.
        .select(
            cc=pl.lit(cc),
            uni=pl.lit(uni),
            course_id=pl.col("source_id_list"),
            articulation=pl.struct(
                conj=pl.col("global_conj"),
                items=pl.col("sending_items").list.eval(
                    pl.struct(
                        conj=pl.element().struct.field("courseConjunction"),
                        items=pl.element().struct.field("items").list.eval(
                            pl.element().struct.field("courseIdentifierParentId")
                        )
                    )
                )
            )
        )
        # 5. Collect all articulations of the same course and join them with "Or".
        .group_by(
            pl.col("course_id"),
            pl.col("cc"),
            pl.col("uni")
        ).all()
        .select(
            course_id=pl.col("course_id"),
            cc=pl.col("cc"),
            uni=pl.col("uni"),
            articulation=pl.struct(
                conj=pl.lit("Or"),
                items=pl.col("articulation")
            )
        )
        .collect()
    )


@lru_cache(maxsize=128)
def _resolve_supertype(dtype1: pl.DataType, dtype2: pl.DataType) -> pl.DataType:
    """Return the common supertype Polars picks for two dtypes (cached).

    The result is memoised per ``(dtype1, dtype2)`` pair so that repeated
    merges of the same primitive pair (e.g. Int64 with Float64) do not rebuild
    the dummy frames each time.

    Args:
        dtype1: First dtype.
        dtype2: Second dtype.

    Returns:
        The dtype of the column produced by concatenating one-row columns of
        ``dtype1`` and ``dtype2`` with supertype coercion.

    Raises:
        TypeError: If Polars cannot coerce the two dtypes to a common type.
            The original Polars error is attached as ``__cause__``.
    """
    try:
        # pl.concat only accepts how="vertical" for Series inputs, so the
        # relaxed (supertype-coercing) strategy has to be applied to
        # single-column DataFrames instead.
        frames = [
            pl.Series("value", [None], dtype=dtype).to_frame()
            for dtype in (dtype1, dtype2)
        ]
        return pl.concat(frames, how="vertical_relaxed").schema["value"]
    except Exception as exc:
        raise TypeError(
            f"Could not merge incompatible types: {dtype1} and {dtype2}"
        ) from exc


def _merge_dtypes_optimized(dtype1: pl.DataType, dtype2: pl.DataType) -> pl.DataType:
    """Recursively merge two dtypes into one that can represent both.

    Equal dtypes are returned unchanged, a Null dtype yields the other side,
    List dtypes merge their inner dtypes, Struct dtypes merge field by field
    (union of field names, fields of ``dtype1`` first), and every other pair is
    delegated to ``_resolve_supertype``.
    """
    # Identical dtypes need no work.
    if dtype1 == dtype2:
        return dtype1

    # A Null side carries no information; keep the other side.
    if isinstance(dtype1, pl.Null):
        return dtype2
    if isinstance(dtype2, pl.Null):
        return dtype1

    # Two lists: merge the element dtypes.
    both_lists = isinstance(dtype1, pl.List) and isinstance(dtype2, pl.List)
    if both_lists:
        merged_inner = _merge_dtypes_optimized(dtype1.inner, dtype2.inner)
        return pl.List(merged_inner)

    # Anything that is not a struct/struct pair goes to the cached
    # supertype resolution.
    both_structs = isinstance(dtype1, pl.Struct) and isinstance(dtype2, pl.Struct)
    if not both_structs:
        return _resolve_supertype(dtype1, dtype2)

    # Two structs: start from the left-hand fields and fold in the right-hand ones.
    combined = dtype1.to_schema().copy()
    for name, incoming in dtype2.to_schema().items():
        current = combined.get(name)
        if current is None:
            # Field exists only on the right-hand side.
            combined[name] = incoming
        elif current != incoming:
            # Field exists on both sides with different dtypes.
            combined[name] = _merge_dtypes_optimized(current, incoming)

    return pl.Struct(combined)


def merge_schemas(schemas: list[pl.Schema]) -> pl.Schema:
    """Fold a list of schemas into a single schema covering all of them.

    Field order follows first appearance. A field seen with different dtypes is
    resolved through ``_merge_dtypes_optimized``. An empty input yields an
    empty ``pl.Schema``.

    NOTE: defining this function rebinds the name ``merge_schemas`` that the
    notebook's import cell also imports from ``utils.generate_schema``.
    """
    if not schemas:
        return pl.Schema()

    # Work on a plain dict seeded from the first schema; converted back at the end.
    merged = dict(schemas[0])

    for incoming in schemas[1:]:
        for name, dtype in incoming.items():
            known = merged.get(name)
            if known is None:
                # Field not seen before: take it as-is.
                merged[name] = dtype
                continue
            if known != dtype:
                # Same field, different dtype: reconcile the two.
                merged[name] = _merge_dtypes_optimized(known, dtype)

    return pl.Schema(merged)

In [14]:
# Infer a schema from every prefix file (infer_schema_length=None is passed so
# inference is not limited to a fixed number of rows) and merge them into one
# schema, `schema_prefix`, used by later cells.
# NOTE(review): the glob here is "*prefixes.json" while the aggregation cell
# uses "*-prefixes.json" — confirm both are meant to select the same files.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# `schema_prefix` was not assigned in that session.
schema_list_prefix = [
    pl.read_json(
        source=fp,
        infer_schema_length=None
    ).schema 
    for fp 
    in PROJECTDIR.glob("data/*/*prefixes.json")
]
schema_prefix = merge_schemas(schema_list_prefix)
# The per-file schemas are no longer needed once merged.
del schema_list_prefix

KeyboardInterrupt: 

In [12]:
from utils.dnf_converter import to_dnf

# Smoke test on a single prefix file: infer its schema, extract the
# articulations, then convert each articulation struct with to_dnf.
# The path is built from PROJECTDIR (like the majors test cells) instead of
# repeating the absolute project path.
testpath = PROJECTDIR / "data/129/101to129-prefixes.json"

schem = pl.read_json(source=testpath, infer_schema_length=None).schema
test = extract_articulations(fp=testpath, schema=schem)


test.with_columns(
    pl.col("articulation")
    .map_elements(to_dnf, return_dtype=pl.Struct)
)

course_id,cc,uni,articulation
i64,i32,i32,struct[2]
155879,101,129,"{""Or"",[{""And"",[193934]}, {""And"",[195409, 193934]}]}"
193460,101,129,"{""Or"",[{""And"",[288634]}]}"
249779,101,129,"{""Or"",[{""And"",[164337]}]}"
337294,101,129,"{""Or"",[{""And"",[359007, 280593]}, {""And"",[359007]}]}"
336143,101,129,"{""Or"",[{""And"",[339424]}]}"
…,…,…,…
203851,101,129,"{""Or"",[{""And"",[281046]}]}"
178316,101,129,"{""Or"",[{""And"",[249283]}]}"
330060,101,129,"{""Or"",[{""And"",[338279, 337353]}, {""And"",[337353]}]}"
135366,101,129,"{""Or"",[{""And"",[168432, 215730]}, {""And"",[168432]}]}"


In [13]:
# Extract and DNF-convert every "-prefixes" file, concatenate the per-file
# frames lazily, then collect and drop duplicate rows.
# Requires `schema_prefix` from the schema-inference cell; the recorded
# NameError shows it was not defined when this cell was last run.
prefixes_agg = pl.concat([
    extract_articulations(fp, schema_prefix).with_columns(
        pl.col("articulation")
        .map_elements(to_dnf, return_dtype=pl.Struct)
    ).lazy()
    for fp
    in PROJECTDIR.glob("data/*/*-prefixes.json")
]).collect().unique()

prefixes_agg

NameError: name 'schema_prefix' is not defined

In [330]:
import pickle

# Persist the merged prefix schema so it does not have to be re-inferred.
# Requires `schema_prefix` to be defined in the kernel.
# NOTE(review): this writes to "etl-pipeline/schema_prefix.pickle", whereas
# main() in this notebook reads "etl_pipeline/schemas/schema_prefix.pickle" —
# confirm which location is intended.
with (PROJECTDIR/"etl-pipeline/schema_prefix.pickle").open(mode="wb") as fp:
    pickle.dump(obj=schema_prefix, file=fp)

In [16]:
import logging
import pickle
# from utils.generate_schema import merge_schemas


def _load_or_infer_schema(
    cache_fp: Path,
    data_dir: Path,
    pattern: str,
    label: str,
    logger: logging.Logger,
) -> pl.Schema:
    """Load a pickled schema from ``cache_fp``, or infer and cache it.

    Args:
        cache_fp: Pickle file holding a previously merged schema.
        data_dir: Directory searched with ``pattern`` when no cache exists.
        pattern: Glob pattern (relative to ``data_dir``) selecting the JSON files.
        label: Word inserted into the log messages ("prefix" or "major").
        logger: Logger receiving the progress messages.

    Returns:
        The cached schema, or the schema merged from all matching files.
    """
    if cache_fp.exists():
        logger.warning("Loading precomputed schema for %s-based articulations", label)
        # SECURITY: unpickling can execute arbitrary code. Only point this at
        # cache files written by this pipeline itself.
        with cache_fp.open(mode="rb") as fh:
            return pickle.load(file=fh)

    logger.warning("Precomputed %s schema not found, inferring from data", label)
    inferred = [
        pl.read_json(fp, infer_schema_length=None).schema
        for fp in data_dir.glob(pattern)
    ]
    schema = merge_schemas(inferred)

    # The cache directory may not exist yet; create it so the dump below
    # cannot fail with FileNotFoundError after the expensive inference.
    cache_fp.parent.mkdir(parents=True, exist_ok=True)
    with cache_fp.open(mode="wb") as fh:
        pickle.dump(obj=schema, file=fh)
    return schema


def _extract_as_dnf(data_dir: Path, pattern: str, schema: pl.Schema) -> list[pl.DataFrame]:
    """Run extract_articulations + to_dnf on every file matching ``pattern``."""
    return [
        extract_articulations(fp=fp, schema=schema).with_columns(
            pl.col("articulation").map_elements(
                to_dnf, return_dtype=pl.List(pl.List(pl.Int64))
            )
        )
        for fp in data_dir.glob(pattern)
    ]


def main(data_dir: Path = Path("/home/akash/Main/projects/CACourses/data")) -> pl.DataFrame:
    """Build the combined articulation table for all prefix and major files.

    Schemas are read from ``<data_dir>/../etl_pipeline/schemas`` when cached
    there and are otherwise inferred from the data and written to that
    directory. Progress is logged at WARNING level, so the messages appear
    without any logging configuration.

    Args:
        data_dir: Directory containing the ``<uni>/<cc>to<uni>-*.json`` files.
            Defaults to the path this notebook was developed against.

    Returns:
        All prefix-based rows followed by all major-based rows, with the
        ``articulation`` column converted by ``to_dnf``.
    """
    logger = logging.getLogger(__name__)

    # file paths
    schema_dir = data_dir / "../etl_pipeline/schemas"

    schema_prefix = _load_or_infer_schema(
        schema_dir / "schema_prefix.pickle", data_dir, "*/*prefixes.json", "prefix", logger
    )
    schema_major = _load_or_infer_schema(
        schema_dir / "schema_major.pickle", data_dir, "*/*majors.json", "major", logger
    )

    # extract articulations
    logger.warning("Extracting articulations")
    prefixes_agg = _extract_as_dnf(data_dir, "*/*prefixes.json", schema_prefix)
    majors_agg = _extract_as_dnf(data_dir, "*/*majors.json", schema_major)
    return pl.concat(prefixes_agg + majors_agg)

In [17]:
# Run the full pipeline; `df` holds the concatenated prefix and major rows
# returned by main().
df = main()

Loading precomputed schema for prefix-based articulations
Loading precomputed schema for major-based articulations
Extracting articulations


In [None]:
# Report the estimated in-memory size of the combined frame after shrinking
# its buffers.
df.shrink_to_fit().estimated_size()

26220743

In [54]:
# Number of inner lists in each row's `articulation`; not referenced again in
# the visible cells.
upper_bound = df.get_column("articulation").list.len()

# NOTE(review): the result of this expression is neither assigned nor the
# cell's last expression, so it has no effect on `df` and is not displayed.
df.with_columns(
    pl.col("articulation")
    .list.to_struct()
    .struct.json_encode()
)

# NOTE(review): same here — the sorted frame is discarded.
df.sort(
    by=pl.col("articulation").list.len()
)

# Print every row whose course_id is null. `col` holds a full row tuple
# (course_id, cc, uni, articulation), not a column.
for col in df.filter(
    pl.col("course_id").is_null()
).iter_rows():
    print(col)

  .list.to_struct()


(None, 28, 117, [[333983], [166008], [193158], [357756], [157582], [370080], [370080, 370081], [370079], [166008], [193158], [357756], [157582], [370080], [370081], [370079], [370080], [370081], [370079], [166008], [193158], [357756], [157582], [166008], [193158], [357756], [157582], [370081, 370080], [370079], [370080], [289292], [273226], [353421], [333983], [283036], [308215], [338639], [157582], [333983], [163667], [168424], [166008], [193158], [357756], [157582], [370080], [370080, 370081], [370079], [157582], [333983], [353421], [353421], [193158], [166008], [166008], [193158], [357756], [157582], [370081, 370080], [370079], [370080], [166008], [193158], [357756], [157582], [370080], [370080, 370081], [370079], [193158], [166008], [166008], [193158], [357756], [157582], [370081, 370080], [370079], [370080], [166008], [193158], [357756], [157582], [224453], [157545], [209128], [193047], [167890], [193395], [368890], [163667], [166008], [193158], [357756], [157582], [370081, 370080

In [110]:
# Single majors file used for debugging. `testurl` is kept for reference only;
# nothing in the visible cells reads it.
testurl = "https://assist.org/api/articulation/Agreements?Key=75/137/to/120/AllMajors"
testpath = PROJECTDIR/"data/120/137to120-majors.json"

# Infer the schema from this one file and extract its articulations.
schem = pl.read_json(source=testpath, infer_schema_length=None).schema
testdf = extract_articulations(testpath, schem)

In [139]:
# Largest number of top-level articulation items attached to a single course
# in the test file.
testdf.get_column("articulation").struct.field("items").list.len().max()

22

In [138]:
# Pick one articulation with more than 10 top-level items: after the ascending
# sort on `len`, row 0 of the filtered frame is the smallest such case.
testitem = testdf.with_columns(
    len=pl.col("articulation")
    .struct.field("items")
    .list.len()
).sort(pl.col("len")).filter(
    pl.col("len") > 10
).item(row=0, column="articulation")

In [178]:
# Re-run the extraction on the majors test file (uses `schem` from an earlier cell).
testpath = PROJECTDIR/"data/120/137to120-majors.json"
# NOTE(review): `lf` (with the column renamed to the plural "articulations")
# is not used by any visible cell.
lf = pl.read_json(source=testpath, schema=schem).rename({"articulation": "articulations"})
# NOTE(review): these values are the reverse of what extract_articulations
# derives from this path (cc=137 from the file name, uni=120 from the directory),
# and they are not read by it.
cc, uni = 120, 137

test = extract_articulations(testpath, schem)

In [189]:
# Order the extracted rows by how many top-level articulation items each
# course has (ascending).
test_sorted = test.sort(
    pl.col("articulation")
    .struct.field("items")
    .list.len()
)

In [198]:
# Re-import under an alias and apply the converter to every articulation.
from utils.dnf_converter import to_dnf as TO_DNF

# NOTE(review): no return_dtype is passed here, unlike the other map_elements
# calls in this notebook. The recorded output shows inner `items` values of
# "conj"/"items" instead of id lists — possibly a dtype-inference problem;
# confirm and pass an explicit return_dtype.
test_sorted.with_columns(
    pl.col("articulation")
    .map_elements(TO_DNF)
)

  test_sorted.with_columns(


course_id,cc,uni,articulation
i64,i32,i32,struct[2]
130113,137,120,"{""Or"",[{""And"",""conj""}, {""And"",""items""}]}"
289487,137,120,"{""Or"",[{""And"",""conj""}, {""And"",""items""}]}"
355122,137,120,"{""Or"",[{""And"",""conj""}, {""And"",""items""}]}"
264367,137,120,"{""Or"",[{""And"",""conj""}, {""And"",""items""}]}"
294655,137,120,"{""Or"",[{""And"",""conj""}, {""And"",""items""}]}"
…,…,…,…
289431,137,120,"{""Or"",[{""And"",""conj""}, {""And"",""items""}, … {""And"",""items""}]}"
258210,137,120,"{""Or"",[{""And"",""conj""}, {""And"",""items""}, … {""And"",""items""}]}"
289454,137,120,"{""Or"",[{""And"",""conj""}, {""And"",""items""}, … {""And"",""items""}]}"
300673,137,120,"{""Or"",[{""And"",""conj""}, {""And"",""items""}, … {""And"",""items""}]}"


{'conj': 'Or',
 'items': [{'conj': 'And', 'items': 'conj'},
  {'conj': 'And', 'items': 'items'},
  {'conj': 'And', 'items': 'conj'},
  {'conj': 'And', 'items': 'items'},
  {'conj': 'And', 'items': 'conj'},
  {'conj': 'And', 'items': 'items'},
  {'conj': 'And', 'items': 'conj'},
  {'conj': 'And', 'items': 'items'},
  {'conj': 'And', 'items': 'conj'},
  {'conj': 'And', 'items': 'items'},
  {'conj': 'And', 'items': 'conj'},
  {'conj': 'And', 'items': 'items'}]}