In [1]:
# Import hack: put the parent of the current working directory on the module
# search path so modules that live one level up can be imported from here.
import os
import sys

sys.path.append(os.path.abspath(os.pardir))

In [None]:
import polars as pl
from pathlib import Path

from utils import load_full_schema

In [3]:
# Root of the project checkout. The default is the original author's local
# path; set the CACOURSES_PROJECT_DIR environment variable to run the notebook
# from a different machine or checkout without editing this cell.
PROJECTDIR = Path(os.environ.get("CACOURSES_PROJECT_DIR", "/home/akash/Main/projects/CACourses"))
DATA_DIR = PROJECTDIR / "data"

# Single sample files used for quick experiments in later cells.
testpath_prefixes = DATA_DIR / "7/45to7-prefixes.json"
testpath_majors   = DATA_DIR / "120/45to120-majors.json"

In [4]:
# Schemas for the two kinds of articulation files. `load_full_schema` comes
# from the local `utils` module; presumably it loads (or builds and caches) the
# schema at `schema_fp` from the files matched by `data_glob` — TODO confirm.
schema_prefix = load_full_schema(
    data_glob="*/*prefixes.json",
    data_dir=DATA_DIR,
    schema_fp=PROJECTDIR / "etl_pipeline/schemas/schema_prefix.pickle",
)

schema_major = load_full_schema(
    data_glob="*/*majors.json",
    data_dir=DATA_DIR,
    schema_fp=PROJECTDIR / "etl_pipeline/schemas/schema_major.pickle",
)

In [5]:
# utility functions
def _coalesce_courses(field: str):
    """Return an expression picking `field` from whichever university source has it.

    The field is read from the "uni_courses" struct column first and, when that
    is null, from the "uni_series_courses" struct column.
    """
    source_columns = ("uni_courses", "uni_series_courses")
    return pl.coalesce([pl.col(name).struct.field(field) for name in source_columns])


def _concat_coalesce_courses(*fields):
    """Return an expression joining several course sub-fields with a space.

    For each source struct column the requested fields are concatenated with a
    single space, and the first non-null concatenation is kept.

    The sources are tried in the same order as in `_coalesce_courses`
    ("uni_courses" first, then "uni_series_courses"). The previous
    implementation tried the series column first, so a row with both structs
    populated could get its course_code from a different struct than its
    course_id, title and units.

    Args:
        *fields: Names of the struct fields to join, in output order.
    """
    return pl.coalesce([
        pl.concat_str([pl.col(source).struct.field(f) for f in fields], separator=" ")
        for source in ("uni_courses", "uni_series_courses")
    ])


def create_glossary(fp: Path, schema: pl.Schema) -> pl.DataFrame:
    """Build a flat course glossary from one articulation JSON file.

    The path is expected to look like ``<uni_id>/<cc_id>to...json``: the parent
    directory name is parsed as the university id and the part of the file name
    before ``"to"`` as the community-college id.

    Args:
        fp: Path of the JSON file to read.
        schema: Polars schema handed to ``pl.read_json``.

    Returns:
        A DataFrame with the columns course_id, course_code, course_title,
        min_units, max_units, begin, end and inst, containing the sending
        (community-college) courses followed by the university courses.
        Rows holding any null value are dropped.
    """
    receiving_inst = int(fp.parts[-2])
    sending_inst = int(fp.parts[-1].split('to')[0])

    frame = pl.read_json(source=fp, schema=schema).lazy()

    # Prefix files carry a list column "articulations"; flatten it so both file
    # kinds expose a single "articulation" struct column.
    if "prefix" in fp.name:
        frame = frame.explode("articulations").rename({"articulations": "articulation"})

    articulation = pl.col("articulation").struct

    # Sending side: the course structs sit two list levels deep (items -> items).
    sending_nodes = (
        articulation.field("sendingArticulation")
        .drop_nulls()
        .struct.field("items")
        .explode()
        .struct.field("items")
        .explode()
    )
    cc_field = pl.col("cc_courses").struct.field
    cc_courses = frame.select(cc_courses=sending_nodes).select(
        course_id=cc_field("courseIdentifierParentId"),
        course_code=cc_field("prefix") + " " + cc_field("courseNumber"),
        course_title=cc_field("courseTitle"),
        min_units=cc_field("minUnits"),
        max_units=cc_field("maxUnits"),
        begin=cc_field("begin"),
        end=cc_field("end"),
        inst=pl.lit(sending_inst),
    )

    # University side: a course is either given directly ("course") or as one
    # element of a series ("series.courses"); explode the series and coalesce.
    uni_sources = frame.select(
        uni_courses=articulation.field("course"),
        uni_series_courses=articulation.field("series").struct.field("courses"),
    )
    uni_courses = uni_sources.explode("uni_series_courses").select(
        course_id=_coalesce_courses("courseIdentifierParentId"),
        course_code=_concat_coalesce_courses("prefix", "courseNumber"),
        course_title=_coalesce_courses("courseTitle"),
        min_units=_coalesce_courses("minUnits"),
        max_units=_coalesce_courses("maxUnits"),
        begin=_coalesce_courses("begin"),
        end=_coalesce_courses("end"),
        inst=pl.lit(receiving_inst),
    )

    glossary = pl.concat([cc_courses, uni_courses]).drop_nulls()
    return glossary.collect()

# Try the glossary builder on the single sample prefixes file.
test_prefix = create_glossary(schema=schema_prefix, fp=testpath_prefixes)

In [6]:
def _get_uni_field(field_name):
    """Return an expression reading `field_name` from the university course.

    The value is taken from the "uni_node" struct column and, when that is
    null, from the "series_node" struct column.
    """
    candidates = [
        pl.col(node).struct.field(field_name)
        for node in ("uni_node", "series_node")
    ]
    return pl.coalesce(*candidates)


def create_glossary_optimized(fp: Path, schema: pl.Schema) -> pl.LazyFrame:
    """Build a lazy, de-duplicated course glossary from one articulation file.

    The path is expected to look like ``<uni_id>/<cc_id>to...json``: the parent
    directory name is parsed as the university id and the part of the file name
    before ``"to"`` as the community-college id.

    Unlike `create_glossary`, the result is not collected and rows that contain
    null fields are kept; only null articulations and null sending-course nodes
    are filtered out.

    Args:
        fp: Path of the JSON file to read.
        schema: Polars schema handed to ``pl.read_json``.

    Returns:
        A LazyFrame with the columns course_id, course_code, course_title,
        min_units, max_units, begin, end and inst (Int16), with duplicate rows
        removed.
    """
    uni = int(fp.parts[-2])
    cc  = int(fp.parts[-1].split('to')[0])

    # 1. Read, then switch to lazy
    # pl.read_json itself is eager; only the transformations below are lazy.
    # Passing an explicit schema avoids type inference on the nested JSON.
    lf = pl.read_json(source=fp, schema=schema).lazy()

    # 2. Normalize and pre-filter
    # Decide on the file name only: testing the whole path would misclassify a
    # majors file stored under any directory whose name contains "prefixes".
    if "prefixes" in fp.name:
        lf = lf.explode("articulations").rename({"articulations": "articulation"})

    # Drop null articulations early to reduce the rows fed to later explodes.
    lf = lf.filter(pl.col("articulation").is_not_null())

    # --- Branch 1: Community College (CC) ---
    # Strategy: drill down to items -> explode -> explode -> flatten columns.
    cc_lf = (
        lf
        .select(
            # The course structs sit two list levels deep (items -> items).
            node = pl.col("articulation")
                .struct.field("sendingArticulation")
                .struct.field("items")
                .explode()
                .struct.field("items")
                .explode()
        )
        .filter(pl.col("node").is_not_null())
        .select(
            course_id    = pl.col("node").struct.field("courseIdentifierParentId"),
            course_code  = pl.col("node").struct.field("prefix") + " " + pl.col("node").struct.field("courseNumber"),
            course_title = pl.col("node").struct.field("courseTitle"),
            min_units    = pl.col("node").struct.field("minUnits"),
            max_units    = pl.col("node").struct.field("maxUnits"),
            begin        = pl.col("node").struct.field("begin"),
            end          = pl.col("node").struct.field("end"),
            inst         = pl.lit(cc, dtype=pl.Int16)
        )
    )

    # --- Branch 2: University (Uni) ---
    # Strategy: select the two possible sources (course vs. series), explode
    # the series list, then coalesce field by field via _get_uni_field.
    uni_lf = (
        lf
        .select(
            uni_node    = pl.col("articulation").struct.field("course"),
            series_node = pl.col("articulation").struct.field("series").struct.field("courses")
        )
        # Explode the series list.
        # Note: a null series_node stays a single null row after explode, so
        # uni_node can still be read for that row.
        .explode("series_node")
        .select(
            course_id    = _get_uni_field("courseIdentifierParentId"),
            course_code  = pl.coalesce(
                                pl.col("uni_node").struct.field("prefix") + " " + pl.col("uni_node").struct.field("courseNumber"),
                                pl.col("series_node").struct.field("prefix") + " " + pl.col("series_node").struct.field("courseNumber")
                           ),
            course_title = _get_uni_field("courseTitle"),
            min_units    = _get_uni_field("minUnits"),
            max_units    = _get_uni_field("maxUnits"),
            begin        = _get_uni_field("begin"),
            end          = _get_uni_field("end"),
            inst         = pl.lit(uni, dtype=pl.Int16)
        )
    )

    # 3. Concat flat frames
    # Both branches already share the same flat column layout, so they can be
    # stacked directly and de-duplicated.
    return pl.concat([cc_lf, uni_lf]).unique()

In [7]:
# Build one glossary per prefixes file, stack them and drop duplicate rows.
prefixes_agg = pl.concat(
    [
        create_glossary(fp=path, schema=schema_prefix)
        for path in DATA_DIR.glob("*/*prefixes.json")
    ]
).unique()

# Same for the majors files.
majors_agg = pl.concat(
    [
        create_glossary(fp=path, schema=schema_major)
        for path in DATA_DIR.glob("*/*majors.json")
    ]
).unique()

In [None]:
# Merge the prefix- and major-derived glossaries and drop exact duplicate rows.
courses = pl.concat((prefixes_agg, majors_agg), rechunk=True).unique()

# Rank of each term prefix within a year, used to order "end" labels such as "F2015".
qmap = {'W': 1, 'S': 2, 'Su': 3, 'F': 4}

# Keep one row per course_id: the one with the latest end term.
(
    courses.with_columns(
        # eterm = year * 10 + term rank, so later end terms compare higher.
        # An empty "end" becomes null and is replaced by the sentinel 99999, so
        # rows without an end term sort first and win the de-duplication.
        # Built as UInt32 because 99999 does not fit in UInt16 (max 65535).
        eterm=(
            pl.col("end").replace("", None).str.slice(-4).cast(pl.UInt32) * 10 +
            pl.col("end").replace("", None).str.head(-4).replace_strict(qmap, return_dtype=pl.UInt32)
        ).fill_null(99999)
    )
    .drop("begin", "end")
    .sort("eterm", descending=True)
    .unique(subset=["course_id"], keep="first")
    .drop("eterm")
)



statistic,course_id,course_code,course_title,min_units,max_units,inst
str,f64,str,str,f64,f64,f64
"""count""",70026.0,"""70026""","""70026""",70026.0,70026.0,70026.0
"""null_count""",0.0,"""0""","""0""",0.0,0.0,0.0
"""mean""",237682.473681,,,2.869335,2.912269,78.310113
"""std""",107453.723008,,,1.160872,1.131531,42.83933
"""min""",43.0,"""A 001""",""" Introductory Biology: Ocean…",0.0,0.0,1.0
"""25%""",164517.0,,,3.0,3.0,43.0
"""50%""",248094.0,,,3.0,3.0,78.0
"""75%""",338097.0,,,3.0,3.0,114.0
"""max""",417819.0,"""ZOOL 5""","""​South Asia History & Culture""",20.0,30.0,200.0


In [45]:
# Inspect courses whose maximum unit count is unusually high (16 or more).
courses.filter(pl.col("max_units").ge(16))

course_id,course_code,course_title,min_units,max_units,begin,end,inst
i64,str,str,f64,f64,str,str,i32
377521,"""SUBC ForLang""","""Foreign Language Proficiency""",0.0,16.0,"""F2020""","""""",21
307442,"""AUTOT 10""","""Automotive Technician Program""",16.0,16.0,"""F2006""","""""",36
114211,"""KNES 172""","""Intercollegiate Cross Country""",2.0,20.0,"""F2018""","""""",129
293695,"""JPN 007S""","""Intensive Intermediate Japanes…",20.0,20.0,"""F2022""","""""",89
25232,"""ATHL 276""","""Women's Intercollegiate Spring…",0.5,30.0,"""F2014""","""""",16


In [30]:
# Collect the ids of courses with an empty "begin", then show every row that
# shares one of those ids.
ids = set(courses.filter(pl.col("begin").eq(""))["course_id"].to_list())
courses.filter(pl.col("course_id").is_in(ids))

course_id,course_code,course_title,min_units,max_units,begin,end,inst
i64,str,str,f64,f64,str,str,i32
328919,"""ART 10B""","""Ceramics""",3.0,3.0,"""""","""F2015""",72
275950,"""GEOG 101""","""Physical Geography""",3.0,3.0,"""""","""""",92
96460,"""ART 41""","""Basic Design""",2.0,2.0,"""""","""S2017""",77
67119,"""GEOG 101L""","""Physical Geography Laboratory""",1.0,1.0,"""""","""""",92


In [28]:
# Rows whose "begin" term is an empty string.
courses.filter(pl.col("begin").eq(""))

course_id,course_code,course_title,min_units,max_units,begin,end,inst
i64,str,str,f64,f64,str,str,i32
328919,"""ART 10B""","""Ceramics""",3.0,3.0,"""""","""F2015""",72
275950,"""GEOG 101""","""Physical Geography""",3.0,3.0,"""""","""""",92
96460,"""ART 41""","""Basic Design""",2.0,2.0,"""""","""S2017""",77
67119,"""GEOG 101L""","""Physical Geography Laboratory""",1.0,1.0,"""""","""""",92
