In [3]:
# hack for importing modules
import os
import sys

# Make the parent directory importable so `utils` resolves from this notebook.
# Guarded so that re-running the cell does not keep adding duplicate entries.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [4]:
import polars as pl
from pathlib import Path
from dotenv import load_dotenv

from utils import load_full_schema

In [5]:
# Project root. Defaults to the original author's checkout; set the
# CACOURSES_PROJECTDIR environment variable to run the notebook elsewhere.
PROJECTDIR = Path(
    os.environ.get("CACOURSES_PROJECTDIR", "/home/akash/Main/projects/CACourses")
)
DATA_DIR = PROJECTDIR / "data"

# Single sample files (one per file kind) used for quick checks.
testpath_prefixes = DATA_DIR / "7/45to7-prefixes.json"
testpath_majors   = DATA_DIR / "120/45to120-majors.json"

In [6]:
# Load one schema per file kind (prefixes first, then majors). Each call gets
# the pickled schema path plus the data directory and the glob matching that
# kind of file. NOTE(review): how `load_full_schema` combines the pickle with
# the globbed files is not visible here — see utils.
schema_prefix, schema_major = (
    load_full_schema(
        schema_fp=PROJECTDIR / schema_file,
        data_dir=DATA_DIR,
        data_glob=file_glob,
    )
    for schema_file, file_glob in (
        ("etl_pipeline/schemas/schema_prefix.pickle", "*/*prefixes.json"),
        ("etl_pipeline/schemas/schema_major.pickle", "*/*majors.json"),
    )
)

In [7]:
# utility functions
def _coalesce_courses(field: str):
    """Return an expression picking `field` from the first non-null source.

    The `uni_courses` struct is consulted first and `uni_series_courses`
    second; the result is null only when the field is null in both.
    """
    sources = ("uni_courses", "uni_series_courses")
    candidates = [pl.col(source).struct.field(field) for source in sources]
    return pl.coalesce(candidates)


def _concat_coalesce_courses(*fields):
    """Return an expression joining several course fields with a single space.

    For each source struct the requested `fields` are concatenated with
    `pl.concat_str` (which yields null if any part is null), and the first
    non-null concatenation wins.

    Args:
        *fields: Names of struct fields to join, in output order.

    Returns:
        A polars expression producing the joined string, or null when neither
        source yields a complete value.
    """
    # Use the same precedence as `_coalesce_courses` (`uni_courses` first, then
    # `uni_series_courses`). The previous series-first order meant a row with
    # both structs populated would take its code from one course and every
    # other column from the other.
    return pl.coalesce([
        pl.concat_str(
            [pl.col(source).struct.field(f) for f in fields],
            separator=" ",
        )
        for source in ("uni_courses", "uni_series_courses")
    ])


def create_glossary(fp: Path, schema: pl.Schema) -> pl.DataFrame:
    """Extract a flat course glossary from one articulation JSON file.

    Institution ids are parsed from the path: the parent directory name is the
    university id and the file-name text before ``"to"`` is the
    community-college id (e.g. ``7/45to7-prefixes.json`` -> university 7,
    college 45).

    Args:
        fp: Path to the JSON file. Files whose name contains ``"prefix"`` are
            expected to carry a list column ``articulations``; all others are
            read as already having an ``articulation`` column.
        schema: Polars schema passed to ``pl.read_json`` for this file.

    Returns:
        An eager DataFrame with columns ``course_id``, ``inst_id``,
        ``course_code``, ``course_name``, ``min_units``, ``max_units``,
        ``begin`` and ``end``. College-side rows come first, followed by
        university-side rows; any row containing a null is dropped.

    Raises:
        IndexError: If ``fp`` has no parent directory component.
        ValueError: If the ids in the path are not integers.
    """
    path_parts = fp.parts
    uni_id = int(path_parts[-2])
    cc_id = int(path_parts[-1].split('to')[0])

    articulations = pl.read_json(source=fp, schema=schema).lazy()

    if "prefix" in fp.name:
        # One row per articulation, under the column name the rest of the
        # function reads.
        articulations = (
            articulations
            .explode("articulations")
            .rename({"articulations": "articulation"})
        )

    articulation = pl.col("articulation")

    # --- community-college (sending) side -----------------------------------
    # sendingArticulation.items is a list of groups, each with its own `items`
    # list of courses; two explodes flatten that to one course per row.
    sending_courses = (
        articulation
        .struct.field("sendingArticulation")
        .drop_nulls()
        .struct.field("items")
        .explode()
        .struct.field("items")
        .explode()
    )

    def cc_field(name: str):
        return pl.col("cc_courses").struct.field(name)

    cc_columns = {
        "course_id": cc_field("courseIdentifierParentId"),
        "inst_id": pl.lit(cc_id),
        "course_code": cc_field("prefix") + " " + cc_field("courseNumber"),
        "course_name": cc_field("courseTitle"),
        "min_units": cc_field("minUnits"),
        "max_units": cc_field("maxUnits"),
        "begin": cc_field("begin"),
        "end": cc_field("end"),
    }
    cc_side = (
        articulations
        .select(cc_courses=sending_courses)
        .select(**cc_columns)
    )

    # --- university (receiving) side ----------------------------------------
    # A receiving entry is read from either `course` or one member of
    # `series.courses`; the helpers coalesce the two sources per column.
    uni_columns = {
        "course_id": _coalesce_courses("courseIdentifierParentId"),
        "inst_id": pl.lit(uni_id),
        "course_code": _concat_coalesce_courses("prefix", "courseNumber"),
        "course_name": _coalesce_courses("courseTitle"),
        "min_units": _coalesce_courses("minUnits"),
        "max_units": _coalesce_courses("maxUnits"),
        "begin": _coalesce_courses("begin"),
        "end": _coalesce_courses("end"),
    }
    uni_side = (
        articulations
        .select(
            uni_courses=articulation.struct.field("course"),
            uni_series_courses=(
                articulation.struct.field("series").struct.field("courses")
            ),
        )
        .explode("uni_series_courses")
        .select(**uni_columns)
    )

    combined = pl.concat([cc_side, uni_side])
    return combined.drop_nulls().collect()

# Quick check of `create_glossary` on the single sample prefixes file.
test_prefix = create_glossary(fp=testpath_prefixes, schema=schema_prefix)

In [8]:
# Build the glossary for every file of each kind, stack the per-file frames
# and drop exact duplicate rows.
prefixes_agg = pl.concat(
    create_glossary(fp=json_path, schema=schema_prefix)
    for json_path in DATA_DIR.glob("*/*prefixes.json")
).unique()

majors_agg = pl.concat(
    create_glossary(fp=json_path, schema=schema_major)
    for json_path in DATA_DIR.glob("*/*majors.json")
).unique()

In [9]:
# Rank of each term code within a year. `end` is "<code><4-digit year>";
# presumably W/S/Su/F = Winter/Spring/Summer/Fall — confirm against the data.
qmap = {'W': 1, 'S': 2, 'Su': 3, 'F': 4}

# Sort key for rows whose `end` is empty, so they rank above every real term.
# It exceeds the UInt16 maximum (65535), hence `eterm` is computed as UInt32
# rather than relying on an implicit upcast inside `fill_null`.
_NO_END_TERM = 99999

# `end` with empty strings turned into nulls.
_end = pl.col("end").replace("", None)

courses = (
    pl.concat((prefixes_agg, majors_agg), rechunk=True)
    .unique()
    .with_columns(
        # eterm = year * 10 + term rank, e.g. year 2023 with rank 4 -> 20234.
        eterm=(
            _end.str.slice(-4).cast(pl.UInt32) * 10
            + _end.str.head(-4).replace_strict(qmap, return_dtype=pl.UInt32)
        )
        .fill_null(_NO_END_TERM)
    )
    # Latest end term first, so keep="first" retains the most recent version
    # of each course_id.
    .sort("eterm", descending=True)
    .drop("begin", "end", "eterm")
    .unique(subset=["course_id"], keep="first")
)
courses

course_id,inst_id,course_code,course_name,min_units,max_units
i64,i32,str,str,f64,f64
210518,137,"""SOCIOL 34""","""Racial and Ethnic Relations in…",3.0,3.0
368618,95,"""THA V29B""","""History of Motion Pictures II""",3.0,3.0
374909,39,"""KIN 21A""","""Beginning Tennis""",1.0,1.0
347506,57,"""MUSC 51A""","""Digital Audio 1: Fundamentals""",3.0,3.0
256800,14,"""ENVR 259""","""Environmental Biology""",4.0,4.0
…,…,…,…,…,…
357715,144,"""ENG 052""","""Politics and Prose of the Nobe…",4.0,4.0
164751,49,"""ENGR 016""","""Engineering Circuits""",3.0,3.0
69320,69,"""DANCE 10A""","""Jazz Dance IA""",1.0,1.0
122557,92,"""GEOG 172""","""Geographic Information Systems…",2.0,2.0


In [10]:
from utils.to_postgres import write_glossary_to_psql

load_dotenv(dotenv_path=PROJECTDIR/"etl_pipeline/.env")

# Fail fast when a setting is absent; otherwise the URL below would silently
# contain the literal text "None" and only fail later at connection time.
_required_settings = (
    "POSTGRES_USER",
    "POSTGRES_PWD",
    "POSTGRES_HOSTNAME",
    "POSTGRES_PORT",
    "POSTGRES_DBNAME",
)
_missing_settings = [name for name in _required_settings if name not in os.environ]
if _missing_settings:
    raise RuntimeError(
        "Missing PostgreSQL settings (check etl_pipeline/.env): "
        + ", ".join(_missing_settings)
    )

psql_user =   os.getenv("POSTGRES_USER")
psql_pwd =    os.getenv("POSTGRES_PWD")
psql_host =   os.getenv("POSTGRES_HOSTNAME")
psql_port =   os.getenv("POSTGRES_PORT")
psql_dbname = os.getenv("POSTGRES_DBNAME")
# NOTE: values are interpolated verbatim, so the user name and password must
# already be URL-safe (percent-encoded if they contain characters like @ : /).
psql_url = f"postgresql://{psql_user}:{psql_pwd}@{psql_host}:{psql_port}/{psql_dbname}"

# Show the column dtypes being written.
print(courses.schema)

write_glossary_to_psql(
    glossary=courses,
    db_url=psql_url
)

Schema({'course_id': Int64, 'inst_id': Int32, 'course_code': String, 'course_name': String, 'min_units': Float64, 'max_units': Float64})
shape: (70_026, 6)
┌───────────┬─────────┬─────────────┬─────────────────────────────────┬───────────┬───────────┐
│ course_id ┆ inst_id ┆ course_code ┆ course_name                     ┆ min_units ┆ max_units │
│ ---       ┆ ---     ┆ ---         ┆ ---                             ┆ ---       ┆ ---       │
│ i64       ┆ i32     ┆ str         ┆ str                             ┆ f64       ┆ f64       │
╞═══════════╪═════════╪═════════════╪═════════════════════════════════╪═══════════╪═══════════╡
│ 210518    ┆ 137     ┆ SOCIOL 34   ┆ Racial and Ethnic Relations in… ┆ 3.0       ┆ 3.0       │
│ 368618    ┆ 95      ┆ THA V29B    ┆ History of Motion Pictures II   ┆ 3.0       ┆ 3.0       │
│ 374909    ┆ 39      ┆ KIN 21A     ┆ Beginning Tennis                ┆ 1.0       ┆ 1.0       │
│ 347506    ┆ 57      ┆ MUSC 51A    ┆ Digital Audio 1: Fundamentals   ┆ 3.0 