In [1]:
""" Imports """
import json
import pandas as pd
import polars as pl

# Defining a Database Schema

## What data do we need

Starting from the top, what information should we display to users?

example return field in csv format (use tsv since a few course/dept names have commas)

Example: user selects inst=UCSC, course=MATH19A
`San Diego Miramar College, MATH 150, Calculus with Analytic Geometry I, 4 units`

Example: user selects inst=UCSD, course=CHEM6B

`San Diego Miramar College, CHEM 200, General Chemistry I - Lecture, 3 units`

`and`

`San Diego Miramar College, CHEM 201, General Chemistry II - Lecture, 3 units`


Required data:
- sending institution (e.g. San Diego Miramar College)
- course prefix (e.g. CHEM)
- course number (e.g. 201)
- course name (e.g. General Chemistry II - Lecture)
- unit count (e.g 3 units)
- course ID (some string of numbers, makes mappings easy to deal with)


## The schema

Table 1: course glossary

fields:
- id (int, primary key)  // course id
- inst (string)  // community college or univ
- prefix (string)  // course prefix (e.g. CHEM)
- course number (int) 
- course name (string)
- min units (int)  // if min units and max units are the same, display 1
- max units (int)

Table 2: articulations
fields:
- id: (int, primary key)
- inst (uni) (string)
- agreements (json string)

## The JSON string

tbd, working on that rn

## The query flow

- user enters site
- user picks university -> get list of all courses at university for dropdown
    - sends request to backend
    - backend uses user input to send query to db
        -  `FROM glossary SELECT * WHERE inst IS {whatever the user picked}`
    - backend converts results into json w/ id: {prefix, num, name, min units, max units}  // use for formatting dropdown
    - backend returns json
- user picks course from list -> get list of all articulated courses from id
    - sends request to backend
    - backend uses user input to send query to db
        - `FROM articulations SELECT agreements WHERE id IS {id of course user picked}`
        - returns 1 json string with all articulations
    - backend aggregates all json string'd course IDs into list, queries course data
        - `FROM glossary SELECT * WHERE id IS {ids in list}`
    - backend converts results into json w/ id: {inst, prefix, num, name, min units, max units}  // actual displayed data
    - backend returns both jsons (articulation string w/ IDs and the cc glossary)
- both jsons formatted into cells on frontend
- user gets results

In [2]:
""" Define functions for generating an in-memory glossary of every course in an AllPrefixes agreement page """

def get_query(cc_id: int, uni_id: int) -> dict:
    with open(f"./data/{uni_id}/{cc_id}to{uni_id}.json", "r") as fp:
        out = json.load(fp)
    return out
    

def update_courses(courselist: list[dict], glossary: dict, inst: int) -> None:
    for course in courselist:
        if not all((course["prefix"],
                    course["courseNumber"],
                    course["courseTitle"],
                    course["minUnits"],
                    course["maxUnits"])):
            continue
            
        course_id: str = f"{inst}{course["prefix"]}{course["courseNumber"]}"
        
        if course_id not in glossary:            
            glossary[course_id] = {
                "course_id": course_id,
                "inst_id": inst,
                "course_code": f"{course["prefix"]} {course["courseNumber"]}",
                "course_name": course["courseTitle"],
                "min_units": int(course["minUnits"]),
                "max_units": int(course["maxUnits"])
            }

def create_course_glossary(cc: int, uni: int) -> pl.DataFrame:
    # create output glossary
    course_glossary = dict()
    
    with open(f"./data/{uni}/{cc}to{uni}.json", "r") as fp:
        articulations = json.load(fp)
    
    # populate query
    for dept in articulations:
        articulationList = dept["articulations"] if "articulations" in dept else [dept["articulation"]]
        
        for articulation in articulationList:
            
            # handle university courses
            uni_courses: list[dict] = [articulation["course"]] if "course" in articulation else articulation["series"]["courses"]
            update_courses(courselist=uni_courses, glossary=course_glossary, inst=uni)
                
            # handle cc courses
            agreements: dict | None = articulation["sendingArticulation"]
            if isinstance(agreements, dict):
                for agreement in agreements["items"]:
                    update_courses(courselist=agreement["items"], glossary=course_glossary, inst=cc)
        
    return pl.DataFrame(pd.DataFrame(course_glossary.values(), index=course_glossary.keys()))
    # except TypeError as err:
    #     print(f"TYPEERROR (cc={cc}, uni={uni}):", err)

In [110]:

articulations = pl.DataFrame(get_query(45, 7)).explode("articulations")

articulations = pl.concat((
    articulations.get_column("articulations").struct.field("course").struct.field("courseIdentifierParentId").rename("course_id").to_frame(),
    articulations.get_column("articulations").struct.field("series").to_frame(),
    articulations.get_column("articulations").struct.field("sendingArticulation").to_frame(),
), how="horizontal")

test = (
    articulations
    .with_columns(pl.col("series").struct.field("courses"))
    .explode("courses")
    .with_columns(
        course_id=pl.coalesce(
            "course_id",
            pl.col("courses").struct.field("courseIdentifierParentId")
        ),
        items=(
            pl.when(pl.col("sendingArticulation").struct.field("items").list.len() > 0)
            .then(pl.col("sendingArticulation").struct.field("items"))
            .otherwise(None)
        ),
        
    )
    .with_columns(
        pl.col("items").list.eval(
            pl.struct([
                pl.element().struct.field("courseConjunction"),
                pl.element().struct.field("items").list.eval(
                    pl.element().struct.field("courseIdentifierParentId")
                )
            ])#.struct.json_encode()  # uncomment to convert struct to json string
        )
    )
    # .with_columns(
    #     groupConj=(
    #         pl.when(pl.col("items").is_not_null() & pl.col("items").list.len() > 1)
    #         .then(pl.col("sendingArticulation").struct.field("courseGroupConjunctions").struct.field("groupConjunction"))
    #         .otherwise(None)
    #     )
    # )
    .drop(["series", "courses"])
)


(
    test
    .filter(pl.col("items").list.len() > 1)
    # .with_columns(
    #     groupConjLen=pl.col("groupConj").list.len()
    # )
    # .get_column("groupConj").list.get(0).struct.fields
    # .filter(pl.col("course_id") == 327985)
)


# test

course_id,sendingArticulation,items
i64,struct[6],list[struct[2]]
357468,"{null,[],[{""And"",[{[],[],[],203808,""Biology of Human Nutrition"",""135"",""BIOL"",4864,""Biology"",1706,""Biology"",""F1995"","""",3.0,3.0,[],null,0,""Course""}],[],0,""CourseGroup""}, {""And"",[{[],[],[],208250,""Nutrition"",""150"",""NUTR"",7953,""Nutrition"",9779,""Nutrition"",""F1995"",""F2025"",3.0,3.0,[],null,0,""Course""}],[],1,""CourseGroup""}],[{""b9aefc50-70a6-4814-acaf-08dca807cdc2"",""5aa4625a-ac22-47b9-cf6e-08dca807cdc1"",""Or"",0,1}],""SendingArticulation"",[]}","[{""And"",[203808]}, {""And"",[208250]}]"
259948,"{null,[],[{""And"",[{[],[],[],339988,""Fundamentals of Chemistry"",""100"",""CHEM"",3785,""Chemistry"",2301,""Chemistry"",""F1995"","""",3.0,3.0,[],null,0,""Course""}],[],0,""CourseGroup""}, {""And"",[{[],[],[],267304,""Chemistry in Society"",""111"",""CHEM"",3785,""Chemistry"",2301,""Chemistry"",""F2013"","""",3.0,3.0,[],null,0,""Course""}],[],1,""CourseGroup""}],[{""7159b010-8735-4948-44c7-08dca807cdfc"",""89e81001-74d2-4fe0-f6f7-08dca807cdae"",""Or"",0,1}],""SendingArticulation"",[]}","[{""And"",[339988]}, {""And"",[267304]}]"
223760,"{null,[],[{""And"",[{[],[],[],338911,""Elementary Statistics"",""119"",""MATH"",9952,""Mathematics"",8982,""Mathematics"",""F1995"","""",3.0,3.0,[],null,0,""Course""}],[],0,""CourseGroup""}, {""And"",[{[],[],[],345437,""Behavioral Science Statistics"",""258"",""PSYC"",4633,""Psychology"",11234,""Psychology"",""F2004"","""",3.0,3.0,[],null,0,""Course""}],[],1,""CourseGroup""}],[{""180e448f-a7c8-417a-acad-08dca807cdc2"",""ce35ebae-7d90-4199-cf5b-08dca807cdc1"",""Or"",0,1}],""SendingArticulation"",[]}","[{""And"",[338911]}, {""And"",[345437]}]"
289406,"{null,[],[{""And"",[{[],[],[],337201,""African American History to Reconstruction"",""140A"",""BLAS"",1590,""Black Studies"",1813,""Black Studies"",""F2024"","""",3.0,3.0,[],null,0,""Course""}],[],0,""CourseGroup""}, {""And"",[{[],[],[],156119,""African American History since Reconstruction to the Present"",""140B"",""BLAS"",1590,""Black Studies"",1813,""Black Studies"",""F2024"","""",3.0,3.0,[],null,0,""Course""}],[],1,""CourseGroup""}],[{""6744d5a1-0bb8-4013-44c9-08dca807cdfc"",""06d095c4-ce51-4d1a-f72a-08dca807cdae"",""Or"",0,1}],""SendingArticulation"",[]}","[{""And"",[337201]}, {""And"",[156119]}]"
327985,"{null,[],[{""And"",[{[],[],[],281065,""Third Course in Spanish"",""201"",""SPAN"",3135,""Spanish"",12241,""Spanish"",""F1995"","""",5.0,5.0,[],null,0,""Course""}],[],0,""CourseGroup""}, {""And"",[{[],[],[],206938,""Conversation and Composition Spanish 1"",""210"",""SPAN"",3135,""Spanish"",12241,""Spanish"",""S2001"","""",3.0,3.0,[],null,0,""Course""}, {[],[],[],200029,""Conversation and Composition Spanish II"",""211"",""SPAN"",3135,""Spanish"",12241,""Spanish"",""Su2001"","""",3.0,3.0,[],null,1,""Course""}],[],1,""CourseGroup""}],[{""a1a9cd10-fe9a-455b-ab1f-08dca807cdc2"",""2b0c7571-7bd3-4a0f-bb29-08dca807cdc1"",""Or"",0,1}],""SendingArticulation"",[]}","[{""And"",[281065]}, {""And"",[206938, 200029]}]"
…,…,…
295083,"{null,[],[{""And"",[{[],[],[],297077,""General Physics"",""125"",""PHYS"",225,""Physics"",10771,""Physics"",""F2007"","""",5.0,5.0,[],227,0,""Course""}],[],0,""CourseGroup""}, {""And"",[{[],[],[],304422,""General Physics II"",""126"",""PHYS"",225,""Physics"",10771,""Physics"",""F2007"","""",5.0,5.0,[],227,0,""Course""}],[],1,""CourseGroup""}],[{""c6d8522b-6438-4cb4-acae-08dca807cdc2"",""ac88af3c-2a62-4f78-cf5f-08dca807cdc1"",""Or"",0,1}],""SendingArticulation"",[]}","[{""And"",[297077]}, {""And"",[304422]}]"
259933,"{null,[],[{""And"",[{[],[],[],345437,""Behavioral Science Statistics"",""258"",""PSYC"",4633,""Psychology"",11234,""Psychology"",""F2004"","""",3.0,3.0,[],null,0,""Course""}],[],1,""CourseGroup""}, {""And"",[{[],[],[],338911,""Elementary Statistics"",""119"",""MATH"",9952,""Mathematics"",8982,""Mathematics"",""F1995"","""",3.0,3.0,[],null,0,""Course""}],[],0,""CourseGroup""}],[{""bab493ec-77cd-4318-acac-08dca807cdc2"",""d7021df1-a29c-445f-cf51-08dca807cdc1"",""Or"",0,1}],""SendingArticulation"",[]}","[{""And"",[345437]}, {""And"",[338911]}]"
273644,"{null,[],[{""And"",[{[],[],[],340102,""Adolescent Psychology"",""123"",""PSYC"",4633,""Psychology"",11234,""Psychology"",""F1995"","""",3.0,3.0,[],null,0,""Course""}],[],0,""CourseGroup""}, {""And"",[{[],[],[],344558,""Psychology of Lifespan Development"",""230"",""PSYC"",4633,""Psychology"",11234,""Psychology"",""F2002"","""",3.0,3.0,[],null,0,""Course""}],[],1,""CourseGroup""}],[{""c002b6d2-6d46-4e19-44c8-08dca807cdfc"",""05a4e87b-4081-4496-f704-08dca807cdae"",""Or"",0,1}],""SendingArticulation"",[]}","[{""And"",[340102]}, {""And"",[344558]}]"
357745,"{null,[],[{""And"",[{[],[],[],263771,""Introduction to Psychological Research"",""255"",""PSYC"",4633,""Psychology"",11234,""Psychology"",""F2015"","""",3.0,3.0,[],null,0,""Course""}],[],0,""CourseGroup""}, {""And"",[{[],[],[],233806,""Introduction to Research Methods in Sociology"",""220"",""SOCO"",9018,""Sociology"",12171,""Sociology"",""F2014"","""",3.0,3.0,[],null,0,""Course""}],[],1,""CourseGroup""}],[{""a78afe7b-0e0d-4ad5-acb1-08dca807cdc2"",""550b9559-f53f-4ca9-cf82-08dca807cdc1"",""Or"",0,1}],""SendingArticulation"",[]}","[{""And"",[263771]}, {""And"",[233806]}]"


In [23]:
get_query = lambda cc_id, uni_id: pl.read_json(f"./data/{uni_id}/{cc_id}to{uni_id}.json")

query45to7 = get_query(cc_id=45, uni_id=7)
query51to1 = get_query(cc_id=51, uni_id=1)

# query45to7.explode("articulations")
query51to1.get_column("articulation").struct.fields

['type',
 'series',
 'visibleCrossListedCourses',
 'seriesAttributes',
 'courseAttributes',
 'sendingArticulation',
 'templateOverrides',
 'attributes',
 'receivingAttributes',
 'course']

In [5]:
create_course_glossary(cc=45, uni=7)

course_id,inst_id,course_code,course_name,min_units,max_units
str,i64,str,str,i64,i64
"""7AAS10""",7,"""AAS 10""","""Introduction to African Americ…",4,4
"""7AAS11""",7,"""AAS 11""","""Introduction to Black Diaspori…",4,4
"""7AAS14""",7,"""AAS 14""","""Introduction to African Studie…",4,4
"""7AAS15""",7,"""AAS 15""","""Racism and Global Imperialism""",4,4
"""7ANTH1""",7,"""ANTH 1""","""Introduction to Culture""",4,4
…,…,…,…,…,…
"""7VIS84""",7,"""VIS 84""","""Film History""",4,4
"""7VIS84B""",7,"""VIS 84B""","""Film Aesthetics""",4,4
"""7VIS85A""",7,"""VIS 85A""","""Media History""",4,4
"""7VIS85B""",7,"""VIS 85B""","""Media Aesthetics""",4,4


In [6]:
"""
Create a test run of a full course glossary
"""

def create_full_glossary():
    with open("./data/institutions_cc.json", "r") as cc_fp, open("./data/institutions_state.json", "r") as uni_fp:
        cc_ids = json.load(cc_fp).keys()
        uni_ids = json.load(uni_fp).keys()
    
    glossary = []
    cc_count = 0
    for uni_id in uni_ids:
        batch = []
        for cc_id in cc_ids:
            try:
                batch.append(create_course_glossary(cc=cc_id,uni=uni_id))
                cc_count += 1
            except FileNotFoundError:  # no query exists, no query was scraped
                continue
        glossary.extend(batch)
        print("created", cc_count, "dfs for uni ID:", uni_id)
        cc_count = 0
    print("glossary list constructed")
    return pl.concat(glossary)



"""
Notes: getting errors for the following cc:uni combos
KeyError 'articulations': 51:1, 55:26, 124:26
TypeError: str + NoneType unsupported: 97:21, 51:50

Checking files...
51:1 -> AllMajors file, diff schema, no by dept split

AllPrefixes Schema: 
results
-> articulations
    -> list of depts
        -> name, list of articulations for dept
            -> course objects
            
AllMajors Schema:
results
-> articulations
    -> list of articulations
        -> templateCellId, articulation, 
"""

glossary = create_full_glossary()

created 115 dfs for uni ID: 1
created 115 dfs for uni ID: 7
created 115 dfs for uni ID: 11
created 115 dfs for uni ID: 12
created 115 dfs for uni ID: 21
created 0 dfs for uni ID: 23
created 115 dfs for uni ID: 24
created 11 dfs for uni ID: 26
created 90 dfs for uni ID: 29
created 115 dfs for uni ID: 39
created 115 dfs for uni ID: 42
created 115 dfs for uni ID: 46
created 115 dfs for uni ID: 50
created 115 dfs for uni ID: 60
created 115 dfs for uni ID: 75
created 115 dfs for uni ID: 76
created 115 dfs for uni ID: 79
created 115 dfs for uni ID: 81
created 35 dfs for uni ID: 85
created 0 dfs for uni ID: 88
created 0 dfs for uni ID: 89
created 0 dfs for uni ID: 98
created 0 dfs for uni ID: 115
created 0 dfs for uni ID: 116
created 0 dfs for uni ID: 117
created 0 dfs for uni ID: 120
created 0 dfs for uni ID: 128
created 0 dfs for uni ID: 129
created 0 dfs for uni ID: 132
created 0 dfs for uni ID: 141
created 0 dfs for uni ID: 143
created 0 dfs for uni ID: 144
glossary list constructed


In [9]:
uni_ids

dict_keys(['1', '7', '11', '12', '21', '23', '24', '26', '29', '39', '42', '46', '50', '60', '75', '76', '79', '81', '85', '88', '89', '98', '115', '116', '117', '120', '128', '129', '132', '141', '143', '144'])

In [7]:
glossary.filter(pl.col("course_id") == "1BUS100")

course_id,inst_id,course_code,course_name,min_units,max_units
str,str,str,str,i64,i64
"""1BUS100""","""1""","""BUS 100""","""Accounting Principles I: Finan…",3,3
"""1BUS100""","""1""","""BUS 100""","""Accounting Principles I: Finan…",3,3
"""1BUS100""","""1""","""BUS 100""","""Accounting Principles I: Finan…",3,3
"""1BUS100""","""1""","""BUS 100""","""Accounting Principles I: Finan…",3,3
"""1BUS100""","""1""","""BUS 100""","""Accounting Principles I: Finan…",3,3
…,…,…,…,…,…
"""1BUS100""","""1""","""BUS 100""","""Accounting Principles I: Finan…",3,3
"""1BUS100""","""1""","""BUS 100""","""Accounting Principles I: Finan…",3,3
"""1BUS100""","""1""","""BUS 100""","""Accounting Principles I: Finan…",3,3
"""1BUS100""","""1""","""BUS 100""","""Accounting Principles I: Finan…",3,3


In [8]:
"""
Dump glossary with SQLite3
"""

import sqlite3 as sql3

with sql3.connect("./test.db") as conn:
    cursor = conn.cursor()
    
    # create table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS course_glossary (
            course_id INTEGER PRIMARY KEY,
            inst_id INTEGER,
            course_code TEXT,
            course_name TEXT,
            min_units REAL,
            max_units REAL
        )
    """)
    
    with open("./data/institutions_cc.json", "r") as cc_fp, open("./data/institutions_state.json", "r") as uni_fp:
        cc_ids = json.load(cc_fp).keys()
        uni_ids = json.load(uni_fp).keys()
           
    glossary = []
    for uni_id in uni_ids:
        batch = []
        for cc_id in cc_ids:
            try:
                batch.append(create_course_glossary(
                    articulations=get_query(cc_id=cc_id, uni_id=uni_id),
                    cc=cc_id,
                    uni=uni_id
                ))
            except FileNotFoundError:  # no query exists, no query was scraped
                continue
        glossary.extend(batch)
        

    glossary_df = pd.concat(glossary, ignore_index=True)
    glossary_df.to_sql(
        name="course_glossary", 
        con=conn, 
        if_exists="append", 
        index=False
    )

TypeError: create_course_glossary() got an unexpected keyword argument 'articulations'