In [2]:
""" Imports """
import json
import pandas as pd
import polars as pl

# Defining a Database Schema

## What data do we need

Starting from the top, what information should we display to users?

example return field in csv format (use tsv since a few course/dept names have commas)

Example: user selects inst=UCSC, course=MATH19A
`San Diego Miramar College, MATH 150, Calculus with Analytic Geometry I, 4 units`

Example: user selects inst=UCSD, course=CHEM6B

`San Diego Miramar College, CHEM 200, General Chemistry I - Lecture, 3 units`

`and`

`San Diego Miramar College, CHEM 201, General Chemistry II - Lecture, 3 units`


Required data:
- sending institution (e.g. San Diego Miramar College)
- course prefix (e.g. CHEM)
- course number (e.g. 201)
- course name (e.g. General Chemistry II - Lecture)
- unit count (e.g 3 units)
- course ID (some string of numbers, makes mappings easy to deal with)


## The schema

Table 1: course glossary

fields:
- id (int, primary key)  // course id
- inst (string)  // community college or univ
- prefix (string)  // course prefix (e.g. CHEM)
- course number (int) 
- course name (string)
- min units (int)  // if min units and max units are the same, display 1
- max units (int)

Table 2: articulations
fields:
- id: (int, primary key)
- inst (uni) (string)
- agreements (json string)

## The JSON string

tbd, working on that rn

## The query flow

- user enters site
- user picks university -> get list of all courses at university for dropdown
    - sends request to backend
    - backend uses user input to send query to db
        -  `FROM glossary SELECT * WHERE inst IS {whatever the user picked}`
    - backend converts results into json w/ id: {prefix, num, name, min units, max units}  // use for formatting dropdown
    - backend returns json
- user picks course from list -> get list of all articulated courses from id
    - sends request to backend
    - backend uses user input to send query to db
        - `FROM articulations SELECT agreements WHERE id IS {id of course user picked}`
        - returns 1 json string with all articulations
    - backend aggregates all json string'd course IDs into list, queries course data
        - `FROM glossary SELECT * WHERE id IS {ids in list}`
    - backend converts results into json w/ id: {inst, prefix, num, name, min units, max units}  // actual displayed data
    - backend returns both jsons (articulation string w/ IDs and the cc glossary)
- both jsons formatted into cells on frontend
- user gets results

In [9]:
""" Define functions for generating an in-memory glossary of every course in an AllPrefixes agreement page """

def get_query(cc_id: int, uni_id: int) -> dict:
    with open(f"./data/{uni_id}/{cc_id}to{uni_id}.json", "r") as fp:
        out = json.load(fp)
    return out
    

def update_courses(courselist: list[dict], glossary: dict, inst: str) -> None:
    for course in courselist:
        course_id: int = course["courseIdentifierParentId"]
        
        if course_id not in glossary:
            
            if not all((course["prefix"],
                        course["courseNumber"],
                        course["courseTitle"],
                        course["minUnits"],
                        course["maxUnits"])):
                continue
            
            glossary[course_id] = {
                "course_id": int(course_id),
                "inst_id": int(inst),
                "course_code": course["prefix"] + " " + course["courseNumber"],
                "course_name": course["courseTitle"],
                "min_units": int(course["minUnits"]),
                "max_units": int(course["maxUnits"])
            }

def create_course_glossary(cc: int, uni: int) -> pl.DataFrame:
    # create output glossary
    course_glossary = dict()
    articulations = get_query(cc_id=cc, uni_id=uni)
    
    # populate query
    for dept in articulations:
        articulationList = dept["articulations"] if "articulations" in dept else [dept["articulation"]]
            # for articulation in dept["articulations"]:
        for articulation in articulationList:
            
            # handle university courses
            uni_courses: list[dict] = [articulation["course"]] if "course" in articulation else articulation["series"]["courses"]
            update_courses(courselist=uni_courses, glossary=course_glossary, inst=uni)
                
            # handle cc courses
            agreements: dict | None = articulation["sendingArticulation"]
            if isinstance(agreements, dict):
                for agreement in agreements["items"]:
                    update_courses(courselist=agreement["items"], glossary=course_glossary, inst=cc)
        
        print("created df for", cc, "->", uni)
        return pl.DataFrame(pd.DataFrame(course_glossary.values(), index=course_glossary.keys()))
    # except TypeError as err:
    #     print(f"TYPEERROR (cc={cc}, uni={uni}):", err)

In [None]:
articulations = pl.DataFrame(get_query(45, 7)).explode("articulations")

articulations = pl.concat((
    articulations.get_column("articulations").struct.field("course").struct.field("courseIdentifierParentId").rename("course_id").to_frame(),
    articulations.get_column("articulations").struct.field("series").to_frame(),
    articulations.get_column("articulations").struct.field("sendingArticulation").to_frame(),
), how="horizontal")

# test = articulations.with_columns(pl.col("series").struct.field("courses")).explode("courses")
# test = test.with_columns(
#     course_id2=pl.col("courses").struct.field("courseIdentifierParentId"),
#     conj=pl.col("series").struct.field("conjunction"),
# )
# test = test.with_columns(pl.coalesce("course_id", "course_id2").alias("course_id"))
# test.drop_in_place("series")
# test.drop_in_place("courses")
# test.drop_in_place("course_id2")

test = (
    articulations
    .with_columns(pl.col("series").struct.field("courses"))
    .explode("courses")
    .with_columns(
        course_id=pl.coalesce(
            "course_id",
            pl.col("courses").struct.field("courseIdentifierParentId")
        ),
        items=pl.col("sendingArticulation").struct.field("items")
    )
    .drop(["series", "courses"])
)

# test.with_columns(pl.col("sendingArticulation").struct.json_encode())
test.filter(pl.col("items").is_not_null())
test.filter(pl.col("course_id").eq(289392))

result = (
    test.filter(pl.col("course_id").eq(289392))
    .select("items").to_series().to_list()
)
result[0]

test.get_column("items").list.get(0, null_on_oob=True).struct.field("items")
# ['courseConjunction', 'items', 'attributes', 'position', 'type']

test.with_columns(
    items2=pl.col("items").list.get(0, null_on_oob=True).struct.field("items")
)

(
    test.get_column("items")
    .list.get(0, null_on_oob=True)
    .struct.field("items")
    .list.get(0, null_on_oob=True)
    .struct.field("courseIdentifierParentId")
)

courseIdentifierParentId
i64
""
""
""
""
200913
…
""
""
""
""


In [None]:
def create_course_glossary_pl(cc: int, uni: int) -> pl.DataFrame:
    articulations = get_query(cc_id=cc, uni_id=uni)
    glossary = pl.DataFrame({
        "course_id": pl.UInt16,
        "inst_id": pl.UInt16,
        "course_code": pl.Utf8,
        "course_name": pl.Utf8,
        "min_units": pl.UInt8,
        "max_units": pl.Uint8
    })
    
    
    

In [10]:
create_course_glossary(cc=45, uni=7)

created df for 45 -> 7


course_id,inst_id,course_code,course_name,min_units,max_units
i64,i64,str,str,i64,i64
384566,7,"""AAS 10""","""Introduction to African Americ…",4,4
384567,7,"""AAS 11""","""Introduction to Black Diaspori…",4,4
384569,7,"""AAS 14""","""Introduction to African Studie…",4,4
384570,7,"""AAS 15""","""Racism and Global Imperialism""",4,4


In [11]:
"""
Create a test run of a full course glossary
"""

def create_full_glossary():
    with open("./data/institutions_cc.json", "r") as cc_fp, open("./data/institutions_state.json", "r") as uni_fp:
        cc_ids = json.load(cc_fp).keys()
        uni_ids = json.load(uni_fp).keys()
    
    glossary = []
    for uni_id in uni_ids:
        batch = []
        for cc_id in cc_ids:
            try:
                batch.append(create_course_glossary(cc=cc_id,uni=uni_id))
            except FileNotFoundError:  # no query exists, no query was scraped
                continue
        glossary.extend(batch)
    print("glossary list constructed")
    return pl.concat(glossary)


"""
Notes: getting errors for the following cc:uni combos
KeyError 'articulations': 51:1, 55:26, 124:26
TypeError: str + NoneType unsupported: 97:21, 51:50

Checking files...
51:1 -> AllMajors file, diff schema, no by dept split

AllPrefixes Schema: 
results
-> articulations
    -> list of depts
        -> name, list of articulations for dept
            -> course objects
            
AllMajors Schema:
results
-> articulations
    -> list of articulations
        -> templateCellId, articulation, 
"""

create_full_glossary()

created df for 2 -> 1
created df for 3 -> 1
created df for 4 -> 1
created df for 5 -> 1
created df for 6 -> 1
created df for 8 -> 1
created df for 9 -> 1
created df for 10 -> 1
created df for 13 -> 1
created df for 14 -> 1
created df for 16 -> 1
created df for 17 -> 1
created df for 18 -> 1
created df for 19 -> 1
created df for 20 -> 1
created df for 25 -> 1
created df for 27 -> 1
created df for 28 -> 1
created df for 30 -> 1
created df for 31 -> 1
created df for 32 -> 1
created df for 33 -> 1
created df for 35 -> 1
created df for 36 -> 1
created df for 38 -> 1
created df for 40 -> 1
created df for 41 -> 1
created df for 43 -> 1
created df for 44 -> 1
created df for 45 -> 1
created df for 47 -> 1
created df for 48 -> 1
created df for 49 -> 1
created df for 51 -> 1
created df for 52 -> 1
created df for 53 -> 1
created df for 54 -> 1
created df for 55 -> 1
created df for 56 -> 1
created df for 57 -> 1
created df for 58 -> 1
created df for 61 -> 1
created df for 62 -> 1
created df for 63 

KeyboardInterrupt: 

In [None]:
"""
Dump glossary with SQLite3
"""

import sqlite3 as sql3

with sql3.connect("./test.db") as conn:
    cursor = conn.cursor()
    
    # create table
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS course_glossary (
            course_id INTEGER PRIMARY KEY,
            inst_id INTEGER,
            course_code TEXT,
            course_name TEXT,
            min_units REAL,
            max_units REAL
        )
    """)
    
    with open("./data/institutions_cc.json", "r") as cc_fp, open("./data/institutions_state.json", "r") as uni_fp:
        cc_ids = json.load(cc_fp).keys()
        uni_ids = json.load(uni_fp).keys()
           
    glossary = []
    for uni_id in uni_ids:
        batch = []
        for cc_id in cc_ids:
            try:
                batch.append(create_course_glossary(
                    articulations=get_query(cc_id=cc_id, uni_id=uni_id),
                    cc=cc_id,
                    uni=uni_id
                ))
            except FileNotFoundError:  # no query exists, no query was scraped
                continue
        glossary.extend(batch)
        

    glossary_df = pd.concat(glossary, ignore_index=True)
    glossary_df.to_sql(
        name="course_glossary", 
        con=conn, 
        if_exists="append", 
        index=False
    )

KEYERROR (cc=51, uni=1): 'articulations'
TYPEERROR (cc=97, uni=21): unsupported operand type(s) for +: 'NoneType' and 'str'
KEYERROR (cc=55, uni=26): 'articulations'
KEYERROR (cc=124, uni=26): 'articulations'
TYPEERROR (cc=51, uni=50): unsupported operand type(s) for +: 'NoneType' and 'str'


IntegrityError: UNIQUE constraint failed: course_glossary.course_id

In [None]:
import pandas as pd

# Example DataFrames
df1 = pd.DataFrame({
    'course_id': [1, 2, 3],
    'course_name': ['Math', 'Science', 'History'],
    'min_units': [3, 4, 3],
    'max_units': [5, 6, 4]
})

df2 = pd.DataFrame({
    'course_id': [4, 5, 6],
    'course_name': ['Geography', 'Literature', 'Physics'],
    'min_units': [3, 3, 4],
    'max_units': [5, 6, 6]
})

# Concatenate the DataFrames
df_combined = pd.concat([df1, df2], ignore_index=True)

# Print the resulting DataFrame
print(df_combined)


   course_id course_name  min_units  max_units
0          1        Math          3          5
1          2     Science          4          6
2          3     History          3          4
3          4   Geography          3          5
4          5  Literature          3          6
5          6     Physics          4          6
