In [None]:
import os
from pathlib import Path

import polars as pl

# typing
from polars import DataType, Schema
from polars._typing import PolarsDataType

In [None]:
# Root of the project checkout; all data globs in this notebook are resolved
# against it. Set the CACOURSES_PROJECTDIR environment variable to run the
# notebook from another checkout; an unset or empty variable falls back to the
# original location.
PROJECTDIR = Path(
    os.environ.get("CACOURSES_PROJECTDIR") or "/home/akash/Main/projects/CACourses"
)

In [None]:
def _merge_dtypes(dtype1: PolarsDataType, dtype2: PolarsDataType) -> PolarsDataType:
    """Recursively merge two Polars data types into a common supertype.

    Rules, applied in order:

    1. Equal types are returned unchanged.
    2. ``Null`` gives way to the other type.
    3. Two ``List`` types merge their inner types.
    4. Two ``Struct`` types merge field by field; a field present in only one
       of them is kept as is.
    5. Any other pair is resolved by Polars' own supertype coercion.

    Args:
        dtype1: First data type.
        dtype2: Second data type.

    Returns:
        A data type that both inputs can be represented as.

    Raises:
        TypeError: If Polars cannot coerce the two types to a common supertype.
    """
    # If types are the same, no merge needed
    if dtype1 == dtype2:
        return dtype1

    # Null type is superseded by any other type
    if isinstance(dtype1, pl.Null):
        return dtype2
    if isinstance(dtype2, pl.Null):
        return dtype1

    # Recursively merge for List types
    if isinstance(dtype1, pl.List) and isinstance(dtype2, pl.List):
        return pl.List(_merge_dtypes(dtype1.inner, dtype2.inner))

    # Recursively merge for Struct types
    if isinstance(dtype1, pl.Struct) and isinstance(dtype2, pl.Struct):
        # Start from the first struct's fields, then fold in the second's
        merged_fields = dtype1.to_schema()
        for field_name, field_dtype in dtype2.to_schema().items():
            if field_name in merged_fields:
                merged_fields[field_name] = _merge_dtypes(merged_fields[field_name], field_dtype)
            else:
                merged_fields[field_name] = field_dtype
        return pl.Struct(merged_fields)

    # For other types, use Polars' built-in supertype casting
    try:
        # `pl.concat` accepts only how="vertical" for Series, so the two probe
        # values are wrapped in one-column frames; "vertical_relaxed" then
        # coerces the shared column to the common supertype.
        probes = [
            pl.Series("probe", [None], dtype=dtype).to_frame()
            for dtype in (dtype1, dtype2)
        ]
        return pl.concat(probes, how="vertical_relaxed").schema["probe"]
    except Exception as exc:
        # If Polars can't find a common supertype (e.g., Int64 and Struct), raise an error
        raise TypeError(f"Could not merge incompatible types: {dtype1} and {dtype2}") from exc


def merge_schemas(schemas: list[pl.Schema]) -> pl.Schema:
    """
    Fold several Polars schemas into one schema that covers all of them.

    Columns are kept in first-seen order. A column that appears in more than
    one schema gets the merged type of all its occurrences (see
    `_merge_dtypes`); a column seen only once keeps its type.

    Args:
        schemas: Schemas to combine (e.g., from `df.schema`).

    Returns:
        The combined schema, or an empty schema when `schemas` is empty.
    """
    if not schemas:
        return pl.Schema()

    # Seed the result with a copy of the first schema so the input is untouched.
    unified = dict(schemas[0])

    for later_schema in schemas[1:]:
        for column, dtype in later_schema.items():
            if column not in unified:
                # First time this column is seen: take its type as is.
                unified[column] = dtype
                continue
            # Column already known: widen to a type that fits both.
            unified[column] = _merge_dtypes(unified[column], dtype)

    return Schema(unified)


def extract_articulations_optimized(fp: str | Path, schema: pl.Schema) -> pl.DataFrame:
    """Flatten one articulation JSON file into one row per course id.

    Two integer ids are parsed from the path: the parent directory name is read
    as ``uni`` and the text before the first ``"to"`` in the file name as
    ``cc``.

    Files whose *name* contains ``"prefixes"`` carry a list column
    ``articulations`` that is exploded to one row per entry; every other file
    carries a single ``articulation`` column that is renamed to match.

    Rows whose ``sendingArticulation.items`` list is null or empty are dropped.

    Args:
        fp: Path to the JSON file.
        schema: Schema handed to ``pl.read_json``.

    Returns:
        A frame with the columns ``course_id``, ``cc``, ``uni``,
        ``articulation`` (a list of ``{conj, items}`` structs, where ``items``
        is a list of course ids) and ``groupConj`` (the first group
        conjunction of the row, ``"Or"`` when absent).

    Raises:
        ValueError: If either id parsed from the path is not an integer.
    """
    path = Path(fp)
    uni = int(path.parts[-2])
    cc = int(path.name.split("to")[0])

    raw = pl.read_json(fp, schema=schema)
    # Decide on the file name only, so a directory that happens to contain
    # "prefixes" cannot misclassify a file.
    if "prefixes" in path.name:
        df = raw.explode("articulations")
    else:
        df = raw.rename({"articulation": "articulations"})

    articulations = pl.col("articulations")
    sending = pl.col("sendingArticulation")

    # Row-level conjunction joining the groups. It is built as its own column
    # because `list.eval` only evaluates against the list elements
    # (`pl.element()`) and rejects references to named columns.
    group_conj = (
        sending.struct.field("courseGroupConjunctions")
        .list.first()
        .struct.field("groupConjunction")
        .fill_null("Or")
    )

    return (
        df.select(
            articulations.struct.field("course").struct.field("courseIdentifierParentId").alias("course_id"),
            articulations.struct.field("series"),
            articulations.struct.field("sendingArticulation"),
        )
        .with_columns(pl.col("series").struct.field("courses"))
        .explode("courses")
        .with_columns(
            # Fall back to the id of the exploded series course when the row
            # has no direct course id.
            course_id=pl.coalesce(
                "course_id",
                pl.col("courses").struct.field("courseIdentifierParentId"),
            ),
            # Empty item lists become null so the drop_nulls below removes them.
            items=pl.when(sending.struct.field("items").list.len() > 0)
                 .then(sending.struct.field("items"))
                 .otherwise(None),
            cc=pl.lit(cc),
            uni=pl.lit(uni),
        )
        .drop_nulls(subset="items")
        .drop("series", "courses")
        .with_columns(
            # One {conj, items} struct per group: the group's own conjunction
            # plus the ids of the courses it contains.
            articulation=pl.col("items").list.eval(
                pl.struct(
                    conj=pl.element().struct.field("courseConjunction"),
                    items=pl.element().struct.field("items").list.eval(
                        pl.element().struct.field("courseIdentifierParentId")
                    ),
                )
            ),
            groupConj=group_conj,
        )
        .drop("sendingArticulation", "items")
    )

In [None]:
def _file_schemas(pattern: str) -> list[pl.Schema]:
    """Infer one schema per JSON file under PROJECTDIR that matches `pattern`.

    `infer_schema_length=None` is passed so that inference is not capped at a
    fixed number of rows.
    """
    return [
        pl.read_json(source=json_path, infer_schema_length=None).schema
        for json_path in PROJECTDIR.glob(pattern)
    ]


# Collect the per-file schemas of both file families first ...
prefix_file_schemas = _file_schemas("data/*/*prefixes.json")
major_file_schemas = _file_schemas("data/*/*majors.json")

# ... then collapse each family into a single schema covering all its files.
schema_prefix = merge_schemas(prefix_file_schemas)
schema_major = merge_schemas(major_file_schemas)

# The per-file lists are no longer needed once merged.
del prefix_file_schemas, major_file_schemas

In [55]:
# Extract every "-prefixes" file with the merged prefix schema, stack the
# per-file frames, and drop exact duplicate rows.
prefix_frames = [
    extract_articulations_optimized(json_path, schema_prefix)
    for json_path in PROJECTDIR.glob("data/*/*-prefixes.json")
]
prefixes_agg = pl.concat(prefix_frames).unique()
del prefix_frames

In [56]:
# Extract every "-majors" file with the merged majors schema, stack the
# per-file frames, and drop exact duplicate rows.
major_frames = [
    extract_articulations_optimized(json_path, schema_major)
    for json_path in PROJECTDIR.glob("data/*/*-majors.json")
]
majors_agg = pl.concat(major_frames).unique()
del major_frames

In [57]:
# Sample for inspection: prefix rows whose `articulation` list has more than
# two entries, limited to the 10 rows ranked highest by `articulation`.
test_frame = (
    prefixes_agg
    .filter(pl.col("articulation").list.len() > 2)
    .top_k(by=pl.col("articulation"), k=10)
)

In [58]:
# Show the sampled rows as this cell's output.
test_frame

course_id,cc,uni,articulation,groupConj
i64,i32,i32,list[struct[2]],str
65915,121,98,"[{""Or"",[386545]}, {""And"",[386546]}, {""And"",[386549]}]","""Or"""
237149,135,98,"[{""Or"",[385694]}, {""And"",[385695]}, {""And"",[385696]}]","""Or"""
374012,110,85,"[{""Or"",[383729]}, {""And"",[383728]}, … {""And"",[385834]}]","""Or"""
261048,14,141,"[{""Or"",[380882, 387926]}, {""Or"",[380845, 386929]}, {""Or"",[380851, 386929]}]","""Or"""
110050,153,128,"[{""Or"",[372214, 372203]}, {""Or"",[372053, 372213]}, {""And"",[371870]}]","""And"""
372609,153,128,"[{""Or"",[372214, 372203]}, {""Or"",[372053, 372213]}, {""And"",[371870]}]","""And"""
235554,153,128,"[{""Or"",[372214, 372203]}, {""Or"",[372053, 372213]}, {""And"",[371870]}]","""And"""
227332,153,128,"[{""Or"",[372214, 372203]}, {""Or"",[372053, 372213]}, {""And"",[371870]}]","""And"""
263383,153,128,"[{""Or"",[372214, 372203]}, {""Or"",[372053, 372213]}, {""And"",[371870]}]","""And"""
372611,153,128,"[{""Or"",[372214, 372203]}, {""Or"",[372053, 372213]}, {""And"",[371870]}]","""And"""


In [None]:
def convert_to_dnf(item: int | dict[str, int | dict]) -> list[list[int]]:
    if isinstance(item, int):
        return [[item]]
    