In [None]:
import os
import json
import boto3
import requests

import polars as pl

from botocore import UNSIGNED
from botocore.config import Config

from rich.pretty import pprint


In [None]:
# Base URL that get_df() and get_json() prepend to a file name when downloading.
BASE_URL = "https://cdl-segg.fra1.cdn.digitaloceanspaces.com/cdl-segg"

In [None]:
def get_df(file_name: str, timeout: float = 30.0) -> pl.DataFrame:
    """Download a Parquet file from the bucket and load it with polars.

    Args:
        file_name: Object name appended to ``BASE_URL``
            (e.g. ``"posts.parquet"``).
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without one a
            stalled connection would block the notebook indefinitely.

    Returns:
        The downloaded file parsed into a ``pl.DataFrame``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(f"{BASE_URL}/{file_name}", timeout=timeout)
    response.raise_for_status()

    # The whole body is held in memory and parsed from the raw bytes.
    return pl.read_parquet(response.content)

In [None]:
def get_json(file_name: str, timeout: float = 30.0) -> dict:
    """Download a JSON file from the bucket and decode it.

    Args:
        file_name: Object name appended to ``BASE_URL``.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without one a
            stalled connection would block the notebook indefinitely.

    Returns:
        The decoded document. The ``dict`` annotation assumes the file's
        top-level value is a JSON object -- TODO confirm against the data.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    response = requests.get(f"{BASE_URL}/{file_name}", timeout=timeout)
    response.raise_for_status()

    return json.loads(response.content)

In [None]:
def show_bucket_objects() -> None:
    """Print the names of the objects stored under ``cdl-segg/`` in the bucket.

    Lists the bucket with unsigned requests, reduces every key to its base
    name, drops names starting with ``smoke_test_`` and prints the remaining
    count followed by the names themselves.

    All result pages are consumed, so listings longer than one page are
    reported in full, and an empty listing prints a count of zero instead of
    raising ``KeyError``.
    """
    s3 = boto3.client(
        "s3",
        endpoint_url="https://fra1.digitaloceanspaces.com",
        # Requests are sent unsigned, so no credentials are needed.
        config=Config(signature_version=UNSIGNED),
    )

    bucket_name = "cdl-segg"
    # A single list_objects_v2 call returns at most one page of keys; the
    # paginator follows the continuation tokens for us.
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket=bucket_name,
        Prefix="cdl-segg/",
    )

    # "Contents" is absent from a page when no key matches the prefix.
    names = (
        os.path.basename(item["Key"])
        for page in pages
        for item in page.get("Contents", [])
    )
    parquets = [name for name in names if not name.startswith("smoke_test_")]

    print(f"parquets: {len(parquets)}")
    pprint(parquets)

In [None]:
# List what is available in the bucket before loading individual files.
show_bucket_objects()

In [None]:
# Sections: load the table, report its row count, preview the first rows.
df_sections = get_df("sections.parquet")
print(df_sections.height)
df_sections.head()

In [None]:
# Posts: load the table, report its row count, preview the first rows.
df_posts = get_df("posts.parquet")
print(df_posts.height)
df_posts.head()

In [None]:
# TODO: Ask Jonas.
# TODO: category_title is sometimes a region -- do I always need the region?
df_downloads = get_df("downloads.parquet")
print(df_downloads.height)

# Show how the rows are distributed over file types, then preview the table.
display(df_downloads["file_type"].value_counts())
df_downloads.head()

In [None]:
# unique() does not guarantee any order by default; keeping first-appearance
# order makes the displayed list stable across re-runs.
df_downloads["category_title"].unique(maintain_order=True).to_list()

In [None]:
# download_tree = get_json(file_name="downloads_tree.json")
# pprint(download_tree)

In [None]:
# Legal resources: load the table, report its row count, preview the first rows.
df_legal_resources = get_df("legal_resources.parquet")
print(df_legal_resources.height)
df_legal_resources.head()

In [None]:
# Publications get their own name: this cell previously reused
# `df_legal_resources`, silently overwriting the legal-resources frame.
df_publications = get_df(file_name="publications.parquet")
print(len(df_publications))
df_publications.head()

In [None]:
# SV tips: load the table, report its row count, preview the first rows.
df_svtipps = get_df("svtipps.parquet")
print(df_svtipps.height)
df_svtipps.head()

In [None]:
# TODO: Ask Jonas why there is a jurisdiction here, and how to use the glossary.
df_glossary_terms = get_df("glossary_terms.parquet")
print(df_glossary_terms.height)
df_glossary_terms.head()

In [None]:
# Student council committees: load, report the row count, preview the first rows.
committees_file = "student_council_committees.parquet"
df_student_council_committees = get_df(committees_file)
print(df_student_council_committees.height)
df_student_council_committees.head()

In [None]:
# Show every committee name as a plain Python list.
df_student_council_committees["name"].to_list()