In [1]:
import os
import json
import boto3
import requests

import polars as pl

from botocore import UNSIGNED
from botocore.config import Config

from rich.pretty import pprint


In [2]:
# Base URL of the bucket's CDN endpoint; get_df() and get_json() append a file
# name to it to build the download URL.
BASE_URL = "https://cdl-segg.fra1.cdn.digitaloceanspaces.com/cdl-segg"

In [3]:
def get_df(file_name: str, timeout: float = 30.0) -> pl.DataFrame:
    """Download a Parquet file from the bucket and load it into a DataFrame.

    Args:
        file_name: Name of the object relative to ``BASE_URL``,
            e.g. ``"legal_resources.parquet"``.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without it a stalled connection
            would block the cell indefinitely.

    Returns:
        The file's contents as a polars DataFrame.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(f"{BASE_URL}/{file_name}", timeout=timeout)
    response.raise_for_status()

    # The whole response body is held in memory and handed to polars as bytes.
    return pl.read_parquet(response.content)

In [4]:
def get_json(file_name: str, timeout: float = 30.0) -> dict:
    """Download a JSON file from the bucket and decode it.

    Args:
        file_name: Name of the object relative to ``BASE_URL``,
            e.g. ``"downloads_tree.json"``.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without it a stalled connection
            would block the cell indefinitely.

    Returns:
        The decoded JSON document. The annotation says ``dict``; ``json.loads``
        returns whatever the top-level JSON value of the file is.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    response = requests.get(f"{BASE_URL}/{file_name}", timeout=timeout)
    response.raise_for_status()

    return json.loads(response.content)

In [5]:
def show_bucket_objects() -> None:
    """Print the file names stored under the ``cdl-segg/`` prefix of the bucket.

    The S3 client sends unsigned requests (no credentials) to the DigitalOcean
    Spaces endpoint. Objects whose base name starts with ``smoke_test_`` are
    left out. The count is printed first, followed by the list of names.
    """
    s3 = boto3.client(
        "s3",
        endpoint_url="https://fra1.digitaloceanspaces.com",
        config=Config(signature_version=UNSIGNED),
    )

    bucket_name = "cdl-segg"
    # A single list_objects_v2 call returns at most 1000 keys; paginate so the
    # listing stays complete if the bucket grows beyond that.
    pages = s3.get_paginator("list_objects_v2").paginate(
        Bucket=bucket_name,
        Prefix="cdl-segg/",
    )

    file_names = [
        os.path.basename(item["Key"])
        for page in pages
        # "Contents" is absent from a response when nothing matches the prefix.
        for item in page.get("Contents", [])
    ]
    file_names = [name for name in file_names if not name.startswith("smoke_test_")]

    print(f"parquets: {len(file_names)}")
    pprint(file_names)

In [6]:
# List the files currently available in the bucket (smoke-test objects excluded).
show_bucket_objects()

parquets: 9


In [7]:
# df_sections = get_df(file_name="sections.parquet")
# print(len(df_sections))
# df_sections.head()

In [8]:
# df_posts = get_df(file_name="posts.parquet")
# print(len(df_posts))
# df_posts.head()

In [9]:
# df_downloads = get_df(file_name="downloads.parquet")
# print(len(df_downloads))

# display(df_downloads["file_type"].value_counts())
# df_downloads.head()

In [10]:
# download_tree = get_json(file_name="downloads_tree.json")
# pprint(download_tree)

In [11]:
# Legal resources: one row per legal text, with its HTML and jurisdiction code.
df_legal_resources = get_df("legal_resources.parquet")
print(df_legal_resources.height)
df_legal_resources.head()

32


url,type,title,html,jurisdiction
str,str,str,str,cat
"""https://gesetze.berlin.de/perm…","""Schulgesetz""","""SchulG""",""" <a name=""DocInhalt""> </a> <…","""DE_BE"""
"""https://bravors.brandenburg.de…","""Schulgesetz""","""BbgSchulG""",""" <ul> <li><a>Ansicht drucken…","""DE_BB"""
"""https://landesrecht.thueringen…","""Schulgesetz""","""ThürSchulG""",""" <a name=""DocInhalt""> </a> <…","""DE_TH"""
"""https://landesrecht.thueringen…","""Schulordnung""","""ThürSchulO""",""" <a name=""DocInhalt""> </a> <…","""DE_TH"""
"""https://www.landesrecht.sachse…","""Schulgesetz""","""SchulG LSA""",""" <a name=""DocInhalt""> </a> <…","""DE_ST"""


In [12]:
# Publications (books/articles), including the raw PDF bytes in `pdf_binary`.
# Stored under its own name so the legal-resources frame loaded earlier is not
# overwritten.
df_publications = get_df(file_name="publications.parquet")
print(len(df_publications))
df_publications.head()

3


key,type,title,authors,abstract,date,url,pdf_binary,jurisdiction,school_type,tags
str,str,str,list[str],str,str,str,binary,cat,cat,list[str]
"""21001957/D794UHKJ""","""book""","""Pimp my school : 1x1 der SV-Ar…","[""Felix Scheel""]","""Das Handbuch greift alle klass…","""2023-08""","""https://collections.fes.de/pub…","b""%PDF-1.7\x0d%\xe2\xe3\xcf\xd3\x0d\x0a4814\x200\x20obj\x0d<</Linearized\x201/L\x207810875/O\x204816/""…","""DE_MV""",,[]
"""21001957/MRSKJQKT""","""book""","""Rechtsextremismus & Schule: Th…","[""Schule ohne Rassismus – Schule mit Courage""]",,"""2024""","""https://www.schule-ohne-rassis…","b""%PDF-1.7\x0d%\xe2\xe3\xcf\xd3\x0d\x0a1320\x200\x20obj\x0d<</Linearized\x201/L\x209313914/O\x201325/""…",,,
"""21001957/GBX9H2JU""","""article""","""Handbuch für Klassensprecher*i…","[""Jugendbildungsstätte Kurt Löwenstein""]",,"""2024""","""https://www.kurt-loewenstein.d…","b""%PDF-1.6\x0d%\xe2\xe3\xcf\xd3\x0d\x0a1\x200\x20obj\x0d<</Lang(de-DE)/Metadata\x202\x200\x20R/Output""…","""DE_BE""","""elementary""",[]


In [13]:
# SVTipps pages: title, URL, HTML content and (sub)category per page.
df_svtipps = get_df("svtipps.parquet")
print(df_svtipps.height)
df_svtipps.head()

97


title,url,html_content,category,subcategory
str,str,str,str,str
"""Struktur – SVTipps – Von Schül…","""https://svtipps.de/struktur/""","""<h1>Struktur</h1><p>Ihr wollt,…","""Struktur""",
"""Die Schülervertretung – SVTipp…","""https://svtipps.de/struktur/di…","""<h1>Die Schülervertretung</h1>…","""Struktur""",
"""Schulsprecher*in – SVTipps – V…","""https://svtipps.de/struktur/sc…","""<h1>Schulsprecher*in</h1><p>Di…","""Struktur""",
"""Klassensprecher*in & Stufenspr…","""https://svtipps.de/struktur/kl…","""<h1>Klassensprecher*in &amp; S…","""Struktur""",
"""Kassenwart – SVTipps – Von Sch…","""https://svtipps.de/struktur/ka…","""<h1>Kassenwart</h1><p>Der Kass…","""Struktur""",


In [None]:
# TODO: Ask Jonas why the glossary carries per-jurisdiction columns and how it
# is meant to be used.
df_glossary_terms = get_df("glossary_terms.parquet")
print(df_glossary_terms.height)
df_glossary_terms.head()

33


term,definition,DE,DE_BW,DE_BY,DE_BE,DE_BB,DE_HB,DE_HH,DE_HE,DE_MV,DE_NI,DE_NW,DE_RP,DE_SL,DE_SN,DE_ST,DE_SH,DE_TH
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""Antrag""","""Ein Antrag ist ein Vorschlag, …",,,,,,,,,,,,,,,,,
"""Aufgabenprofil""","""Ihr schreibt transparent und k…",,,,,,,,,,,,,,,,,
"""Auftaktveranstaltung / SV-Tag""","""Start in die SV-Arbeit zu Begi…","""Start in die SV-Arbeit zu Begi…",,,,,,,,,,,,,,,,
"""Bezirks-/ Kreis & Landesschüle…",,"""BSK (Bundesschülerkonferenz – …",,,"""BSA und LSA (Bezirksschüler- u…","""KSR und LSR (Kreisschüler- und…",,,,"""KSR und LSR (Kreisschüler- und…",,"""BSV und LSV (Bezirksschüler- u…",,,,"""KSR und LSR (Kreisschüler- und…",,"""Kreisschülersprecher:innen und…"
"""BSK""","""Bundesschülerkonferenz. Sie be…","""Bundesschülerkonferenz. Sie be…",,,,,,,,,,,,,,,,


In [None]:
# TODO: Ask Jonas what this dataset contains.
df_student_council_committees = get_df("student_council_committees.parquet")

print(df_student_council_committees.height)
df_student_council_committees.head()