In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

INTERIM = Path("/Users/ms/Projects/acoe-edu-data-portfolio/data/01_caldashboard/interim")
PROCESSED = Path("/Users/ms/Projects/acoe-edu-data-portfolio/data/01_caldashboard/processed")

PROCESSED.mkdir(parents=True, exist_ok=True)

# Privacy / aggregation settings
SMALL_N_SUPPRESS = 10         # suppress rates where enrolled (or cohort) < 10
ROUND_TO = 4                  # decimal places for rates
YEAR_FILTER = None            # e.g., 2023 to keep a single year; or None for all

pd.options.display.max_columns = 120
pd.options.display.width = 160

In [2]:
# Prefer parquet; fall back to CSV
clean_files = sorted(INTERIM.glob("*_clean.parquet"))
if not clean_files:
    clean_files = sorted(INTERIM.glob("*_clean.csv"))
    loader = pd.read_csv
else:
    loader = pd.read_parquet

if not clean_files:
    raise FileNotFoundError("No *_clean.[parquet|csv] files found in data/interim/. Run 01_ingest_clean first.")

dfs = []
for fp in clean_files:
    df = loader(fp)
    df["__source"] = fp.name
    dfs.append(df)

base = pd.concat(dfs, ignore_index=True)
print("Loaded rows:", len(base), "| columns:", list(base.columns))


Loaded rows: 343602 | columns: ['year', 'aggregate_level_code', 'aggregate_level', 'county_code', 'district_code', 'school_code', 'district_cds', 'cds_code', 'county_name', 'district_name', 'school_name', 'charter', 'dass', 'reporting_category', 'subgroup_code', 'subgroup', 'cohort', 'chronic_absent_count', 'chronic_absent_rate', 'chronic_absent_rate_calc', '__source']


In [3]:
# Ensure numeric types for cohort/count/rate if present
for c in ("cohort","chronic_absent_count","chronic_absent_rate","chronic_absent_rate_calc"):
    if c in base.columns:
        base[c] = pd.to_numeric(base[c], errors="coerce")

# Make sure text keys are trimmed
for t in ("county_name","district_name","school_name","subgroup","reporting_category"):
    if t in base.columns:
        base[t] = base[t].astype(str).str.strip()

# Optional year filter
if YEAR_FILTER is not None and "year" in base.columns:
    base = base[base["year"] == YEAR_FILTER].copy()
    print("Filtered to year:", YEAR_FILTER, "| rows:", len(base))


In [4]:
def select_base(df: pd.DataFrame) -> pd.DataFrame:
    lvl_col = "aggregate_level" if "aggregate_level" in df.columns else None
    if lvl_col:
        lvls = df[lvl_col].dropna().astype(str).str.lower().unique().tolist()
    else:
        lvls = []

    if "school" in lvls:
        # Use school rows; include both charter Y/N naturally
        out = df[df[lvl_col].str.lower().eq("school")].copy()
        grain = "school"
    elif "district" in lvls:
        out = df[df[lvl_col].str.lower().eq("district")].copy()
        # If charter/dass exist, pick 'All' rows when available
        for col in ("charter","dass"):
            if col in out.columns:
                has_all = out[col].astype(str).str.upper().eq("ALL").any()
                if has_all:
                    out = out[out[col].astype(str).str.upper().eq("ALL")].copy()
        grain = "district"
    else:
        # fallback: use as-is
        out = df.copy()
        grain = "(unknown)"

    print(f"Base grain: {grain} | rows: {len(out)}")
    return out

base_grain = select_base(base)


Base grain: school | rows: 208350


In [5]:
def keys_for_level(df, level):
    if level == "district":
        keys = [c for c in ("district_name","district_code","district_cds","county_name","county_code","year","subgroup") if c in df.columns]
    elif level == "county":
        keys = [c for c in ("county_name","county_code","year","subgroup") if c in df.columns]
    elif level == "state":
        keys = [c for c in ("year","subgroup") if c in df.columns]
    else:
        raise ValueError(level)
    return keys

def agg_chronic(df, level):
    keys = keys_for_level(df, level)
    grp = df.groupby(keys, dropna=False).agg(
        cohort=("cohort","sum"),
        chronic_absent_count=("chronic_absent_count","sum")
    ).reset_index()
    grp["chronic_absent_rate"] = (grp["chronic_absent_count"] / grp["cohort"]).round(ROUND_TO)
    grp["non_chronic_rate"] = (1 - grp["chronic_absent_rate"]).round(ROUND_TO)
    grp.insert(0, "geo_level", level)
    return grp


In [6]:
levels = ["district","county","state"]
tables = [agg_chronic(base_grain, lv) for lv in levels]
kpi_wide = pd.concat(tables, ignore_index=True)
print("kpi_wide:", kpi_wide.shape)
kpi_wide.head()


kpi_wide: (24615, 12)


Unnamed: 0,geo_level,district_name,district_code,district_cds,county_name,county_code,year,subgroup,cohort,chronic_absent_count,chronic_absent_rate,non_chronic_rate
0,district,ABC Unified,64212,1964212,Los Angeles,19,2024,All Students,18598.0,2559.0,0.1376,0.8624
1,district,ABC Unified,64212,1964212,Los Angeles,19,2024,American Indian/Alaska Native,0.0,0.0,,
2,district,ABC Unified,64212,1964212,Los Angeles,19,2024,Asian,4303.0,195.0,0.0453,0.9547
3,district,ABC Unified,64212,1964212,Los Angeles,19,2024,Black/African American,1292.0,280.0,0.2167,0.7833
4,district,ABC Unified,64212,1964212,Los Angeles,19,2024,English Learners,2213.0,418.0,0.1889,0.8111


In [7]:
def suppress_small_n(df, threshold=SMALL_N_SUPPRESS):
    out = df.copy()
    if "cohort" in out.columns and threshold:
        mask = out["cohort"].notna() & (out["cohort"] < threshold)
        for col in ("chronic_absent_rate","non_chronic_rate"):
            if col in out.columns:
                out.loc[mask, col] = pd.NA
    return out

kpi_wide_s = suppress_small_n(kpi_wide, SMALL_N_SUPPRESS)


In [8]:
def add_gap_vs_all(df):
    out = df.copy()
    # define grouping keys that identify a geo grouping for the same year
    def group_keys(cols):
        return [c for c in cols if c in out.columns]

    # group templates by level
    grp_keys = {
        "district": group_keys(["geo_level","district_name","year"]),
        "county":   group_keys(["geo_level","county_name","year"]),
        "state":    group_keys(["geo_level","year"]),
    }

    parts = []
    for level, gkeys in grp_keys.items():
        if not gkeys: 
            continue
        sub = out[out["geo_level"]==level].copy()
        # find All Students rate
        all_rows = sub[sub["subgroup"].astype(str).str.lower().eq("all students")][gkeys + ["chronic_absent_rate"]]
        all_rows = all_rows.rename(columns={"chronic_absent_rate":"rate_all"})
        merged = sub.merge(all_rows, on=gkeys, how="left")
        merged["gap_vs_all"] = (merged["chronic_absent_rate"] - merged["rate_all"]).round(ROUND_TO)
        parts.append(merged)

    return pd.concat(parts, ignore_index=True) if parts else out

kpi_wide_eq = add_gap_vs_all(kpi_wide_s)
print("With gaps:", kpi_wide_eq.shape)
kpi_wide_eq.head()


With gaps: (25152, 14)


Unnamed: 0,geo_level,district_name,district_code,district_cds,county_name,county_code,year,subgroup,cohort,chronic_absent_count,chronic_absent_rate,non_chronic_rate,rate_all,gap_vs_all
0,district,ABC Unified,64212,1964212,Los Angeles,19,2024,All Students,18598.0,2559.0,0.1376,0.8624,0.1376,0.0
1,district,ABC Unified,64212,1964212,Los Angeles,19,2024,American Indian/Alaska Native,0.0,0.0,,,0.1376,
2,district,ABC Unified,64212,1964212,Los Angeles,19,2024,Asian,4303.0,195.0,0.0453,0.9547,0.1376,-0.0923
3,district,ABC Unified,64212,1964212,Los Angeles,19,2024,Black/African American,1292.0,280.0,0.2167,0.7833,0.1376,0.0791
4,district,ABC Unified,64212,1964212,Los Angeles,19,2024,English Learners,2213.0,418.0,0.1889,0.8111,0.1376,0.0513


In [9]:
id_vars = [c for c in ("geo_level","county_name","county_code","district_name","district_code","district_cds","year","subgroup","cohort") if c in kpi_wide_eq.columns]
value_vars = [c for c in ("chronic_absent_rate","non_chronic_rate","gap_vs_all") if c in kpi_wide_eq.columns]

kpi_long = kpi_wide_eq.melt(id_vars=id_vars, value_vars=value_vars,
                            var_name="metric", value_name="value").dropna(subset=["value"])
print("kpi_long:", kpi_long.shape)
kpi_long.head()


kpi_long: (54716, 11)


Unnamed: 0,geo_level,county_name,county_code,district_name,district_code,district_cds,year,subgroup,cohort,metric,value
0,district,Los Angeles,19,ABC Unified,64212,1964212,2024,All Students,18598.0,chronic_absent_rate,0.1376
2,district,Los Angeles,19,ABC Unified,64212,1964212,2024,Asian,4303.0,chronic_absent_rate,0.0453
3,district,Los Angeles,19,ABC Unified,64212,1964212,2024,Black/African American,1292.0,chronic_absent_rate,0.2167
4,district,Los Angeles,19,ABC Unified,64212,1964212,2024,English Learners,2213.0,chronic_absent_rate,0.1889
5,district,Los Angeles,19,ABC Unified,64212,1964212,2024,Female,9028.0,chronic_absent_rate,0.1402


In [10]:
kpi_wide_eq.to_csv(PROCESSED / "kpi_chronic_wide_all_levels.csv", index=False)
kpi_long.to_csv(PROCESSED / "kpi_chronic_long_all_levels.csv", index=False)

for lv in kpi_wide_eq["geo_level"].dropna().unique():
    kpi_wide_eq[kpi_wide_eq["geo_level"]==lv].to_csv(PROCESSED / f"kpi_chronic_wide_{lv}.csv", index=False)

print("Saved:",
      (PROCESSED / "kpi_chronic_wide_all_levels.csv").as_posix(),
      (PROCESSED / "kpi_chronic_long_all_levels.csv").as_posix())


Saved: /Users/ms/Projects/acoe-edu-data-portfolio/data/01_caldashboard/processed/kpi_chronic_wide_all_levels.csv /Users/ms/Projects/acoe-edu-data-portfolio/data/01_caldashboard/processed/kpi_chronic_long_all_levels.csv


In [11]:
kpi_wide_eq.to_csv(PROCESSED / "kpi_chronic_wide_all_levels.csv", index=False)
kpi_long.to_csv(PROCESSED / "kpi_chronic_long_all_levels.csv", index=False)

for lv in kpi_wide_eq["geo_level"].unique():
    kpi_wide_eq[kpi_wide_eq["geo_level"]==lv].to_csv(PROCESSED / f"kpi_chronic_wide_{lv}.csv", index=False)

print("Saved:",
      (PROCESSED / "kpi_chronic_wide_all_levels.csv").as_posix(),
      (PROCESSED / "kpi_chronic_long_all_levels.csv").as_posix())


Saved: /Users/ms/Projects/acoe-edu-data-portfolio/data/01_caldashboard/processed/kpi_chronic_wide_all_levels.csv /Users/ms/Projects/acoe-edu-data-portfolio/data/01_caldashboard/processed/kpi_chronic_long_all_levels.csv


In [12]:
# Top districts by chronic absenteeism (All Students)
mask_all = kpi_wide_eq["subgroup"].astype(str).str.lower().eq("all students")
display(
    kpi_wide_eq[(kpi_wide_eq["geo_level"]=="district") & mask_all]
    .sort_values("chronic_absent_rate", ascending=False)
    .head(10)[["district_name","year","chronic_absent_rate","cohort"]]
)

# Largest subgroup gaps (positive = subgroup higher than All Students)
display(
    kpi_wide_eq[(kpi_wide_eq["geo_level"]=="district") & ~mask_all]
    .dropna(subset=["gap_vs_all"])
    .sort_values("gap_vs_all", ascending=False)
    .head(10)[["district_name","year","subgroup","gap_vs_all","cohort"]]
)


Unnamed: 0,district_name,year,chronic_absent_rate,cohort
8480,Inyo County Office of Education,2024,0.7812,2180.0
1836,Blake Elementary,2024,0.7619,21.0
21559,Trinity County Office of Education,2024,0.75,160.0
18103,San Benito County Office of Education,2024,0.7426,101.0
8170,Hot Springs Elementary,2024,0.7368,19.0
9526,Klamath-Trinity Joint Unified,2024,0.6808,1015.0
21811,Twin Ridges Elementary,2024,0.678,118.0
9832,Lake County Office of Education,2024,0.6545,55.0
15402,Peninsula Union,2024,0.6438,73.0
18310,San Francisco County Office of Education,2024,0.6137,321.0


Unnamed: 0,district_name,year,subgroup,gap_vs_all,cohort
18838,San Rafael City High,2024,Foster,0.7173,12.0
22417,Warner Unified,2024,Foster,0.6923,16.0
19003,Santa Clara County Office of Education,2024,Foster,0.6807,31.0
13811,Newman-Crows Landing Unified,2024,Homeless,0.6749,12.0
616,Anderson Union High,2024,Homeless,0.64,26.0
18630,San Marcos Unified,2024,Foster,0.6256,21.0
5013,Downey Unified,2024,Native Hawaiian/Pacific Islander,0.597,12.0
3002,Capistrano Unified,2024,Foster,0.583,23.0
6544,Fremont Union High,2024,Homeless,0.5821,42.0
17296,Roseland,2024,Two or More Races,0.575,13.0
