[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abigailhaddad/fedscope_new/blob/main/age_bracket_analysis.ipynb)

# Federal Workforce Analysis by Age Bracket: January 2025 - November 2025

This notebook analyzes changes in the federal workforce by age group since January 2025.

**Data sources:**
- Employment snapshot from January 2025 (baseline)
- Separations data from February - November 2025
- Accessions data from February - November 2025

All data is published by OPM (Office of Personnel Management) and downloaded from Hugging Face dataset repositories.

---

## ⚠️ IMPORTANT: DATA FILTERING NOTE

**Each monthly OPM data file contains some delayed reporting from exactly 2 years prior** (~50-60 rows per file from 2023). 

**This analysis EXCLUDES those old records.** We filter on `personnel_action_effective_date_yyyymm` to only include personnel actions that actually occurred in 2025. This ensures we're measuring actual 2025 workforce changes, not delayed paperwork from prior years.

---

In [None]:
from huggingface_hub import hf_hub_download
import pandas as pd
from tqdm.notebook import tqdm

HF_USERNAME = "abigailhaddad"
BASELINE_MONTH = "202501"
# Months whose separations/accessions files are loaded: Feb-Nov 2025 as YYYYMM strings
CHANGE_MONTHS = [f"2025{month_num:02d}" for month_num in range(2, 12)]

# Age brackets from youngest to oldest; used to order tables and charts
AGE_ORDER = [
    "LESS THAN 20",
    "20-24",
    "25-29",
    "30-34",
    "35-39",
    "40-44",
    "45-49",
    "50-54",
    "55-59",
    "60-64",
    "65 OR MORE",
]

## 1. Load January 2025 Baseline (Employment Snapshot)

In [None]:
def download_dataset(data_type: str, month: str) -> pd.DataFrame:
    """Fetch one monthly dataset from the Hugging Face Hub as a DataFrame.

    Args:
        data_type: Dataset family used in the repo name (this notebook passes
            "employment", "separations" and "accessions").
        month: Month in YYYYMM form, e.g. "202501".

    Returns:
        The contents of the ``data.parquet`` file in the dataset repo
        ``{HF_USERNAME}/opm-federal-{data_type}-{month}``.
    """
    local_file = hf_hub_download(
        repo_id=f"{HF_USERNAME}/opm-federal-{data_type}-{month}",
        filename="data.parquet",
        repo_type="dataset",
    )
    return pd.read_parquet(local_file)

# Fetch the January 2025 employment snapshot
print("Downloading January 2025 employment data...")
emp_df = download_dataset("employment", BASELINE_MONTH)

# Coerce counts to numbers; values that cannot be parsed become 0
emp_df["count"] = pd.to_numeric(emp_df["count"], errors="coerce").fillna(0)

# Headcount per age bracket
baseline = (
    emp_df.groupby("age_bracket")["count"]
    .sum()
    .rename("baseline_jan2025")
    .reset_index()
)

# Same numbers keyed by bracket, for the printout below
baseline_by_age = baseline.set_index("age_bracket")["baseline_jan2025"]

print(f"\nTotal workforce in January 2025: {baseline_by_age.sum():,.0f}")
print("\nBy age bracket:")
for bracket in AGE_ORDER:
    # Skip brackets that do not occur in the snapshot
    if bracket in baseline_by_age.index:
        print(f"  {bracket:15} {baseline_by_age[bracket]:>12,.0f}")

## 2. Load Separations and Accessions (Feb - Nov 2025)

In [None]:
def get_monthly_changes(data_type: str, effective_year: str = "2025") -> pd.DataFrame:
    """Get monthly separations or accessions by age bracket.

    Downloads one file per month in CHANGE_MONTHS, keeps only rows whose
    personnel_action_effective_date_yyyymm starts with ``effective_year``
    (this excludes delayed reporting from prior years), and sums ``count``
    per age bracket.

    Args:
        data_type: Dataset family to load, e.g. "separations" or "accessions".
            Also used as the prefix of the output column names.
        effective_year: Prefix the effective-date value must start with for a
            row to be kept. Defaults to "2025".

    Returns:
        A wide DataFrame with one row per age bracket, one
        ``{data_type}_{YYYYMM}`` column per month that had data, and a
        ``{data_type}_total`` column summing those monthly columns.
    """
    date_col = "personnel_action_effective_date_yyyymm"
    monthly_frames = []

    for month in tqdm(CHANGE_MONTHS, desc=f"Loading {data_type}"):
        df = download_dataset(data_type, month)
        # Cast to str before matching so the filter also works when the column
        # is numeric or has missing values; a NaN in the mask would otherwise
        # make the boolean indexing below raise.
        in_year = df[date_col].astype(str).str.startswith(effective_year)
        df = df[in_year].copy()
        # Coerce counts to numbers; values that cannot be parsed become 0
        df["count"] = pd.to_numeric(df["count"], errors="coerce").fillna(0)
        monthly = df.groupby("age_bracket")["count"].sum().reset_index()
        monthly["month"] = month
        monthly_frames.append(monthly)

    combined = pd.concat(monthly_frames, ignore_index=True)

    # Pivot to wide format. Each (age_bracket, month) pair has exactly one row
    # after the groupby above, so pivot_table's default aggregation returns
    # that value unchanged.
    pivoted = combined.pivot_table(
        index="age_bracket",
        columns="month",
        values="count",
        fill_value=0
    ).reset_index()

    # Flatten column names: "202502" -> "{data_type}_202502"
    pivoted.columns = [f"{data_type}_{col}" if col != "age_bracket" else col
                       for col in pivoted.columns]

    # Add total column across all monthly columns
    month_cols = [c for c in pivoted.columns if c.startswith(f"{data_type}_20")]
    pivoted[f"{data_type}_total"] = pivoted[month_cols].sum(axis=1)

    return pivoted

# Load both flow tables (separations first, then accessions)
separations, accessions = (
    get_monthly_changes(flow) for flow in ("separations", "accessions")
)

## 3. Merge and Calculate Changes

In [None]:
# Attach both flow tables to the baseline; brackets absent from a flow table get zeros
result = (
    baseline
    .merge(separations, on="age_bracket", how="left")
    .merge(accessions, on="age_bracket", how="left")
    .fillna(0)
)

# Net flow and rates relative to the January headcount
headcount = result["baseline_jan2025"]
result["net_change"] = result["accessions_total"] - result["separations_total"]
result["pct_change"] = (result["net_change"] / headcount * 100).round(2)
result["pct_separated"] = (result["separations_total"] / headcount * 100).round(2)

# Order rows by position in AGE_ORDER; brackets not listed there sort last (rank 99)
age_rank = {bracket: position for position, bracket in enumerate(AGE_ORDER)}
result = result.sort_values(
    "age_bracket",
    key=lambda brackets: brackets.map(lambda b: age_rank.get(b, 99)),
)

print("Data merged successfully!")

## 4. Summary Statistics

In [None]:
# Workforce-wide totals across all age brackets
total_baseline, total_sep, total_acc = (
    result[column].sum()
    for column in ("baseline_jan2025", "separations_total", "accessions_total")
)
total_net = total_acc - total_sep
net_pct = total_net / total_baseline * 100
estimated_current = total_baseline + total_net

rule = "=" * 60
print(rule)
print("FEDERAL WORKFORCE SUMMARY BY AGE")
print(rule)
print(f"\nTotal workforce (Jan 2025):         {total_baseline:>12,.0f}")
print(f"Total separations (Feb-Nov 2025):   {total_sep:>12,.0f}")
print(f"Total accessions (Feb-Nov 2025):    {total_acc:>12,.0f}")
print(f"Net change:                         {total_net:>+12,.0f} ({net_pct:+.1f}%)")
print(f"\nEstimated current workforce:        {estimated_current:>12,.0f}")

## 5. Changes by Age Bracket

In [None]:
# Headline columns with reader-friendly labels
summary = result[
    ["age_bracket", "baseline_jan2025", "separations_total",
     "accessions_total", "net_change", "pct_change"]
].rename(columns={
    "age_bracket": "Age Bracket",
    "baseline_jan2025": "Jan 2025 Baseline",
    "separations_total": "Separations",
    "accessions_total": "Accessions",
    "net_change": "Net Change",
    "pct_change": "% Change",
})

print("Workforce Changes by Age Bracket (Feb-Nov 2025):\n")
summary

## 6. Visualization: Net Change by Age

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One entry per panel, left to right: (column to plot, x-axis label, title)
panels = [
    ("net_change", 'Net Change (Accessions - Separations)',
     'Absolute Net Change by Age Bracket'),
    ("pct_change", '% Change from Baseline',
     'Percentage Change by Age Bracket'),
]

for ax, (column, xlabel, title) in zip(axes, panels):
    values = result[column]
    # Green bars for positive values, red for everything else (including zero)
    bar_colors = ['green' if v > 0 else 'red' for v in values]
    ax.barh(result['age_bracket'], values, color=bar_colors)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    # Thin reference line at zero
    ax.axvline(x=0, color='black', linewidth=0.5)

plt.tight_layout()
plt.show()

## 7. Monthly Trends by Age Bracket

In [None]:
# Calculate monthly net change (accessions - separations) per age bracket.
# A month whose column is missing from `result` counts as zero.
monthly_frames = []
for month in CHANGE_MONTHS:
    acc = result.get(f"accessions_{month}", 0)
    sep = result.get(f"separations_{month}", 0)
    monthly_frames.append(pd.DataFrame({
        "age_bracket": result["age_bracket"],
        "month": month,
        "net_change": acc - sep,
    }))

# Long format: one row per (month, age bracket)
monthly_df = pd.concat(monthly_frames, ignore_index=True)
monthly_pivot = monthly_df.pivot(index="month", columns="age_bracket", values="net_change")

# Reorder columns youngest to oldest. Only brackets actually present are
# selected, so a bracket missing from the data no longer raises KeyError, and
# brackets not listed in AGE_ORDER are kept at the end instead of being dropped.
known_brackets = [age for age in AGE_ORDER if age in monthly_pivot.columns]
other_brackets = [col for col in monthly_pivot.columns if col not in AGE_ORDER]
monthly_pivot = monthly_pivot[known_brackets + other_brackets]

print("Monthly Net Change by Age Bracket:")
monthly_pivot

## 8. Full Data Table

Click the download icon to save as CSV.

In [None]:
# Headline columns first
headline_cols = ["age_bracket", "baseline_jan2025",
                 "separations_total", "accessions_total", "net_change", "pct_change"]

# Then the per-month columns, sorted by name: separations followed by accessions
sep_months = sorted(c for c in result.columns if c.startswith("separations_20"))
acc_months = sorted(c for c in result.columns if c.startswith("accessions_20"))
display_cols = headline_cols + sep_months + acc_months

full_data = result[display_cols].copy()

full_data

## 9. Download as CSV

In [None]:
# Save to CSV
output_path = "age_bracket_analysis.csv"
full_data.to_csv(output_path, index=False)
print(f"Saved to: {output_path}")
print(f"Rows: {len(full_data)}")

# Create download link (works in Jupyter/Colab). `display` is imported
# explicitly rather than relying on the notebook injecting it, and only a
# missing IPython triggers the plain-text fallback, so unrelated errors and
# interrupts are no longer swallowed.
try:
    from IPython.display import FileLink, display
except ImportError:
    print(f"\nDownload the file directly: {output_path}")
else:
    display(FileLink(output_path))