[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abigailhaddad/fedscope_new/blob/main/2210_workforce_analysis.ipynb)

# IT Workforce (2210) Analysis: January 2025 - November 2025

This notebook analyzes changes in the federal IT workforce (occupational series 2210) by agency since January 2025.

**Data sources:**
- Employment snapshot from January 2025 (baseline)
- Separations data from February - November 2025
- Accessions data from February - November 2025

All data is from OPM (Office of Personnel Management) via HuggingFace.

---

## ⚠️ IMPORTANT: DATA FILTERING NOTE

**Each monthly OPM data file contains some delayed reporting from exactly 2 years prior** (~50-60 rows per file from 2023). 

**This analysis EXCLUDES those old records.** We filter on `personnel_action_effective_date_yyyymm` to only include personnel actions that actually occurred in 2025. This ensures we're measuring actual 2025 workforce changes, not delayed paperwork from prior years.

---

In [None]:
from huggingface_hub import hf_hub_download
import pandas as pd
from tqdm.notebook import tqdm

# Hugging Face account that hosts the OPM datasets.
HF_USERNAME = "abigailhaddad"
# Occupational series under analysis.
SERIES_CODE = "2210"  # IT Specialist
# Employment snapshot month (YYYYMM) used as the starting headcount.
BASELINE_MONTH = "202501"
# Months (YYYYMM) whose separations/accessions are loaded: February-November 2025.
CHANGE_MONTHS = [f"2025{month_number:02d}" for month_number in range(2, 12)]

## 1. Load January 2025 Baseline (Employment Snapshot)

In [None]:
def download_dataset(data_type: str, month: str) -> pd.DataFrame:
    """Fetch one monthly OPM dataset and load it as a DataFrame.

    The dataset repository is ``{HF_USERNAME}/opm-federal-{data_type}-{month}``;
    its ``data.parquet`` file is downloaded and read with pandas.

    Args:
        data_type: Dataset family used in the repository name
            (this notebook uses "employment", "separations", "accessions").
        month: Month in YYYYMM form, used as the repository name suffix.

    Returns:
        The contents of the repository's ``data.parquet`` file.
    """
    parquet_file = hf_hub_download(
        repo_id=f"{HF_USERNAME}/opm-federal-{data_type}-{month}",
        filename="data.parquet",
        repo_type="dataset",
    )
    return pd.read_parquet(parquet_file)

# Load baseline
print("Downloading January 2025 employment data...")
emp_df = download_dataset("employment", BASELINE_MONTH)

# Keep only 2210 rows; counts that cannot be parsed as numbers are treated as 0
is_it_series = emp_df["occupational_series_code"] == SERIES_CODE
it_emp = emp_df.loc[is_it_series].assign(
    count=lambda frame: pd.to_numeric(frame["count"], errors="coerce").fillna(0)
)

# One row per agency with its January headcount, largest workforce first
baseline = (
    it_emp
    .groupby(["agency", "agency_code"])["count"]
    .sum()
    .reset_index(name="baseline_jan2025")
    .sort_values("baseline_jan2025", ascending=False)
)

print(f"\nFound {len(baseline)} agencies with 2210 employees")
print(f"Total 2210 workforce in January 2025: {baseline['baseline_jan2025'].sum():,.0f}")

## 2. Load Separations and Accessions (Feb - Nov 2025)

In [None]:
def get_monthly_changes(data_type: str, action_year: str = "2025") -> pd.DataFrame:
    """Build a per-agency table of monthly 2210 counts for one action type.

    For every month in ``CHANGE_MONTHS`` the ``data_type`` dataset is
    downloaded, filtered to ``SERIES_CODE`` rows whose
    ``personnel_action_effective_date_yyyymm`` starts with ``action_year``
    (which excludes delayed reporting from prior years), and ``count`` is
    summed per agency.

    Args:
        data_type: Dataset family to load ("separations" or "accessions" in
            this notebook). Also used as the prefix of the output columns.
        action_year: Year prefix that the effective-date column must start
            with for a row to be counted. Defaults to "2025".

    Returns:
        Wide DataFrame with one row per (agency, agency_code), one
        ``{data_type}_{YYYYMM}`` column for each month present in the loaded
        data, and a ``{data_type}_total`` column summing those months.
        Agency/month combinations with no rows are filled with 0.
    """
    date_col = "personnel_action_effective_date_yyyymm"
    key_cols = ["agency", "agency_code"]
    monthly_frames = []

    for month in tqdm(CHANGE_MONTHS, desc=f"Loading {data_type}"):
        df = download_dataset(data_type, month)
        in_series = df["occupational_series_code"] == SERIES_CODE
        # na=False: a row with a missing effective date cannot be confirmed as
        # an action in `action_year`, so it is excluded instead of putting NaN
        # into the boolean mask.
        in_year = df[date_col].str.startswith(action_year, na=False)
        it_df = df[in_series & in_year].copy()
        it_df["count"] = pd.to_numeric(it_df["count"], errors="coerce").fillna(0)
        monthly = it_df.groupby(key_cols)["count"].sum().reset_index()
        monthly["month"] = month
        monthly_frames.append(monthly)

    combined = pd.concat(monthly_frames, ignore_index=True)

    # Pivot to wide format. aggfunc is spelled out because pivot_table
    # defaults to the mean, and these values are counts that must be added.
    pivoted = combined.pivot_table(
        index=key_cols,
        columns="month",
        values="count",
        aggfunc="sum",
        fill_value=0
    ).reset_index()

    # Flatten column names: month columns get the data_type prefix
    pivoted.columns = [col if col in key_cols else f"{data_type}_{col}"
                       for col in pivoted.columns]

    # Add total column
    month_cols = [c for c in pivoted.columns if c.startswith(f"{data_type}_20")]
    pivoted[f"{data_type}_total"] = pivoted[month_cols].sum(axis=1)

    return pivoted

# Build the wide per-agency tables; each call downloads one dataset per month
# in CHANGE_MONTHS for the given data type.
separations = get_monthly_changes("separations")
accessions = get_monthly_changes("accessions")

## 3. Merge and Calculate Changes

In [None]:
# Merge all data
merge_keys = ["agency", "agency_code"]

# The left joins below keep only agencies that appear in the January baseline.
# Report any separations/accessions that would be dropped (agency absent from
# the baseline, or name/code differing between datasets) so the gap is visible
# instead of silently lowering the totals.
for label, changes in (("separations", separations), ("accessions", accessions)):
    matched = changes.merge(baseline[merge_keys], on=merge_keys, how="left", indicator=True)
    dropped = matched[matched["_merge"] == "left_only"]
    if not dropped.empty:
        dropped_total = dropped[f"{label}_total"].sum()
        print(f"WARNING: {len(dropped)} agencies with {label} are not in the January "
              f"baseline; {dropped_total:,.0f} {label} are excluded from the results.")

result = baseline.merge(separations, on=merge_keys, how="left")
result = result.merge(accessions, on=merge_keys, how="left")
# Agencies with no separations/accessions rows get 0 rather than NaN
result = result.fillna(0)

# Calculate metrics
result["net_change"] = result["accessions_total"] - result["separations_total"]
# A zero baseline would make the percentages +/-inf; mask it so they are NaN instead
nonzero_baseline = result["baseline_jan2025"].where(result["baseline_jan2025"] != 0)
result["pct_change"] = (result["net_change"] / nonzero_baseline * 100).round(2)
result["pct_separated"] = (result["separations_total"] / nonzero_baseline * 100).round(2)

# Sort by percentage change (most negative first)
result = result.sort_values("pct_change")

print("Data merged successfully!")

## 4. Summary Statistics

In [None]:
# Government-wide totals across every agency row in `result`
total_baseline, total_sep, total_acc = (
    result[column].sum()
    for column in ("baseline_jan2025", "separations_total", "accessions_total")
)
total_net = total_acc - total_sep

banner = "=" * 60
report_lines = [
    banner,
    "FEDERAL IT WORKFORCE (2210) SUMMARY",
    banner,
    f"\nTotal 2210 workforce (Jan 2025):    {total_baseline:>10,.0f}",
    f"Total separations (Feb-Nov 2025):   {total_sep:>10,.0f}",
    f"Total accessions (Feb-Nov 2025):    {total_acc:>10,.0f}",
    f"Net change:                         {total_net:>+10,.0f} ({total_net/total_baseline*100:+.1f}%)",
    # Baseline plus net change: an estimate, not a new snapshot
    f"\nEstimated current workforce:        {total_baseline + total_net:>10,.0f}",
]
print("\n".join(report_lines))

## 5. Agencies with Largest % Workforce Lost

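Filtered to agencies with at least 50 IT employees (to avoid noise from tiny agencies). Agencies are ranked by net percentage change (accessions minus separations, relative to the January 2025 baseline), most negative first.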

In [None]:
# Filter to agencies with meaningful baseline (at least 50 employees)
min_baseline = 50
top_n = 15
significant = result[result["baseline_jan2025"] >= min_baseline].copy()

# Show top 15 by % lost. Sort here explicitly rather than relying on `result`
# still being ordered by pct_change from an earlier cell; the stable sort keeps
# the existing order of ties.
top_losses = (
    significant
    .sort_values("pct_change", kind="stable")
    .head(top_n)[["agency", "agency_code", "baseline_jan2025",
                  "separations_total", "accessions_total",
                  "net_change", "pct_change"]]
    .rename(columns={
        "agency": "Agency",
        "agency_code": "Code",
        "baseline_jan2025": "Jan 2025 Baseline",
        "separations_total": "Separations",
        "accessions_total": "Accessions",
        "net_change": "Net Change",
        "pct_change": "% Change",
    })
)

print(f"Top {top_n} Agencies by % IT Workforce Lost (min {min_baseline} employees):\n")
top_losses

## 6. Largest Agencies by Absolute Losses

In [None]:
# Source column -> display label, in the order the table is shown
loss_columns = {
    "agency": "Agency",
    "agency_code": "Code",
    "baseline_jan2025": "Jan 2025 Baseline",
    "separations_total": "Separations",
    "accessions_total": "Accessions",
    "net_change": "Net Change",
    "pct_change": "% Change",
}

# 15 agencies with the most negative net change (accessions minus separations)
by_absolute = (
    result
    .sort_values("net_change")
    .head(15)
    .loc[:, list(loss_columns)]
    .rename(columns=loss_columns)
)

print("Top 15 Agencies by Absolute IT Workforce Lost:\n")
by_absolute

## 7. Full Data Table

Click the download icon to save as CSV.

In [None]:
# Monthly columns, identified by their data-type prefix
sep_months = [c for c in result.columns if c.startswith("separations_20")]
acc_months = [c for c in result.columns if c.startswith("accessions_20")]

# Summary columns first, then monthly separations, then monthly accessions
# (each monthly group in chronological order)
display_cols = [
    "agency", "agency_code", "baseline_jan2025",
    "separations_total", "accessions_total", "net_change", "pct_change",
    *sorted(sep_months),
    *sorted(acc_months),
]

# Most negative percentage change first
full_data = result[display_cols].sort_values("pct_change")

full_data

## 8. Download as CSV

In [None]:
# Save to CSV
output_path = "2210_workforce_analysis.csv"
full_data.to_csv(output_path, index=False)
print(f"Saved to: {output_path}")
print(f"Rows: {len(full_data)}")

# Create download link (works in Jupyter/Colab)
try:
    # Import `display` explicitly so the cell does not depend on the name
    # being injected by the notebook front end.
    from IPython.display import FileLink, display
except ImportError:
    # IPython is not available: just tell the user where the file is.
    # Only ImportError is caught so that real errors are not hidden.
    print(f"\nDownload the file directly: {output_path}")
else:
    display(FileLink(output_path))