[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abigailhaddad/fedscope_new/blob/main/reconciliation_analysis.ipynb)

# Federal Workforce Data Reconciliation Analysis

Compares month-over-month changes in employment snapshots against the net flow implied by personnel actions (accessions minus separations).

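**Accessions and separations are filtered by `personnel_action_effective_date_yyyymm` so that only actions effective in the reporting month are counted; employment snapshots are used unfiltered.**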

In [None]:
from huggingface_hub import hf_hub_download
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Hugging Face account whose dataset repos are read by this notebook.
HF_USERNAME = "abigailhaddad"


def download_dataset(data_type: str, month: str) -> pd.DataFrame:
    """Fetch one monthly dataset from the Hugging Face Hub as a DataFrame.

    The repo id is built as ``{HF_USERNAME}/opm-federal-{data_type}-{month}``
    and the file ``data.parquet`` inside that dataset repo is downloaded and
    read with pandas.

    Args:
        data_type: Dataset family used in the repo name (this notebook passes
            "employment", "accessions" and "separations").
        month: Month suffix of the repo name (this notebook passes YYYYMM
            strings such as "202509").

    Returns:
        The contents of the repo's ``data.parquet`` file.
    """
    local_file = hf_hub_download(
        repo_id=f"{HF_USERNAME}/opm-federal-{data_type}-{month}",
        filename="data.parquet",
        repo_type="dataset",
    )
    return pd.read_parquet(local_file)

## Table 1: VATA (Veterans Health Administration) - Monthly Reconciliation

October 2024 - September 2025

In [None]:
AGENCY_SUBELEMENT_CODE = "VATA"
MONTHS = ["202410", "202411", "202412", "202501", "202502", "202503",
          "202504", "202505", "202506", "202507", "202508", "202509"]
# Column compared against the month string to keep only actions effective
# in that month.
date_col = "personnel_action_effective_date_yyyymm"


def _sum_counts(df, effective_month=None):
    """Total the ``count`` column of ``df`` for AGENCY_SUBELEMENT_CODE.

    Args:
        df: Frame with ``agency_subelement_code`` and ``count`` columns
            (plus ``date_col`` when ``effective_month`` is given).
        effective_month: When not None, only rows whose ``date_col`` value
            equals this string are included.

    Returns:
        The sum of the selected counts; values that cannot be parsed as
        numbers contribute 0.
    """
    keep = df["agency_subelement_code"] == AGENCY_SUBELEMENT_CODE
    if effective_month is not None:
        keep = keep & (df[date_col] == effective_month)
    return pd.to_numeric(df.loc[keep, "count"], errors="coerce").fillna(0).sum()


print("Loading VATA data...")
vata_data = []
prev_employment = None

for month in MONTHS:
    # Employment snapshot (not filtered by effective date)
    employment = _sum_counts(download_dataset("employment", month))
    # Flows, restricted to actions effective in this month
    accessions = _sum_counts(download_dataset("accessions", month), month)
    separations = _sum_counts(download_dataset("separations", month), month)

    net_flow = accessions - separations

    # The first month has no earlier snapshot to compare against.
    if prev_employment is None:
        actual_change = None
        difference = None
    else:
        actual_change = int(employment - prev_employment)
        difference = int(actual_change - net_flow)

    vata_data.append({
        "Month": month,
        "Snapshot": int(employment),
        "Accessions": int(accessions),
        "Separations": int(separations),
        "Net Flow": int(net_flow),
        "Actual Change": actual_change,
        "Difference": difference,
    })

    prev_employment = employment

# Mixing None with integers makes pandas store these columns as float
# (values shown as 123.0 / NaN); the nullable Int64 dtype keeps them integral.
vata_df = pd.DataFrame(vata_data).astype(
    {"Actual Change": "Int64", "Difference": "Int64"}
)

print("\nVATA (Veterans Health Administration) Monthly Reconciliation:")
display(vata_df)

# Summary statistics (plain ints so the thousands-separator formats below
# never render a trailing ".0")
start_employment = int(vata_df["Snapshot"].iloc[0])
end_employment = int(vata_df["Snapshot"].iloc[-1])
total_accessions = int(vata_df["Accessions"].sum())
total_separations = int(vata_df["Separations"].sum())
total_net_flow = total_accessions - total_separations
total_actual_change = end_employment - start_employment
# Missing values (the first month) are skipped by sum().
total_difference = int(vata_df["Difference"].sum())

print("\n" + "="*70)
print("SUMMARY")
print("="*70)
print(f"Starting employment ({MONTHS[0]}):  {start_employment:>10,}")
print(f"Ending employment ({MONTHS[-1]}):    {end_employment:>10,}")
print(f"\nTotal accessions:                  {total_accessions:>10,}")
print(f"Total separations:                 {total_separations:>10,}")
print(f"Total net flow:                    {total_net_flow:>+10,}")
print(f"\nActual change (snapshot):          {total_actual_change:>+10,}")
print(f"Total discrepancy:                 {total_difference:>+10,}")
print(f"\nExpected ending employment:        {start_employment + total_net_flow:>10,} (if flow data matched)")
print(f"Actual ending employment:          {end_employment:>10,}")

## Table 2: All Agencies - Single Month Reconciliation

August 2025 to September 2025

In [None]:
PREV_MONTH = "202508"
CURR_MONTH = "202509"
# Column compared against the month string to keep only actions effective
# in that month.
date_col = "personnel_action_effective_date_yyyymm"
AGENCY_KEYS = ["agency", "agency_code"]
MONTH_ABBR = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")


def _clean_counts(df, effective_month=None):
    """Return a copy of ``df`` whose ``count`` column is numeric.

    Args:
        df: Frame with a ``count`` column (plus ``date_col`` when
            ``effective_month`` is given).
        effective_month: When not None, only rows whose ``date_col`` value
            equals this string are kept.

    Returns:
        A new frame; unparseable ``count`` values are replaced by 0.
    """
    if effective_month is not None:
        df = df[df[date_col] == effective_month]
    return df.assign(count=pd.to_numeric(df["count"], errors="coerce").fillna(0))


def _totals_by_agency(df, value_name):
    """Sum ``count`` per (agency, agency_code) into a column ``value_name``."""
    return df.groupby(AGENCY_KEYS)["count"].sum().reset_index(name=value_name)


print(f"Loading all agencies data for {PREV_MONTH} to {CURR_MONTH}...")

# Employment snapshots
emp_prev = _clean_counts(download_dataset("employment", PREV_MONTH))
prev_totals = _totals_by_agency(emp_prev, "prev_emp")

emp_curr = _clean_counts(download_dataset("employment", CURR_MONTH))
curr_totals = _totals_by_agency(emp_curr, "curr_emp")

# Accessions (filtered by effective date)
acc = _clean_counts(download_dataset("accessions", CURR_MONTH), CURR_MONTH)
accessions = _totals_by_agency(acc, "accessions")

# Separations (filtered by effective date)
sep = _clean_counts(download_dataset("separations", CURR_MONTH), CURR_MONTH)
separations = _totals_by_agency(sep, "separations")

# Merge. Outer joins throughout, so an agency that has flow records but no
# snapshot rows still appears (with zero snapshots) instead of being dropped.
all_agencies = prev_totals.merge(curr_totals, on=AGENCY_KEYS, how="outer")
all_agencies = all_agencies.merge(accessions, on=AGENCY_KEYS, how="outer")
all_agencies = all_agencies.merge(separations, on=AGENCY_KEYS, how="outer")

# An agency missing from one input simply contributes zero there.
base_cols = ["prev_emp", "curr_emp", "accessions", "separations"]
all_agencies = all_agencies.fillna({col: 0 for col in base_cols})

# Calculate
all_agencies["actual_change"] = all_agencies["curr_emp"] - all_agencies["prev_emp"]
all_agencies["net_flow"] = all_agencies["accessions"] - all_agencies["separations"]
all_agencies["difference"] = all_agencies["actual_change"] - all_agencies["net_flow"]

# Convert to int
numeric_cols = base_cols + ["actual_change", "net_flow", "difference"]
all_agencies = all_agencies.astype({col: int for col in numeric_cols})

# Sort by absolute difference
all_agencies["abs_diff"] = all_agencies["difference"].abs()
all_agencies = all_agencies.sort_values("abs_diff", ascending=False)

# Snapshot labels follow PREV_MONTH / CURR_MONTH (characters 5-6 of the
# YYYYMM string are the month number) rather than being hard-coded.
prev_label = f"{MONTH_ABBR[int(PREV_MONTH[4:6]) - 1]} Snapshot"
curr_label = f"{MONTH_ABBR[int(CURR_MONTH[4:6]) - 1]} Snapshot"

# Select and label columns
all_agencies_df = all_agencies[["agency", "agency_code", "prev_emp", "curr_emp",
                                "accessions", "separations", "net_flow",
                                "actual_change", "difference"]].rename(columns={
    "agency": "Agency",
    "agency_code": "Code",
    "prev_emp": prev_label,
    "curr_emp": curr_label,
    "accessions": "Accessions",
    "separations": "Separations",
    "net_flow": "Net Flow",
    "actual_change": "Actual Change",
    "difference": "Difference",
})

print(f"\nAll Agencies Reconciliation ({PREV_MONTH} to {CURR_MONTH}):")
all_agencies_df