[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abigailhaddad/fedscope_new/blob/main/demo.ipynb)

# OPM Federal Workforce Data: Accessions & Separations Over Time

This notebook loads monthly federal workforce accession (new-hire) and separation (departure) datasets from Hugging Face, counts the records in each month, and visualizes the trends over time.

In [None]:
# Install dependencies (for Colab)
!pip install -q datasets pandas matplotlib

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from huggingface_hub import list_datasets
from datetime import datetime

## 1. Discover Available Datasets

First, let's find all accession and separation datasets on Hugging Face and check that every expected month is present.

In [None]:
# Find all datasets published by this account and bucket them by type.
HF_USERNAME = "abigailhaddad"

accession_datasets = []
separation_datasets = []

for ds_info in list_datasets(author=HF_USERNAME):
    name = ds_info.id.split('/')[-1]

    # Extract YYYYMM: the last dash-separated piece of the repo name.
    month = name.split('-')[-1]
    if not (len(month) == 6 and month.isdigit()):
        # Not a monthly dataset (suffix is not six digits). Skipping it here keeps
        # the month bookkeeping and the date parsing in later cells clean.
        continue

    if 'accessions-' in name:
        accession_datasets.append({'repo': ds_info.id, 'month': month})
    elif 'separations-' in name:
        separation_datasets.append({'repo': ds_info.id, 'month': month})

# Sort chronologically so the lists do not depend on the order the API returns
# results in; index 0 is then always the earliest available month.
accession_datasets.sort(key=lambda entry: entry['month'])
separation_datasets.sort(key=lambda entry: entry['month'])

print(f"Found {len(accession_datasets)} accession datasets")
print(f"Found {len(separation_datasets)} separation datasets")

In [None]:
# Check for missing months
def generate_expected_months(start_year=2021, start_month=1, end_year=2025, end_month=11):
    """Build the chronological list of 'YYYYMM' strings between two months.

    Both endpoints are inclusive. Months of the first year that fall before
    ``start_month`` and months of the last year that fall after ``end_month``
    are left out. An empty list is returned when the window is empty.
    """
    def in_window(yr, mo):
        # A month is excluded only at the edges of the range.
        before_start = yr == start_year and mo < start_month
        after_end = yr == end_year and mo > end_month
        return not (before_start or after_end)

    return [
        f"{yr}{mo:02d}"
        for yr in range(start_year, end_year + 1)
        for mo in range(1, 13)
        if in_window(yr, mo)
    ]

# Months we expect to have, versus months actually found for each dataset type.
expected = set(generate_expected_months())
acc_months = {entry['month'] for entry in accession_datasets}
sep_months = {entry['month'] for entry in separation_datasets}

# Expected months with no corresponding dataset.
missing_acc = expected.difference(acc_months)
missing_sep = expected.difference(sep_months)

print(f"Missing accession months: {sorted(missing_acc)}" if missing_acc else "All accession months present!")

print(f"Missing separation months: {sorted(missing_sep)}" if missing_sep else "All separation months present!")

## 2. Load All Data

Load each monthly dataset and count the number of records.

In [None]:
def load_counts(datasets_list, desc="Loading"):
    """Load each monthly dataset and record how many rows it has.

    Args:
        datasets_list: iterable of dicts with a 'repo' key (dataset repo id)
            and a 'month' key ('YYYYMM' string). Processed in month order.
        desc: currently unused; kept so existing callers keep working.

    Returns:
        DataFrame with columns 'month', 'date' and 'count', one row per
        dataset that loaded successfully. The columns are always present,
        even when the input is empty or every load fails, so downstream
        cells can index them safely.

    A dataset that fails to load (or whose month cannot be parsed) is
    reported on stdout and skipped rather than aborting the whole run.
    """
    records = []
    for d in sorted(datasets_list, key=lambda x: x['month']):
        try:
            ds = load_dataset(d['repo'], split='train')
            n_rows = len(ds)
            records.append({
                'month': d['month'],
                'date': datetime.strptime(d['month'], '%Y%m'),
                'count': n_rows
            })
            print(f"  {d['month']}: {n_rows:,} records")
        except Exception as e:
            # Best effort: one bad month should not stop the remaining loads.
            print(f"  {d['month']}: ERROR - {e}")

    # Fix the schema explicitly: pd.DataFrame([]) would otherwise have no columns.
    counts_df = pd.DataFrame(records, columns=['month', 'date', 'count'])
    # Keep 'date' a datetime column even when there are no rows, so merging an
    # empty result with a populated one does not hit a dtype mismatch.
    counts_df['date'] = pd.to_datetime(counts_df['date'])
    return counts_df

# Row counts per month for new hires.
print("Loading accessions...")
acc_df = load_counts(datasets_list=accession_datasets)

# Row counts per month for departures.
print("\nLoading separations...")
sep_df = load_counts(datasets_list=separation_datasets)

## 3. Plot Accessions vs Separations Over Time

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

# Both series share the same marker styling; only label and colour differ.
marker_style = {'marker': 'o', 'markersize': 3}
ax.plot(acc_df['date'], acc_df['count'], label='Accessions (New Hires)', color='#2ecc71', **marker_style)
ax.plot(sep_df['date'], sep_df['count'], label='Separations (Departures)', color='#e74c3c', **marker_style)

ax.set(
    xlabel='Month',
    ylabel='Count',
    title='Federal Workforce: Monthly Accessions vs Separations (2021-2025)',
)
ax.legend()
ax.grid(True, alpha=0.3)

# Show y-axis tick values with thousands separators.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f"{int(value):,}"))

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Net change (accessions - separations)
# Inner join on month: only months present in BOTH frames get a row, because a
# net change cannot be computed from one side alone.
merged = (
    acc_df
    .merge(sep_df, on=['month', 'date'], how='inner', suffixes=('_acc', '_sep'))
    .assign(net_change=lambda frame: frame['count_acc'] - frame['count_sep'])
)

# The inner join drops unmatched months silently; say so explicitly, since the
# totals printed below only cover the months that survived the join.
unmatched_months = sorted(set(acc_df['month']) ^ set(sep_df['month']))
if unmatched_months:
    print(f"Note: excluded from net change and totals (month present in only one dataset): {unmatched_months}")

fig, ax = plt.subplots(figsize=(14, 5))

# Green bars for net growth, red for zero or net loss.
colors = ['#2ecc71' if x > 0 else '#e74c3c' for x in merged['net_change']]
# On a date axis the bar width is in days; 20 leaves a gap between months.
ax.bar(merged['date'], merged['net_change'], color=colors, width=20)

ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax.set_xlabel('Month')
ax.set_ylabel('Net Change (Accessions - Separations)')
ax.set_title('Federal Workforce: Monthly Net Change')
ax.grid(True, alpha=0.3, axis='y')

# Show y-axis tick values with thousands separators.
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print(f"\nTotal accessions: {merged['count_acc'].sum():,}")
print(f"Total separations: {merged['count_sep'].sum():,}")
print(f"Net change: {merged['net_change'].sum():,}")

## 4. Explore the Data Structure

Let's look at what columns are available in the data.

In [None]:
# Pull one month of each dataset type (the first entry in each list) so we can
# look at the column names.
first_acc_repo = accession_datasets[0]['repo']
first_sep_repo = separation_datasets[0]['repo']
sample_acc = load_dataset(first_acc_repo, split='train')
sample_sep = load_dataset(first_sep_repo, split='train')

print("Accessions columns:", sample_acc.column_names, sep="\n")

print("\nSeparations columns:", sample_sep.column_names, sep="\n")

In [None]:
# Preview the first five accession rows; the bare last expression is rendered
# as a rich table by the notebook.
print("Sample accession records:")
sample_acc.to_pandas().head(n=5)