[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abigailhaddad/fedscope_new/blob/main/demo.ipynb)

# OPM Federal Workforce Data: Accessions & Separations Over Time

This notebook loads federal workforce accession (new hires) and separation (departures) data from HuggingFace and visualizes trends over time.

**No authentication required** - all datasets are public.

In [None]:
# Install dependencies (for Colab)
!pip install -q duckdb pandas matplotlib

In [None]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

## 1. Define Dataset URLs

The data is stored as Parquet files on Hugging Face, with one dataset per month for accessions and one per month for separations. DuckDB can query them directly over HTTP.

In [None]:
# Generate all dataset URLs
# Account name under which the datasets are published
HF_USERNAME = "abigailhaddad"
# Common prefix shared by every dataset URL built below
BASE_URL = "https://huggingface.co/datasets/" + HF_USERNAME

def generate_months(start_year=2021, start_month=1, end_year=2025, end_month=11):
    """Return the months of an inclusive date range as 'YYYYMM' strings.

    Args:
        start_year: First calendar year of the range.
        start_month: First month (1-12) included within ``start_year``.
        end_year: Last calendar year of the range.
        end_month: Last month (1-12) included within ``end_year``.

    Returns:
        A chronologically ordered list of strings such as ``'202101'``.
        The list is empty when ``end_year`` is earlier than ``start_year``.
    """
    return [
        f"{year}{month:02d}"
        for year in range(start_year, end_year + 1)
        for month in range(1, 13)
        # Trim the months before the start and after the end of the range
        if not (year == start_year and month < start_month)
        and not (year == end_year and month > end_month)
    ]

months = generate_months()

# One parquet URL per month for each of the two dataset families
accession_urls = [
    f"{BASE_URL}/opm-federal-accessions-{yyyymm}/resolve/main/data.parquet"
    for yyyymm in months
]
separation_urls = [
    f"{BASE_URL}/opm-federal-separations-{yyyymm}/resolve/main/data.parquet"
    for yyyymm in months
]

# Report how many URLs were built and show one as a sanity check
print(f"Generated {len(accession_urls)} accession URLs")
print(f"Generated {len(separation_urls)} separation URLs")
print(f"\nExample: {accession_urls[0]}")

In [None]:
# Verify datasets exist by checking a few
def check_url(url):
    """Report whether DuckDB can open the parquet file at ``url``.

    Args:
        url: Location of a parquet file, interpolated into a DuckDB query.

    Returns:
        True when the probe query could be created without an error,
        False when DuckDB (or anything it calls) raised an exception.
    """
    try:
        duckdb.sql(f"SELECT 1 FROM '{url}' LIMIT 1")
        return True
    except Exception:
        # Best-effort probe: any failure counts as "not available".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so the cell can still be interrupted.
        return False

# Check first and last of each type
print("Checking dataset availability...")
print("\n".join(
    f"  {position} {kind} ({months[idx]}): {'OK' if check_url(urls[idx]) else 'MISSING'}"
    for kind, urls in (("accession", accession_urls), ("separation", separation_urls))
    for position, idx in (("First", 0), ("Last", -1))
))

## 2. Load All Data with DuckDB

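DuckDB reads Parquet files directly over HTTP and fetches only the bytes each query needs, so the files never have to be downloaded in full. This is much faster than loading each dataset with the Hugging Face `datasets` library.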

In [None]:
%%time
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_count(args):
    """Get row count for a single parquet URL.

    Args:
        args: A ``(url, month)`` pair, where ``month`` is a 'YYYYMM' string.
            It is a single tuple so the function can be submitted to an
            executor with one argument.

    Returns:
        A dict with keys ``'month'`` (the input string), ``'date'`` (the
        month parsed as a ``datetime``) and ``'count'`` (the row count),
        or None when the count could not be obtained.
    """
    url, month = args
    conn = None
    try:
        # Each thread needs its own connection
        conn = duckdb.connect()
        result = conn.sql(f"SELECT COUNT(*) FROM '{url}'").fetchone()
        return {'month': month, 'date': datetime.strptime(month, '%Y%m'), 'count': result[0]}
    except Exception:
        # Best-effort: a missing or unreadable month is reported as None.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return None
    finally:
        # Close the connection on the failure path too, so failed months
        # do not leak connections.
        if conn is not None:
            conn.close()

def load_counts_parallel(urls, months, data_type, max_workers=16):
    """Load counts in parallel using thread pool.

    Args:
        urls: Parquet URLs, one per month.
        months: 'YYYYMM' strings paired positionally with ``urls``.
        data_type: Label used in the progress messages (e.g. "accession").
        max_workers: Maximum number of worker threads.

    Returns:
        A DataFrame with columns ``month``, ``date`` and ``count``, sorted by
        ``month``. Months whose count could not be loaded are left out; if
        none could be loaded the frame is empty but keeps those columns.
    """
    counts = []
    errors = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its month so failures can be reported
        futures = {executor.submit(get_count, (url, month)): month
                   for url, month in zip(urls, months)}

        for future in as_completed(futures):
            result = future.result()
            if result:
                counts.append(result)
            else:
                errors.append(futures[future])

    if errors:
        print(f"  Missing {data_type} months: {sorted(errors)}")
    else:
        print(f"  All {len(counts)} {data_type} months loaded!")

    # Naming the columns keeps the frame well-formed when nothing loaded;
    # without them sort_values('month') raises KeyError on an empty frame.
    frame = pd.DataFrame(counts, columns=['month', 'date', 'count'])

    # Results arrive in completion order, so sort by month
    return frame.sort_values('month').reset_index(drop=True)

print("Loading accessions and separations in parallel...")
# Monthly row counts for each dataset family, one row per loaded month
acc_df = load_counts_parallel(urls=accession_urls, months=months, data_type="accession")
sep_df = load_counts_parallel(urls=separation_urls, months=months, data_type="separation")

print(f"\nLoaded {len(acc_df)} accession months, {len(sep_df)} separation months")

## 3. Plot Accessions vs Separations Over Time

In [None]:
fig, ax = plt.subplots(figsize=(14, 6))

# One line per series: hires in green, departures in red
ax.plot(
    acc_df['date'], acc_df['count'],
    marker='o', markersize=3, color='#2ecc71',
    label='Accessions (New Hires)',
)
ax.plot(
    sep_df['date'], sep_df['count'],
    marker='o', markersize=3, color='#e74c3c',
    label='Separations (Departures)',
)

ax.set(
    xlabel='Month',
    ylabel='Count',
    title='Federal Workforce: Monthly Accessions vs Separations (2021-2025)',
)
ax.legend()
ax.grid(True, alpha=0.3)

# Show y-axis tick values with thousands separators
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f"{int(value):,}"))

plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Net change (accessions - separations)
# The inner merge keeps only the months present in both frames
merged = (
    acc_df
    .merge(sep_df, on=['month', 'date'], suffixes=('_acc', '_sep'))
    .assign(net_change=lambda frame: frame['count_acc'] - frame['count_sep'])
)

fig, ax = plt.subplots(figsize=(14, 5))

# Green bars for months with a net gain, red for everything else
colors = merged['net_change'].gt(0).map({True: '#2ecc71', False: '#e74c3c'}).tolist()
ax.bar(merged['date'], merged['net_change'], color=colors, width=20)

ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
ax.set(
    xlabel='Month',
    ylabel='Net Change (Accessions - Separations)',
    title='Federal Workforce: Monthly Net Change',
)
ax.grid(True, alpha=0.3, axis='y')

# Show y-axis tick values with thousands separators
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda value, _pos: f"{int(value):,}"))

plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

# Totals over the merged months
print(f"\nTotal accessions: {merged['count_acc'].sum():,}")
print(f"Total separations: {merged['count_sep'].sum():,}")
print(f"Net change: {merged['net_change'].sum():,}")

## 4. Explore the Data Structure

Let's look at what columns are available in the data.

In [None]:
# Inspect columns using DuckDB
# DESCRIBE returns one row per column of the first file of each dataset family
print("Accessions columns:")
acc_cols = duckdb.sql(
    f"DESCRIBE SELECT * FROM '{accession_urls[0]}'"
).df()
print(list(acc_cols['column_name']))

print("\nSeparations columns:")
sep_cols = duckdb.sql(
    f"DESCRIBE SELECT * FROM '{separation_urls[0]}'"
).df()
print(list(sep_cols['column_name']))

In [None]:
# Show sample data
# The trailing expression is rendered as the cell's output
print("Sample accession records:")
(
    duckdb
    .sql(f"SELECT * FROM '{accession_urls[0]}' LIMIT 5")
    .df()
)