[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abigailhaddad/fedscope_new/blob/main/state_redaction_analysis.ipynb)

# State Redaction Analysis: Which Federal Jobs Hide Location?

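This notebook explores which occupations and agencies have redacted or missing state data in OPM federal workforce accession records (January 2021 through November 2025). Throughout, a record counts as "redacted" when its duty-station state is either `REDACTED` or `NO DATA REPORTED`.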

In [None]:
# @title Setup
!pip install -q duckdb pandas plotly

import duckdb
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Helper for downloads
import base64
from IPython.display import HTML, display

def download_csv(df, filename):
    """Render a styled download link that saves ``df`` as a CSV file.

    The frame is serialized without its index, base64-encoded, and embedded
    in a ``data:`` URI, so nothing is written to disk.

    Args:
        df: pandas DataFrame to export.
        filename: Name offered to the browser for the downloaded file; it is
            also shown in the link text.
    """
    from html import escape  # local import keeps this helper self-contained

    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    # Escape the filename so quotes or angle brackets cannot break out of the
    # attribute value or the link text.
    safe_name = escape(str(filename), quote=True)
    link_style = (
        "background:#4CAF50;color:white;padding:8px 16px;text-decoration:none;"
        "border-radius:4px;display:inline-block;margin:10px 0;"
    )
    # text/csv is the registered MIME type for CSV; "file/csv" is not one.
    display(HTML(
        f'<a href="data:text/csv;charset=utf-8;base64,{b64}" '
        f'download="{safe_name}" style="{link_style}">Download {safe_name}</a>'
    ))

In [None]:
# @title Load Data from HuggingFace
# Hugging Face account that hosts the dataset repositories used below.
HF_USERNAME = "abigailhaddad"
# Common URL prefix for every dataset repository under that account.
BASE_URL = "https://huggingface.co/datasets/" + HF_USERNAME

def generate_months(start_year=2021, start_month=1, end_year=2025, end_month=11):
    """Return every month in the inclusive range as ``YYYYMM`` strings.

    Args:
        start_year: First calendar year of the range.
        start_month: First month (1-12) included within ``start_year``.
        end_year: Last calendar year of the range.
        end_month: Last month (1-12) included within ``end_year``.

    Returns:
        A chronologically ordered list of zero-padded ``YYYYMM`` strings;
        empty when ``start_year`` is after ``end_year``.
    """
    return [
        f"{yr}{mo:02d}"
        for yr in range(start_year, end_year + 1)
        for mo in range(1, 13)
        # Trim the months before the start in the first year ...
        if not (yr == start_year and mo < start_month)
        # ... and the months after the end in the last year.
        and not (yr == end_year and mo > end_month)
    ]

# Default range of YYYYMM codes, one per monthly dataset repository.
months = generate_months()
# Each month's accessions live in their own repository as a single parquet file.
accession_urls = list(
    f"{BASE_URL}/opm-federal-accessions-{yyyymm}/resolve/main/data.parquet"
    for yyyymm in months
)

print(f"Loading {len(accession_urls)} months of accessions...")

In [None]:
%%time
# @title Load into DuckDB
# Open a DuckDB connection with no database file given.
db = duckdb.connect()

# read_parquet() receives a SQL list literal, so each URL is single-quoted
# and the quoted URLs are comma-joined.
acc_url_list = ", ".join(f"'{url}'" for url in accession_urls)
db.execute(f"""
    CREATE TABLE accessions AS 
    SELECT *, personnel_action_effective_date_yyyymm as month
    FROM read_parquet([{acc_url_list}])
""")

# Rows carry a `count` column; the headline figure is the sum of those counts.
(acc_count,) = db.execute("SELECT SUM(CAST(count AS INTEGER)) FROM accessions").fetchone()
print(f"Loaded {acc_count:,} accessions")

## Overall State Data Quality

In [None]:
# @title Overall State Data Quality
# Bucket every accession by the status of its duty-station state: the two
# sentinel values are kept as-is, everything else is treated as a valid state.
quality = db.execute("""
    SELECT
        CASE
            WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') THEN duty_station_state
            ELSE 'Valid State'
        END as state_status,
        -- SUM() yields HUGEINT, which DuckDB hands to pandas as float64;
        -- cast back to BIGINT so counts stay whole numbers.
        CAST(SUM(CAST(count AS INTEGER)) AS BIGINT) as people
    FROM accessions
    GROUP BY 1
    ORDER BY people DESC
""").df()

# Plain int so the TOTAL line formats without a trailing ".0".
total = int(quality['people'].sum())
quality['pct'] = (quality['people'] / total * 100).round(2)

print("State Data Quality - All Accessions (2021-2025)")
print("="*50)
for _, row in quality.iterrows():
    print(f"{row['state_status']:20} {row['people']:>12,} ({row['pct']:>5.1f}%)")
print(f"{'TOTAL':20} {total:>12,}")

In [None]:
# @title State Redaction Over Time
# Monthly totals left-joined to monthly redacted totals; months without any
# redacted rows fall back to 0 through COALESCE.
monthly_quality = db.execute("""
    WITH totals AS (
        SELECT month, SUM(CAST(count AS INTEGER)) as total
        FROM accessions
        GROUP BY month
    ),
    redacted AS (
        SELECT month, SUM(CAST(count AS INTEGER)) as redacted_count
        FROM accessions
        WHERE duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
        GROUP BY month
    )
    SELECT t.month,
           t.total,
           COALESCE(r.redacted_count, 0) as redacted,
           ROUND(100.0 * COALESCE(r.redacted_count, 0) / t.total, 2) as pct_redacted
    FROM totals t
    LEFT JOIN redacted r ON t.month = r.month
    ORDER BY t.month
""").df()

# Parse the month code with the %Y%m format so the x-axis is a real time axis.
monthly_quality = monthly_quality.assign(
    date=pd.to_datetime(monthly_quality['month'], format='%Y%m')
)

# Single line-and-marker trace of the redacted share per month.
fig = go.Figure(
    data=[
        go.Scatter(
            x=monthly_quality['date'],
            y=monthly_quality['pct_redacted'],
            mode='lines+markers',
            line={'color': '#e74c3c', 'width': 2},
            hovertemplate='%{x|%b %Y}<br>%{y:.1f}% redacted<extra></extra>',
        )
    ]
)

layout_options = {
    'title': 'Percentage of Accessions with Redacted/Missing State Data',
    'xaxis_title': 'Month',
    'yaxis_title': '% Redacted or Missing',
    'template': 'plotly_white',
    'height': 400,
}
fig.update_layout(**layout_options)
fig.show()

## Agencies with 100% State Redaction

These agencies have redacted or missing state data for ALL of their accessions.

In [None]:
# @title Agencies with 100% Redaction
# Per-agency total vs. redacted counts; keep agencies where every counted
# accession has a redacted or unreported duty-station state.
agencies_100 = db.execute("""
    WITH agency_stats AS (
        SELECT agency,
               SUM(CAST(count AS INTEGER)) as total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
                        THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
        FROM accessions
        GROUP BY agency
    )
    -- SUM() yields HUGEINT, which DuckDB hands to pandas as float64;
    -- cast back to BIGINT so counts print and export as whole numbers.
    SELECT agency, CAST(total AS BIGINT) as accessions
    FROM agency_stats
    WHERE redacted = total
    ORDER BY total DESC
""").df()

print(f"Agencies with 100% state redaction: {len(agencies_100)}")
print("="*70)
if len(agencies_100) > 0:
    for _, row in agencies_100.iterrows():
        # Truncate and pad the agency name to 60 characters so counts line up.
        print(f"{row['agency'][:60]:60} {row['accessions']:>8,}")
    download_csv(agencies_100, 'agencies_100_redaction.csv')
else:
    print("No agencies have 100% redaction.")

In [None]:
# @title Agencies with 0% Redaction (All Valid State Data)
# Per-agency total vs. redacted counts, limited to agencies with at least
# 100 accessions; keep those with no redacted or unreported states at all.
agencies_0 = db.execute("""
    WITH agency_stats AS (
        SELECT agency,
               SUM(CAST(count AS INTEGER)) as total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
                        THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
        FROM accessions
        GROUP BY agency
        HAVING total >= 100  -- Minimum sample size
    )
    -- SUM() yields HUGEINT, which DuckDB hands to pandas as float64;
    -- cast back to BIGINT so counts print and export as whole numbers.
    SELECT agency, CAST(total AS BIGINT) as accessions
    FROM agency_stats
    WHERE redacted = 0
    ORDER BY total DESC
""").df()

print(f"Agencies with 0% redaction (100+ accessions): {len(agencies_0)}")
print("="*70)
if len(agencies_0) > 0:
    for _, row in agencies_0.iterrows():
        # Truncate and pad the agency name to 60 characters so counts line up.
        print(f"{row['agency'][:60]:60} {row['accessions']:>8,}")
    download_csv(agencies_0, 'agencies_0_redaction.csv')
else:
    print("No agencies have 0% redaction.")

## Occupations with 100% State Redaction

These job series have redacted or missing state data for ALL of their accessions.

In [None]:
# @title Occupations with 100% Redaction
# Per-occupation total vs. redacted counts; keep occupational series where
# every counted accession has a redacted or unreported duty-station state.
occs_100 = db.execute("""
    WITH occ_stats AS (
        SELECT occupational_series,
               SUM(CAST(count AS INTEGER)) as total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
                        THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
        FROM accessions
        GROUP BY occupational_series
    )
    -- SUM() yields HUGEINT, which DuckDB hands to pandas as float64;
    -- cast back to BIGINT so counts print and export as whole numbers.
    SELECT occupational_series, CAST(total AS BIGINT) as accessions
    FROM occ_stats
    WHERE redacted = total
    ORDER BY total DESC
""").df()

print(f"Occupations with 100% state redaction: {len(occs_100)}")
print("="*80)
if len(occs_100) > 0:
    for _, row in occs_100.iterrows():
        # Truncate and pad the series name to 70 characters so counts line up.
        print(f"{row['occupational_series'][:70]:70} {row['accessions']:>8,}")
    download_csv(occs_100, 'occupations_100_redaction.csv')
else:
    print("No occupations have 100% redaction.")

In [None]:
# @title Occupations with 0% Redaction (All Valid State Data)
# Per-occupation total vs. redacted counts, limited to series with at least
# 100 accessions; keep those with no redacted or unreported states at all.
occs_0 = db.execute("""
    WITH occ_stats AS (
        SELECT occupational_series,
               SUM(CAST(count AS INTEGER)) as total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
                        THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
        FROM accessions
        GROUP BY occupational_series
        HAVING total >= 100  -- Minimum sample size
    )
    -- SUM() yields HUGEINT, which DuckDB hands to pandas as float64;
    -- cast back to BIGINT so counts print and export as whole numbers.
    SELECT occupational_series, CAST(total AS BIGINT) as accessions
    FROM occ_stats
    WHERE redacted = 0
    ORDER BY total DESC
""").df()

print(f"Occupations with 0% redaction (100+ accessions): {len(occs_0)}")
print("="*80)
if len(occs_0) > 0:
    for _, row in occs_0.iterrows():
        # Truncate and pad the series name to 70 characters so counts line up.
        print(f"{row['occupational_series'][:70]:70} {row['accessions']:>8,}")
    download_csv(occs_0, 'occupations_0_redaction.csv')
else:
    print("No occupations have 0% redaction.")

## Partial Redaction: Agencies and Occupations with Some But Not All Redaction

These are the more interesting cases, where redaction varies within an agency or occupation. Only agencies and occupations with at least 100 accessions are included.

In [None]:
# @title Agencies with Partial Redaction (>0% but <100%)
# Agencies (100+ accessions) where some, but not all, accessions have a
# redacted or unreported duty-station state, ranked by redacted share.
agencies_partial = db.execute("""
    WITH agency_stats AS (
        SELECT agency,
               SUM(CAST(count AS INTEGER)) as total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
                        THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
        FROM accessions
        GROUP BY agency
        HAVING total >= 100
    )
    -- SUM() yields HUGEINT, which DuckDB hands to pandas as float64;
    -- cast the count columns back to BIGINT so the displayed table and the
    -- exported CSV show whole numbers.
    SELECT agency,
           CAST(total AS BIGINT) as accessions,
           CAST(redacted AS BIGINT) as redacted,
           ROUND(100.0 * redacted / total, 1) as pct_redacted
    FROM agency_stats
    WHERE redacted > 0 AND redacted < total
    ORDER BY pct_redacted DESC
""").df()

print(f"Agencies with partial redaction (100+ accessions): {len(agencies_partial)}")
print("="*90)
if len(agencies_partial) > 0:
    display(agencies_partial)
    download_csv(agencies_partial, 'agencies_partial_redaction.csv')
else:
    print("No agencies have partial redaction.")

In [None]:
# @title Occupations with Partial Redaction (>0% but <100%)
# Occupational series (100+ accessions) where some, but not all, accessions
# have a redacted or unreported duty-station state, ranked by redacted share.
occs_partial = db.execute("""
    WITH occ_stats AS (
        SELECT occupational_series,
               SUM(CAST(count AS INTEGER)) as total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
                        THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
        FROM accessions
        GROUP BY occupational_series
        HAVING total >= 100
    )
    -- SUM() yields HUGEINT, which DuckDB hands to pandas as float64;
    -- cast the count columns back to BIGINT so the displayed table and the
    -- exported CSV show whole numbers.
    SELECT occupational_series,
           CAST(total AS BIGINT) as accessions,
           CAST(redacted AS BIGINT) as redacted,
           ROUND(100.0 * redacted / total, 1) as pct_redacted
    FROM occ_stats
    WHERE redacted > 0 AND redacted < total
    ORDER BY pct_redacted DESC
""").df()

print(f"Occupations with partial redaction (100+ accessions): {len(occs_partial)}")
print("="*100)
if len(occs_partial) > 0:
    display(occs_partial)
    download_csv(occs_partial, 'occupations_partial_redaction.csv')
else:
    print("No occupations have partial redaction.")

## Summary

In [None]:
# @title Summary Statistics
def _redaction_breakdown(group_col):
    """Count the groups of ``group_col`` at each redaction level.

    Args:
        group_col: Column of ``accessions`` to group by. It is interpolated
            into the SQL as an identifier, so pass only hard-coded column
            names, never user input.

    Returns:
        A 4-tuple: (number of groups, groups where every accession is
        redacted, groups with no redacted accessions, groups in between).
    """
    return db.execute(f"""
        WITH group_stats AS (
            SELECT {group_col},
                   SUM(CAST(count AS INTEGER)) as total,
                   SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
                            THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
            FROM accessions
            GROUP BY {group_col}
        )
        SELECT
            COUNT(*) as total_groups,
            SUM(CASE WHEN redacted = total THEN 1 ELSE 0 END) as groups_100_pct,
            SUM(CASE WHEN redacted = 0 THEN 1 ELSE 0 END) as groups_0_pct,
            SUM(CASE WHEN redacted > 0 AND redacted < total THEN 1 ELSE 0 END) as groups_partial
        FROM group_stats
    """).fetchone()


def _print_breakdown(label, counts):
    """Print one labelled section of the summary from a breakdown 4-tuple."""
    print(f"\n{label}:")
    print(f"  Total: {counts[0]}")
    print(f"  100% redacted: {counts[1]}")
    print(f"  0% redacted: {counts[2]}")
    print(f"  Partial: {counts[3]}")


print("="*70)
print("STATE REDACTION SUMMARY")
print("="*70)

# Overall stats: all accessions vs. those with a redacted/unreported state.
overall = db.execute("""
    SELECT
        SUM(CAST(count AS INTEGER)) as total,
        SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
                 THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
    FROM accessions
""").fetchone()
print(f"\nTotal accessions: {overall[0]:,}")
print(f"With redacted state: {overall[1]:,} ({100*overall[1]/overall[0]:.1f}%)")

# Agency counts (no minimum-size filter is applied here).
agency_counts = _redaction_breakdown("agency")
_print_breakdown("Agencies", agency_counts)

# Occupation counts (no minimum-size filter is applied here).
occ_counts = _redaction_breakdown("occupational_series")
_print_breakdown("Occupations", occ_counts)

print("\n" + "="*70)