[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abigailhaddad/fedscope_new/blob/main/state_redaction_analysis.ipynb)

# State Redaction Analysis: Which Federal Jobs Hide Location?

This notebook explores which occupations and agencies have redacted or missing state data in OPM federal workforce records.

**Key finding preview:** Some jobs (like certain intel/security roles) have high redaction rates, while others (like meat inspectors) have nearly 100% valid state data.

In [None]:
# @title Setup
!pip install -q duckdb pandas plotly

import duckdb
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Helper for downloads
import base64
from IPython.display import HTML, display

def download_csv(df, filename):
    """Show a styled "Download <filename>" link for ``df`` in the cell output.

    The frame is serialised with ``df.to_csv(index=False)`` and embedded in the
    link as a base64 ``data:`` URI, so nothing is written to disk.

    Args:
        df: Object whose ``to_csv(index=False)`` returns the CSV text
            (a pandas DataFrame everywhere this notebook calls it).
        filename: File name offered to the browser; also used as link text.
    """
    from html import escape  # local import keeps the helper self-contained

    csv_text = df.to_csv(index=False)
    payload = base64.b64encode(csv_text.encode("utf-8")).decode("ascii")
    # The name is placed in an attribute and in the link text, so it must be
    # escaped or a quote / angle bracket in it would break the markup.
    safe_name = escape(str(filename), quote=True)
    # text/csv is the registered media type for CSV ("file/csv" is not one);
    # the charset matches the UTF-8 encoding used above.
    href = f"data:text/csv;charset=utf-8;base64,{payload}"
    display(HTML(f'<a href="{href}" download="{safe_name}" style="background:#4CAF50;color:white;padding:8px 16px;text-decoration:none;border-radius:4px;display:inline-block;margin:10px 0;">Download {safe_name}</a>'))

In [None]:
# @title Load Data from HuggingFace
# Hugging Face account whose dataset repositories are read below.
HF_USERNAME = "abigailhaddad"
# Common URL prefix for every dataset repository of that account.
BASE_URL = "https://huggingface.co/datasets/" + HF_USERNAME

def generate_months(start_year=2021, start_month=1, end_year=2025, end_month=11):
    """Return ``YYYYMM`` strings for each month of an inclusive date range.

    Args:
        start_year: First calendar year of the range.
        start_month: First month (1-12) included within ``start_year``.
        end_year: Last calendar year of the range.
        end_month: Last month (1-12) included within ``end_year``.

    Returns:
        Strings such as ``"202101"`` in chronological order; an empty list
        when the start lies after the end.
    """
    def _inside_range(year, month):
        # Months are only trimmed in the first and the last year of the range.
        before_start = year == start_year and month < start_month
        after_end = year == end_year and month > end_month
        return not (before_start or after_end)

    return [
        f"{year}{month:02d}"
        for year in range(start_year, end_year + 1)
        for month in range(1, 13)
        if _inside_range(year, month)
    ]

months = generate_months()

# One dataset repository per month and record type, each holding a data.parquet.
accession_urls = [
    f"{BASE_URL}/opm-federal-accessions-{yyyymm}/resolve/main/data.parquet"
    for yyyymm in months
]
separation_urls = [
    f"{BASE_URL}/opm-federal-separations-{yyyymm}/resolve/main/data.parquet"
    for yyyymm in months
]

month_count = len(accession_urls)
print(f"Loading {month_count} months of accessions and separations...")

In [None]:
%%time
# @title Load into DuckDB
# Single connection reused by every query in the notebook.
db = duckdb.connect()

# read_parquet() receives a SQL list literal, so each URL is single-quoted.
acc_url_list = ", ".join(f"'{parquet_url}'" for parquet_url in accession_urls)
create_accessions_sql = f"""
    CREATE TABLE accessions AS 
    SELECT *, personnel_action_effective_date_yyyymm as month
    FROM read_parquet([{acc_url_list}])
"""
db.execute(create_accessions_sql)

sep_url_list = ", ".join(f"'{parquet_url}'" for parquet_url in separation_urls)
create_separations_sql = f"""
    CREATE TABLE separations AS 
    SELECT *, personnel_action_effective_date_yyyymm as month
    FROM read_parquet([{sep_url_list}])
"""
db.execute(create_separations_sql)

# Totals are sums of the `count` column rather than row counts
# (presumably each row is a pre-aggregated group - confirm against the dataset).
(acc_count,) = db.execute("SELECT SUM(CAST(count AS INTEGER)) FROM accessions").fetchone()
(sep_count,) = db.execute("SELECT SUM(CAST(count AS INTEGER)) FROM separations").fetchone()
print(f"Loaded {acc_count:,} accessions and {sep_count:,} separations")

## Overall State Data Quality

First, let's see how much state data is redacted or missing across all federal hires.

In [None]:
# @title Overall State Data Quality
# Split all accessions into REDACTED / NO DATA REPORTED / everything else.
# NOTE(review): a NULL duty_station_state would fall into 'Valid State' because
# `NULL IN (...)` is not true - confirm the column has no NULLs.
quality = db.execute("""
    SELECT 
        CASE 
            WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') THEN duty_station_state
            ELSE 'Valid State'
        END as state_status,
        SUM(CAST(count AS INTEGER)) as people
    FROM accessions
    GROUP BY 1
    ORDER BY people DESC
""").df()

total = quality['people'].sum()
quality['pct'] = (quality['people'] / total * 100).round(2)

print("State Data Quality - All Accessions (2021-2025)")
print("="*50)
for _, row in quality.iterrows():
    # The summed column may arrive from DuckDB as a float; format head-counts
    # as integers so they print as 1,234 rather than 1,234.0.
    print(f"{row['state_status']:20} {int(row['people']):>12,} ({row['pct']:>5.1f}%)")
print(f"{'TOTAL':20} {int(total):>12,}")

In [None]:
# @title State Redaction Over Time
# Per month: total accessions, redacted/missing accessions and their share.
# The LEFT JOIN keeps months that have no redacted rows (reported as 0).
monthly_redaction_sql = """
    WITH totals AS (
        SELECT month, SUM(CAST(count AS INTEGER)) as total
        FROM accessions
        GROUP BY month
    ),
    redacted AS (
        SELECT month, SUM(CAST(count AS INTEGER)) as redacted_count
        FROM accessions
        WHERE duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
        GROUP BY month
    )
    SELECT t.month,
           t.total,
           COALESCE(r.redacted_count, 0) as redacted,
           ROUND(100.0 * COALESCE(r.redacted_count, 0) / t.total, 2) as pct_redacted
    FROM totals t
    LEFT JOIN redacted r ON t.month = r.month
    ORDER BY t.month
"""
monthly_quality = db.execute(monthly_redaction_sql).df()

# `month` is parsed as YYYYMM so the x axis becomes a real time axis.
monthly_quality['date'] = pd.to_datetime(monthly_quality['month'], format='%Y%m')

redaction_trend = go.Scatter(
    x=monthly_quality['date'],
    y=monthly_quality['pct_redacted'],
    mode='lines+markers',
    line={'color': '#e74c3c', 'width': 2},
    hovertemplate='%{x|%b %Y}<br>%{y:.1f}% redacted<extra></extra>',
)

fig = go.Figure(data=[redaction_trend])
fig.update_layout(
    title='Percentage of Accessions with Redacted/Missing State Data',
    xaxis_title='Month',
    yaxis_title='% Redacted or Missing',
    template='plotly_white',
    height=400,
)
fig.show()

## Which Agencies Have the Most State Redaction?

Some agencies may redact location data more than others due to security concerns.

In [None]:
# @title Redaction Rate by Agency (agencies with 1,000+ accessions)
# Redaction rate for every agency with at least 1,000 accessions, highest first.
# The result is deliberately NOT truncated with LIMIT: later cells take both
# nlargest() and nsmallest() from this frame, and a "top 30 by rate" cut-off
# would make the "lowest redaction" ranking the lowest of the highest 30 only.
agency_quality = db.execute("""
    WITH agency_totals AS (
        SELECT agency, SUM(CAST(count AS INTEGER)) as total
        FROM accessions
        GROUP BY agency
        HAVING total >= 1000  -- Only agencies with significant hiring
    ),
    agency_redacted AS (
        SELECT agency, SUM(CAST(count AS INTEGER)) as redacted
        FROM accessions
        WHERE duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
        GROUP BY agency
    )
    SELECT t.agency,
           t.total,
           COALESCE(r.redacted, 0) as redacted,
           ROUND(100.0 * COALESCE(r.redacted, 0) / t.total, 2) as pct_redacted
    FROM agency_totals t
    LEFT JOIN agency_redacted r ON t.agency = r.agency
    ORDER BY pct_redacted DESC
""").df()

# Show table (only the 15 highest-rate agencies are displayed)
print("Agencies with Highest State Redaction Rates")
print("(Minimum 1,000 accessions)")
print("="*80)
display(agency_quality.head(15))

# The download contains every qualifying agency, not just the displayed rows.
download_csv(agency_quality, 'agency_state_redaction.csv')

In [None]:
# @title Agency Redaction Rates - Bar Chart
# The 15 rows of agency_quality with the highest redaction rate.
top_redacted = agency_quality.nlargest(15, 'pct_redacted')

# Horizontal bars labelled with the rate; the hover adds the agency's total.
top_agency_bars = go.Bar(
    x=top_redacted['pct_redacted'],
    y=top_redacted['agency'],
    orientation='h',
    marker_color='#e74c3c',
    text=top_redacted['pct_redacted'].map('{:.1f}%'.format),
    textposition='outside',
    customdata=top_redacted['total'],
    hovertemplate='%{y}<br>%{x:.1f}% redacted<br>Total: %{customdata:,}<extra></extra>',
)

fig = go.Figure(data=[top_agency_bars])
fig.update_layout(
    title='Top 15 Agencies by State Redaction Rate',
    xaxis_title='% of Accessions with Redacted State',
    # Reversed axis puts the first (highest) row at the top of the chart.
    yaxis={'autorange': 'reversed'},
    template='plotly_white',
    height=500,
    margin={'l': 350},
)
fig.show()

In [None]:
# @title Agencies with LOWEST Redaction (Best Data Quality)
# The 15 rows of agency_quality with the lowest redaction rate.
# NOTE(review): this only ranks rows present in agency_quality; if the query
# that builds it is truncated with LIMIT, these are the lowest of that subset,
# not the lowest of all agencies.
best_quality = agency_quality.nsmallest(15, 'pct_redacted')

best_agency_bars = go.Bar(
    x=best_quality['pct_redacted'],
    y=best_quality['agency'],
    orientation='h',
    marker_color='#2ecc71',
    text=best_quality['pct_redacted'].map('{:.1f}%'.format),
    textposition='outside',
    customdata=best_quality['total'],
    hovertemplate='%{y}<br>%{x:.1f}% redacted<br>Total: %{customdata:,}<extra></extra>',
)

fig = go.Figure(data=[best_agency_bars])
fig.update_layout(
    title='Top 15 Agencies with BEST State Data Quality (Lowest Redaction)',
    xaxis_title='% of Accessions with Redacted State',
    # Reversed axis puts the first (lowest-rate) row at the top of the chart.
    yaxis={'autorange': 'reversed'},
    template='plotly_white',
    height=500,
    margin={'l': 350},
)
fig.show()

## Which Occupations Have the Most State Redaction?

Certain job types may have systematically redacted locations.

In [None]:
# @title Redaction Rate by Occupation Series
# Redaction rate for every occupational series with at least 500 accessions,
# highest rate first. Series without any redacted rows are kept with 0.
occupation_redaction_sql = """
    WITH occ_totals AS (
        SELECT occupational_series, SUM(CAST(count AS INTEGER)) as total
        FROM accessions
        GROUP BY occupational_series
        HAVING total >= 500  -- Only occupations with significant hiring
    ),
    occ_redacted AS (
        SELECT occupational_series, SUM(CAST(count AS INTEGER)) as redacted
        FROM accessions
        WHERE duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
        GROUP BY occupational_series
    )
    SELECT t.occupational_series,
           t.total,
           COALESCE(r.redacted, 0) as redacted,
           ROUND(100.0 * COALESCE(r.redacted, 0) / t.total, 2) as pct_redacted
    FROM occ_totals t
    LEFT JOIN occ_redacted r ON t.occupational_series = r.occupational_series
    ORDER BY pct_redacted DESC
"""
occ_quality = db.execute(occupation_redaction_sql).df()

occupation_count = len(occ_quality)
print(f"Total occupations analyzed: {occupation_count}")
print("\nOccupations with HIGHEST State Redaction:")
print("="*80)
# Only the first 20 rows are shown; the CSV below contains all of them.
display(occ_quality.head(20))

download_csv(occ_quality, 'occupation_state_redaction.csv')

In [None]:
# @title Top 20 Occupations by Redaction Rate
# The 20 occupational series with the highest redaction rate.
top_occ_redacted = occ_quality.nlargest(20, 'pct_redacted')

top_occupation_bars = go.Bar(
    x=top_occ_redacted['pct_redacted'],
    y=top_occ_redacted['occupational_series'],
    orientation='h',
    marker_color='#e74c3c',
    text=top_occ_redacted['pct_redacted'].map('{:.1f}%'.format),
    textposition='outside',
    customdata=top_occ_redacted['total'],
    hovertemplate='%{y}<br>%{x:.1f}% redacted<br>Total hires: %{customdata:,}<extra></extra>',
)

fig = go.Figure(data=[top_occupation_bars])
fig.update_layout(
    title='Top 20 Occupations by State Redaction Rate',
    xaxis_title='% of Accessions with Redacted State',
    # Reversed axis puts the first (highest) row at the top of the chart.
    yaxis={'autorange': 'reversed'},
    template='plotly_white',
    height=600,
    margin={'l': 400},
)
fig.show()

In [None]:
# @title Occupations with BEST Data Quality (Lowest Redaction)
# The 20 occupational series with the lowest redaction rate.
best_occ = occ_quality.nsmallest(20, 'pct_redacted')

# Two decimals here (rather than one) because these rates sit close to zero.
best_occupation_bars = go.Bar(
    x=best_occ['pct_redacted'],
    y=best_occ['occupational_series'],
    orientation='h',
    marker_color='#2ecc71',
    text=best_occ['pct_redacted'].map('{:.2f}%'.format),
    textposition='outside',
    customdata=best_occ['total'],
    hovertemplate='%{y}<br>%{x:.2f}% redacted<br>Total hires: %{customdata:,}<extra></extra>',
)

fig = go.Figure(data=[best_occupation_bars])
fig.update_layout(
    title='Top 20 Occupations with BEST State Data Quality',
    xaxis_title='% of Accessions with Redacted State',
    yaxis={'autorange': 'reversed'},
    template='plotly_white',
    height=600,
    margin={'l': 400},
)
fig.show()

## Redaction by Occupational Group

Let's look at broader job categories to see patterns.

In [None]:
# @title Redaction by Occupational Group
# Redaction rate per occupational group (no minimum size), highest first.
group_redaction_sql = """
    WITH grp_totals AS (
        SELECT occupational_group, SUM(CAST(count AS INTEGER)) as total
        FROM accessions
        GROUP BY occupational_group
    ),
    grp_redacted AS (
        SELECT occupational_group, SUM(CAST(count AS INTEGER)) as redacted
        FROM accessions
        WHERE duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
        GROUP BY occupational_group
    )
    SELECT t.occupational_group,
           t.total,
           COALESCE(r.redacted, 0) as redacted,
           ROUND(100.0 * COALESCE(r.redacted, 0) / t.total, 2) as pct_redacted
    FROM grp_totals t
    LEFT JOIN grp_redacted r ON t.occupational_group = r.occupational_group
    ORDER BY pct_redacted DESC
"""
group_quality = db.execute(group_redaction_sql).df()


def _severity_color(pct):
    """Map a redaction percentage to red (>5), orange (>1) or green."""
    if pct > 5:
        return '#e74c3c'
    if pct > 1:
        return '#f39c12'
    return '#2ecc71'


group_bars = go.Bar(
    x=group_quality['pct_redacted'],
    y=group_quality['occupational_group'],
    orientation='h',
    marker_color=group_quality['pct_redacted'].apply(_severity_color),
    text=group_quality['pct_redacted'].map('{:.1f}%'.format),
    textposition='outside',
    customdata=group_quality['total'],
    hovertemplate='%{y}<br>%{x:.1f}% redacted<br>Total: %{customdata:,}<extra></extra>',
)

fig = go.Figure(data=[group_bars])
fig.update_layout(
    title='State Redaction Rate by Occupational Group',
    xaxis_title='% Redacted',
    yaxis={'autorange': 'reversed'},
    template='plotly_white',
    height=700,
    margin={'l': 350},
)
fig.show()

download_csv(group_quality, 'occupational_group_redaction.csv')

## Deep Dive: Which Agency + Occupation Combos Have Most Redaction?

Let's find the specific agency/job combinations with the highest redaction.

In [None]:
# @title Agency + Occupation Combinations with Highest Redaction
# Top 50 (agency, occupational series) pairs by redaction rate, restricted to
# pairs with at least 100 accessions and at least one redacted record.
combo_redaction_sql = """
    WITH combo_totals AS (
        SELECT agency, occupational_series, SUM(CAST(count AS INTEGER)) as total
        FROM accessions
        GROUP BY agency, occupational_series
        HAVING total >= 100  -- Minimum sample size
    ),
    combo_redacted AS (
        SELECT agency, occupational_series, SUM(CAST(count AS INTEGER)) as redacted
        FROM accessions
        WHERE duty_station_state IN ('REDACTED', 'NO DATA REPORTED')
        GROUP BY agency, occupational_series
    )
    SELECT t.agency,
           t.occupational_series,
           t.total,
           COALESCE(r.redacted, 0) as redacted,
           ROUND(100.0 * COALESCE(r.redacted, 0) / t.total, 2) as pct_redacted
    FROM combo_totals t
    LEFT JOIN combo_redacted r 
        ON t.agency = r.agency AND t.occupational_series = r.occupational_series
    WHERE COALESCE(r.redacted, 0) > 0  -- Only show combos with some redaction
    ORDER BY pct_redacted DESC
    LIMIT 50
"""
combo_quality = db.execute(combo_redaction_sql).df()

banner_rule = "="*100
print("Agency + Occupation Combinations with Highest Redaction")
print("(Minimum 100 accessions, showing top 50 with any redaction)")
print(banner_rule)
# 30 rows are displayed; all 50 go into the CSV.
display(combo_quality.head(30))

download_csv(combo_quality, 'agency_occupation_redaction.csv')

In [None]:
# @title Heatmap: Top Agencies vs Top Occupations (by redaction volume)
# Get top agencies and occupations by total redacted volume, then report the
# redaction rate for each agency/occupation pair with at least 50 accessions.
heatmap_data = db.execute("""
    WITH data AS (
        SELECT agency, occupational_series, 
               SUM(CAST(count AS INTEGER)) as total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') 
                        THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
        FROM accessions
        GROUP BY agency, occupational_series
    ),
    top_agencies AS (
        SELECT agency FROM data GROUP BY agency ORDER BY SUM(redacted) DESC LIMIT 10
    ),
    top_occs AS (
        SELECT occupational_series FROM data GROUP BY occupational_series ORDER BY SUM(redacted) DESC LIMIT 10
    )
    SELECT d.agency, d.occupational_series, d.total, d.redacted,
           ROUND(100.0 * d.redacted / d.total, 1) as pct_redacted
    FROM data d
    WHERE d.agency IN (SELECT agency FROM top_agencies)
      AND d.occupational_series IN (SELECT occupational_series FROM top_occs)
      AND d.total >= 50
""").df()

# Pivot for heatmap. Pairs that are absent from the query result (no hires, or
# fewer than 50) stay NaN on purpose: filling them with 0 would colour them as
# "0% redacted", indistinguishable from pairs with genuinely clean data.
pivot = heatmap_data.pivot(index='agency', columns='occupational_series', values='pct_redacted')

fig = go.Figure(data=go.Heatmap(
    z=pivot.values,
    x=pivot.columns,
    y=pivot.index,
    colorscale='RdYlGn_r',
    # Empty cells carry no data, so they should not show a hover label.
    hoverongaps=False,
    hovertemplate='Agency: %{y}<br>Occupation: %{x}<br>Redaction: %{z:.1f}%<extra></extra>'
))

fig.update_layout(
    title='State Redaction Rate: Top Agencies vs Top Occupations',
    xaxis_title='Occupation Series',
    yaxis_title='Agency',
    template='plotly_white',
    height=500,
    xaxis=dict(tickangle=45),
    margin=dict(l=300, b=200)
)
fig.show()

## Separations: Same Pattern?

Let's check if the redaction patterns are similar for separations (departures).

In [None]:
# @title Compare Accessions vs Separations Redaction
# One summary row per table: total head-count, redacted/missing head-count
# and the resulting percentage.
comparison_sql = """
    WITH acc AS (
        SELECT 'Accessions' as data_type,
               SUM(CAST(count AS INTEGER)) as total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') 
                        THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
        FROM accessions
    ),
    sep AS (
        SELECT 'Separations' as data_type,
               SUM(CAST(count AS INTEGER)) as total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') 
                        THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
        FROM separations
    )
    SELECT *, ROUND(100.0 * redacted / total, 2) as pct_redacted
    FROM acc
    UNION ALL
    SELECT *, ROUND(100.0 * redacted / total, 2) as pct_redacted
    FROM sep
"""
comparison = db.execute(comparison_sql).df()

heading = "Redaction Comparison: Accessions vs Separations"
print(heading)
print("="*60)
display(comparison)

In [None]:
# @title Agency Redaction: Accessions vs Separations Side-by-Side
# Per agency: totals and redaction rates in both tables. The FULL OUTER JOIN
# keeps agencies that appear in only one table; NULLIF avoids dividing by a
# zero total. Kept: agencies with 1,000+ combined records, top 20 by combined
# redacted head-count.
agency_compare_sql = """
    WITH acc AS (
        SELECT agency,
               SUM(CAST(count AS INTEGER)) as acc_total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') 
                        THEN CAST(count AS INTEGER) ELSE 0 END) as acc_redacted
        FROM accessions
        GROUP BY agency
    ),
    sep AS (
        SELECT agency,
               SUM(CAST(count AS INTEGER)) as sep_total,
               SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') 
                        THEN CAST(count AS INTEGER) ELSE 0 END) as sep_redacted
        FROM separations
        GROUP BY agency
    )
    SELECT COALESCE(a.agency, s.agency) as agency,
           COALESCE(a.acc_total, 0) as accessions_total,
           ROUND(100.0 * COALESCE(a.acc_redacted, 0) / NULLIF(a.acc_total, 0), 2) as accessions_pct_redacted,
           COALESCE(s.sep_total, 0) as separations_total,
           ROUND(100.0 * COALESCE(s.sep_redacted, 0) / NULLIF(s.sep_total, 0), 2) as separations_pct_redacted
    FROM acc a
    FULL OUTER JOIN sep s ON a.agency = s.agency
    WHERE COALESCE(a.acc_total, 0) + COALESCE(s.sep_total, 0) >= 1000
    ORDER BY (COALESCE(a.acc_redacted, 0) + COALESCE(s.sep_redacted, 0)) DESC
    LIMIT 20
"""
agency_compare = db.execute(agency_compare_sql).df()

print("Agency Redaction Rates: Accessions vs Separations")
display(agency_compare)

download_csv(agency_compare, 'agency_accessions_separations_redaction.csv')

## Summary Statistics

In [None]:
# @title Summary: Key Findings
banner = "="*70
print(banner)
print("KEY FINDINGS: State Data Redaction in Federal Workforce Data")
print(banner)

# Overall stats: (total, redacted) head-counts across all accessions
overall = db.execute("""
    SELECT 
        SUM(CAST(count AS INTEGER)) as total,
        SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') 
                 THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
    FROM accessions
""").fetchone()
overall_total, overall_redacted = overall
print(f"\nOverall: {overall_redacted:,} of {overall_total:,} accessions ({100*overall_redacted/overall_total:.1f}%) have redacted state")

# Highest agency: (agency, total, redacted) among agencies with 1,000+ accessions
top_agency = db.execute("""
    SELECT agency, 
           SUM(CAST(count AS INTEGER)) as total,
           SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') 
                    THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
    FROM accessions
    GROUP BY agency
    HAVING total >= 1000
    ORDER BY 100.0 * redacted / total DESC
    LIMIT 1
""").fetchone()
agency_name, agency_total, agency_redacted = top_agency
print(f"\nHighest redaction agency: {agency_name}")
print(f"   {agency_redacted:,} of {agency_total:,} ({100*agency_redacted/agency_total:.1f}%)")

# Highest occupation: (series, total, redacted) among series with 500+ accessions
top_occ = db.execute("""
    SELECT occupational_series, 
           SUM(CAST(count AS INTEGER)) as total,
           SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') 
                    THEN CAST(count AS INTEGER) ELSE 0 END) as redacted
    FROM accessions
    GROUP BY occupational_series
    HAVING total >= 500
    ORDER BY 100.0 * redacted / total DESC
    LIMIT 1
""").fetchone()
occ_name, occ_total, occ_redacted = top_occ
print(f"\nHighest redaction occupation: {occ_name}")
print(f"   {occ_redacted:,} of {occ_total:,} ({100*occ_redacted/occ_total:.1f}%)")

# Count of 100% valid: series (500+ accessions) with no redacted record at all
(perfect_occs,) = db.execute("""
    SELECT COUNT(*) FROM (
        SELECT occupational_series
        FROM accessions
        GROUP BY occupational_series
        HAVING SUM(CAST(count AS INTEGER)) >= 500
           AND SUM(CASE WHEN duty_station_state IN ('REDACTED', 'NO DATA REPORTED') 
                        THEN CAST(count AS INTEGER) ELSE 0 END) = 0
    )
""").fetchone()
# Denominator: every series that meets the same 500-accession threshold
(total_occs,) = db.execute("""
    SELECT COUNT(*) FROM (
        SELECT occupational_series
        FROM accessions
        GROUP BY occupational_series
        HAVING SUM(CAST(count AS INTEGER)) >= 500
    )
""").fetchone()
print(f"\nOccupations with 0% redaction: {perfect_occs} of {total_occs} ({100*perfect_occs/total_occs:.0f}%)")
print("   (Occupations with 500+ accessions)")

print("\n" + banner)

## Query Your Own Analysis

Use SQL to explore specific agencies or occupations:

```python
# Check redaction for a specific agency
db.execute("""
    SELECT duty_station_state, SUM(CAST(count AS INTEGER)) as n
    FROM accessions
    WHERE agency LIKE '%YOUR AGENCY%'
    GROUP BY duty_station_state
    ORDER BY n DESC
""").df()

# Check redaction for a specific occupation
db.execute("""
    SELECT duty_station_state, SUM(CAST(count AS INTEGER)) as n
    FROM accessions
    WHERE occupational_series LIKE '%YOUR OCCUPATION%'
    GROUP BY duty_station_state
    ORDER BY n DESC
""").df()
```