In [1]:
# Flags MPV incidents by whether sheriffs offices or local police departments were responsible
# 
# This notebook processes the Mapping Police Violence data to identify which incidents
# involved sheriff's offices versus local police departments. This is crucial for our
# analysis because the MPV data doesn't always clearly indicate agency type.
# 
# We use two approaches to identify sheriff involvement:
# 1. Match ORI codes from MPV data to our ORI lookup table created in notebook 1
# 2. Search for "sheriff" in the agency name field
# 
# Both methods are used because neither alone captures all cases perfectly.

In [2]:
import pandas as pd

In [3]:
# Load the ORI lookup table created in notebook 1
# This table maps ORI codes to agency types (sheriff vs police)
# We use this to identify agency types for incidents that have ORI codes

leoka_lookup = (
    pd.read_csv(
        "../outputs/leoka_ori_type_lookup.csv",
        na_values = ["nan"],
        dtype = {"ori": str, "ori9":str},  # Ensure ORI codes are treated as strings
    )
    .assign(
        # Fill missing ori9 values with ori values for consistency
        ori9 = lambda x: x["ori9"].fillna(x["ori"].astype(str))
    )
)

# Create lists of ORI codes for sheriff's offices and police departments
# These will be used to check if an incident's ORI matches either type
sheriff_oris = leoka_lookup[leoka_lookup["agency_type"] == "sheriffs office"]["ori9"].unique()
police_oris = leoka_lookup[leoka_lookup["agency_type"] == "local police department"]["ori9"].unique()

print(f"Sheriff ORI codes: {len(sheriff_oris)}")
print(f"Police ORI codes: {len(police_oris)}")
print(f"Sample sheriff ORI: {sheriff_oris[0] if len(sheriff_oris) > 0 else 'None'}")
print(f"Sample police ORI: {police_oris[0] if len(police_oris) > 0 else 'None'}")

sheriff_oris, police_oris

Sheriff ORI codes: 3063
Police ORI codes: 14263
Sample sheriff ORI: AL0010000
Sample police ORI: AK0010100


(array(['AL0010000', 'AL0020000', 'AL0030000', ..., 'WY0210000',
        'WY0220000', 'WY0230000'], dtype=object),
 array(['AK0010100', 'AK0010200', 'AK0010300', ..., 'WY0230100',
        'WY0230200', 'WYDI05000'], dtype=object))

In [4]:
# Load the Mapping Police Violence dataset
# This dataset contains information about people killed by police, including:
# - Victim demographics and incident details
# - Agency information (sometimes including ORI codes)
# - Circumstances of the incident
# 
# We parse the date column to enable time-based analysis
# The dataset is indexed by incident to create unique identifiers

incidents = (
    pd.read_csv(
        "../data/Mapping Police Violence.csv",
        parse_dates = ["date"],  # Parse date column for time-based analysis
    )
    .rename_axis("incident_uid")  # Create unique identifier for each incident
    .reset_index()
)

print(f"Loaded {len(incidents)} incidents from Mapping Police Violence dataset")
print(f"Date range: {incidents['date'].min()} to {incidents['date'].max()}")
print(f"Incidents with ORI codes: {incidents['ori'].notna().sum()}")
incidents.head(3)

Loaded 14676 incidents from Mapping Police Violence dataset
Date range: 2012-05-22 00:00:00 to 2025-06-30 00:00:00
Incidents with ORI codes: 14555


Unnamed: 0,incident_uid,name,age,gender,race,victim_image,date,street_address,city,state,...,congressperson_party,prosecutor_head,prosecutor_race,prosecutor_gender,prosecutor_party,prosecutor_term,prosecutor_in_court,prosecutor_special,independent_investigation,prosecutor_url
0,0,David Bahrami,26.0,Male,White,https://antiochherald.com/wp-content/uploads/2...,2025-06-30,3841 Osprey Dr,Antioch,California,...,,,,,,,,,,
1,1,Daphine Mae Jennings,58.0,Female,White,,2025-06-30,200 block of Mountain Laurel Drive,La Grange,Georgia,...,,,,,,,,,,
2,2,Joshua B. Coffey,48.0,Male,White,,2025-06-30,1400 block of South Third Street,Niles,Michigan,...,,,,,,,,,,


In [5]:
# Flag incidents by agency type using two complementary methods
# 
# METHOD 1: ORI CODE MATCHING
# - Check if the incident's ORI code matches any sheriff or police ORI in our lookup table
# - This is the most reliable method when ORI codes are available
# 
# METHOD 2: AGENCY NAME KEYWORD SEARCH  
# - Search for "sheriff" or "police" in the agency name field
# - This catches cases where ORI codes are missing or don't match our lookup table
# 
# COMBINED APPROACH:
# - We use both methods because each has limitations:
#   - ORI matching: Not all incidents have ORI codes
#   - Keyword search: Can have false positives/negatives
# - An incident is flagged as sheriff/police if EITHER method identifies it as such

incidents_flagged = (
    incidents
    .assign(
        # METHOD 1: ORI CODE MATCHING
        # Check if incident ORI appears in our list of sheriff ORIs
        sheriff_ori = lambda x: x["ori"].str.lower().apply( 
            lambda ori: any( [sheriff_ori in str(ori).lower() for sheriff_ori in sheriff_oris] ) 
        ),
        # Check if incident ORI appears in our list of police ORIs  
        police_ori = lambda x: x["ori"].str.lower().apply( 
            lambda ori: any( [police_ori in str(ori).lower() for police_ori in police_oris] ) 
        ),
        
        # METHOD 2: KEYWORD SEARCH IN AGENCY NAME
        # Search for "sheriff" in the agency name (case insensitive)
        has_sheriff_in_name = lambda x: x["agency_responsible"].str.contains("sheriff", case=False, na=False),
        # Search for "police" in the agency name (case insensitive)
        has_police_in_name = lambda x: x["agency_responsible"].str.contains("police", case=False, na=False),
    )
    .assign(
        # COMBINED FLAGS: Use OR logic to combine both methods
        # An incident is flagged as sheriff if EITHER ORI matches OR name contains "sheriff"
        sheriff = lambda x: x["sheriff_ori"] | x["has_sheriff_in_name"],
        # An incident is flagged as police if EITHER ORI matches OR name contains "police"  
        police = lambda x: x["police_ori"] | x["has_police_in_name"],
        # Everything else is categorized as "others" (federal, state, etc.)
        others = lambda x: ~(x["sheriff"] | x["police"]),
    )
)

print(f"Flagging results:")
print(f"Sheriff incidents: {incidents_flagged['sheriff'].sum()}")
print(f"Police incidents: {incidents_flagged['police'].sum()}")
print(f"Other incidents: {incidents_flagged['others'].sum()}")
print(f"Total incidents: {len(incidents_flagged)}")

incidents_flagged.head(3)

Flagging results:
Sheriff incidents: 4384
Police incidents: 10170
Other incidents: 768
Total incidents: 14676


Unnamed: 0,incident_uid,name,age,gender,race,victim_image,date,street_address,city,state,...,prosecutor_special,independent_investigation,prosecutor_url,sheriff_ori,police_ori,has_sheriff_in_name,has_police_in_name,sheriff,police,others
0,0,David Bahrami,26.0,Male,White,https://antiochherald.com/wp-content/uploads/2...,2025-06-30,3841 Osprey Dr,Antioch,California,...,,,,False,False,False,True,False,True,False
1,1,Daphine Mae Jennings,58.0,Female,White,,2025-06-30,200 block of Mountain Laurel Drive,La Grange,Georgia,...,,,,False,False,True,False,True,False,False
2,2,Joshua B. Coffey,48.0,Male,White,,2025-06-30,1400 block of South Third Street,Niles,Michigan,...,,,,False,False,True,False,True,False,False


In [6]:
# Save the flagged incidents data for use in analysis notebooks
# This is a key output that will be used in subsequent analysis to:
# - Calculate incident rates by agency type
# - Analyze geographic patterns
# - Compare sheriff vs police involvement in lethal force incidents

incidents_flagged.to_csv("../outputs/incidents_flagged.csv", index = False)
print("Saved flagged incidents data to ../outputs/incidents_flagged.csv")
print("This file contains all MPV incidents with sheriff/police/other flags")

Saved flagged incidents data to ../outputs/incidents_flagged.csv
This file contains all MPV incidents with sheriff/police/other flags


## State-level analysis of incidents by agency type

This section analyzes incidents by state to understand geographic patterns
in sheriff vs police involvement in lethal force incidents

In [7]:
# Create a simplified version of the incidents data for easier analysis
# This focuses on the key variables needed for state-level analysis:
# - State and year for geographic and temporal analysis
# - Agency type flags for counting incidents by type

incidents_short = (
    incidents_flagged
    .assign(
        year = lambda f: f["date"].dt.year,  # Extract year from date for temporal analysis
    )
    [["incident_uid", "state", "year", "sheriff", "police", "others"]]  # Select only needed columns
)

print(f"Created simplified incidents dataset with {len(incidents_short)} records")
print(f"Years covered: {incidents_short['year'].min()} to {incidents_short['year'].max()}")
incidents_short.head()

Created simplified incidents dataset with 14676 records
Years covered: 2012 to 2025


Unnamed: 0,incident_uid,state,year,sheriff,police,others
0,0,California,2025,False,True,False
1,1,Georgia,2025,True,False,False
2,2,Michigan,2025,True,False,False
3,3,Florida,2025,True,False,False
4,4,North Carolina,2025,False,True,False


In [8]:
# Calculate incident counts and percentages by state and year
# This creates a comprehensive table showing:
# - Total incidents per state per year
# - Count of sheriff, police, and other incidents
# - Percentage breakdown by agency type
# 
# This data will be used to:
# - Identify states with high sheriff involvement in lethal force
# - Track changes over time
# - Compare with arrest data to calculate rates

incidents_by_state = (
    incidents_short
    .groupby(
        ["state", "year"]
    )
    .agg(
        total = ("incident_uid", "count"),      # Total incidents in state/year
        sheriff = ("sheriff", "sum"),           # Sheriff incidents (boolean sum)
        police = ("police", "sum"),             # Police incidents (boolean sum)
        others = ("others", "sum")              # Other agency incidents (boolean sum)
    )
    .assign(
        # Calculate percentages to understand agency involvement patterns
        pct_sheriff_incidents = lambda x: x["sheriff"] / x["total"],
        pct_police_incidents = lambda x: x["police"] / x["total"],
        pct_others_incidents = lambda x: x["others"] / x["total"],
    )
    .reset_index()
)

print(f"Created state-level incidents data with {len(incidents_by_state)} state-year records")
print(f"Sample of states with highest sheriff involvement:")
sample_high_sheriff = incidents_by_state.nlargest(5, 'pct_sheriff_incidents')[['state', 'year', 'total', 'sheriff', 'pct_sheriff_incidents']]
print(sample_high_sheriff.to_string(index=False))

incidents_by_state.head()

Created state-level incidents data with 656 state-year records
Sample of states with highest sheriff involvement:
         state  year  total  sheriff  pct_sheriff_incidents
      Nebraska  2025      1        1               1.000000
 New Hampshire  2014      1        1               1.000000
       Wyoming  2013      1        1               1.000000
South Carolina  2018     14       12               0.857143
         Idaho  2017      6        5               0.833333


Unnamed: 0,state,year,total,sheriff,police,others,pct_sheriff_incidents,pct_police_incidents,pct_others_incidents
0,Alabama,2013,20,8,10,2,0.4,0.5,0.1
1,Alabama,2014,18,8,10,0,0.444444,0.555556,0.0
2,Alabama,2015,18,6,11,2,0.333333,0.611111,0.111111
3,Alabama,2016,25,8,20,0,0.32,0.8,0.0
4,Alabama,2017,26,9,17,0,0.346154,0.653846,0.0


In [9]:
# Load the agency staffing data created in notebook 1
# This data shows the number of agencies, officers, and total staff by state, year, and agency type
# We'll use this to understand the relationship between staffing levels and incident patterns

agency_staff = pd.read_csv("../outputs/agency_staff_all_years.csv")
print(f"Loaded staffing data with {len(agency_staff)} records")
print(f"Agency types in staffing data: {agency_staff['agency_type'].unique()}")
agency_staff.head()

Loaded staffing data with 12255 records
Agency types in staffing data: ['local police department' 'state law enforcement agency'
 'sheriffs office' 'special jurisdiction' 'constable/marshal' 'federal']


Unnamed: 0,year,state_abb,agency_type,agencies,officers,total_staff
0,1960,AK,local police department,6,110.0,138.0
1,1960,AK,state law enforcement agency,1,0.0,0.0
2,1960,AL,local police department,106,1815.0,2023.0
3,1960,AL,sheriffs office,67,0.0,0.0
4,1960,AR,local police department,66,731.0,780.0


In [10]:
# Create a staffing summary by state for 2022 (most recent year with complete data)
# This shows the relative size and importance of sheriff's offices vs police departments
# in each state by calculating:
# - Total staff for each agency type
# - Percentage of total law enforcement staff in sheriff's offices vs police departments
# 
# This is important because it shows which states rely more heavily on sheriffs
# for law enforcement, which may correlate with incident patterns

staff_by_state = (
    agency_staff
    .loc[ lambda x: x["year"] == 2023 ]  # Use most recent complete year
    .groupby(
        ["agency_type", "state_abb"]
    )
    .sum()  # Sum staff across all agencies of each type in each state
    .reset_index()
    .pivot_table(
        index = ["state_abb"],
        columns = "agency_type",
        values = "total_staff",
    )
    .fillna(0)  # Fill missing values with 0 for states without certain agency types
    .reset_index()
    .assign(
        # Combine smaller agency types into "other" category
        other = lambda x: x["state law enforcement agency"] + x["federal"] + x["special jurisdiction"],
        # Calculate total law enforcement staff
        total_agency_staff = lambda x: x["local police department"] + x["sheriffs office"] + x["other"],
        # Calculate percentages to show relative importance of each agency type
        pct_sheriff = lambda x: x["sheriffs office"] / x["total_agency_staff"],
        pct_local_police = lambda x: x["local police department"] / x["total_agency_staff"],
        pct_other = lambda x: x["other"] / x["total_agency_staff"],
    )
    [[
        "state_abb",
        "total_agency_staff",
        "local police department",
        "sheriffs office",
        "pct_sheriff",
        "pct_local_police",
        "pct_other",
    ]]
    .rename_axis(None, axis=1)  # Clean up column names
    .sort_values("pct_sheriff", ascending=False)  # Sort by sheriff percentage
    .reset_index(drop=True)
)

# Save this staffing analysis for use in other notebooks
staff_by_state.to_csv("../outputs/staff_by_state.csv", index=False)
print(f"Created staffing data for {len(staff_by_state)} states")
print("States with highest percentage of sheriff staff:")
print(staff_by_state.head()[['state_abb', 'pct_sheriff', 'sheriffs office', 'local police department']].to_string(index=False))

staff_by_state.head()

Created staffing data for 51 states
States with highest percentage of sheriff staff:
state_abb  pct_sheriff  sheriffs office  local police department
       LA     0.643781          12837.0                   6531.0
       MT     0.555652           1922.0                   1177.0
       FL     0.531603          39959.0                  29102.0
       ID     0.530961           2984.0                   2066.0
       SD     0.518571           1466.0                   1158.0


Unnamed: 0,state_abb,total_agency_staff,local police department,sheriffs office,pct_sheriff,pct_local_police,pct_other
0,LA,19940.0,6531.0,12837.0,0.643781,0.327533,0.028686
1,MT,3459.0,1177.0,1922.0,0.555652,0.340272,0.104076
2,FL,75167.0,29102.0,39959.0,0.531603,0.387165,0.081232
3,ID,5620.0,2066.0,2984.0,0.530961,0.367616,0.101423
4,SD,2827.0,1158.0,1466.0,0.518571,0.409622,0.071808


In [11]:
# Combine staffing data with incident data for comprehensive state-level analysis
# This creates a single dataset that shows for each state:
# - Law enforcement staffing levels by agency type
# - Incident counts and percentages by agency type  
# - This enables analysis of whether states with more sheriff staff also have more sheriff incidents
# 
# We focus on 2022 data for the most recent complete picture

combined_states = (
    staff_by_state.rename( columns = {"state_abb": "state"})  # Match column names
    .merge(
        incidents_by_state.loc[ lambda x: x["year"] == 2023 ],  # Use 2023 incident data
        on = "state",
        how = "left"  # Keep all states, even those with no incidents
    )
    .rename(
        columns = {
            "total": "total_incidents",
            "sheriff": "sheriff_incidents", 
            "police": "police_incidents",
            "others": "other_incidents"
        }
    )
    .reset_index(drop=True)
)

# Save this combined dataset for use in analysis notebooks
combined_states.to_csv("../outputs/state_staff_and_incidents.csv", index=False)
print(f"Created combined state analysis with {len(combined_states)} states")
print("This dataset enables analysis of staffing vs incident patterns by state")

# Show sample of states with both high sheriff staffing and incidents
print("\nStates with significant sheriff presence (>40% of staff) and their incident patterns:")
high_sheriff_states = combined_states[combined_states['pct_sheriff'] > 0.4]
if len(high_sheriff_states) > 0:
    print(high_sheriff_states[['state', 'pct_sheriff', 'total_incidents', 'sheriff_incidents', 'pct_sheriff_incidents']].to_string(index=False))

combined_states.head()

Created combined state analysis with 51 states
This dataset enables analysis of staffing vs incident patterns by state

States with significant sheriff presence (>40% of staff) and their incident patterns:
state  pct_sheriff  total_incidents  sheriff_incidents  pct_sheriff_incidents
   LA     0.643781              NaN                NaN                    NaN
   MT     0.555652              NaN                NaN                    NaN
   FL     0.531603              NaN                NaN                    NaN
   ID     0.530961              NaN                NaN                    NaN
   SD     0.518571              NaN                NaN                    NaN
   WY     0.499266              NaN                NaN                    NaN
   MN     0.456055              NaN                NaN                    NaN
   IA     0.452747              NaN                NaN                    NaN
   SC     0.452444              NaN                NaN                    NaN
   MS     0.44

Unnamed: 0,state,total_agency_staff,local police department,sheriffs office,pct_sheriff,pct_local_police,pct_other,year,total_incidents,sheriff_incidents,police_incidents,other_incidents,pct_sheriff_incidents,pct_police_incidents,pct_others_incidents
0,LA,19940.0,6531.0,12837.0,0.643781,0.327533,0.028686,,,,,,,,
1,MT,3459.0,1177.0,1922.0,0.555652,0.340272,0.104076,,,,,,,,
2,FL,75167.0,29102.0,39959.0,0.531603,0.387165,0.081232,,,,,,,,
3,ID,5620.0,2066.0,2984.0,0.530961,0.367616,0.101423,,,,,,,,
4,SD,2827.0,1158.0,1466.0,0.518571,0.409622,0.071808,,,,,,,,


---
---
---