In [1]:
# Process Return A and LEOKA data by state, year and agency type
# Requires these files from Open ICPSR:
    # https://www.openicpsr.org/openicpsr/project/100707/version/V20/view?path=/openicpsr/100707/fcr:versions/V20/ucr_offenses_known_yearly_1960_2022_dta.zip&type=file
    # https://www.openicpsr.org/openicpsr/project/102180/version/V13/view?path=/openicpsr/102180/fcr:versions/V13/ucr_leoka_yearly_1960_2022_rds.zip&type=fil

In [2]:
import pandas as pd
import numpy as np
import pyreadr


In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
keepcols = [
    "ori",
    "ori9",
    "fips_state_code",
    "fips_county_code",
    "agency_name",
    "state",
    "state_abb",
    "year",
    "agency_type",
   'population_group',
    "msa",
    "population",
    "covered_by",
    "total_employees_officers",
    "total_employees_civilians",
    "total_employees_total",
    "officers_killed_total",
    "assaults_no_injury_total"
]

In [5]:
# codebook https://www.icpsr.umich.edu/web/NACJD/studies/38800/datadocumentation
leoka = (
    pyreadr
    .read_r(
        # jacob kaplan's concatenated files
        f"../data/leoka_yearly_1960_2022.rds",
    )
    [None]
    [keepcols]
    .assign(
        ori=lambda x: x.ori.astype(str),
        ori9=lambda x: x.ori9.astype(str),
        year = lambda x: x.year.astype(int).astype(str),
        agency_name=lambda x: x.agency_name.astype(str),
        state=lambda x: x.state.astype(str),
        state_abb=lambda x: x.state_abb.astype(str),
        # agency_type=lambda x: x.agency_type.astype(str),
        msa=lambda x: x.msa.astype(str),
    )
)
leoka.head(3)

Unnamed: 0,ori,ori9,fips_state_code,fips_county_code,agency_name,state,state_abb,year,agency_type,population_group,msa,population,covered_by,total_employees_officers,total_employees_civilians,total_employees_total,officers_killed_total,assaults_no_injury_total
0,AK00101,AK0010100,2,20,anchorage,alaska,AK,2022,local police department,"city 250,000 thru 499,999",38,285821.0,"no, it is not covered by another agency",406.0,157.0,563.0,0.0,536.0
1,AK00101,AK0010100,2,20,anchorage,alaska,AK,2021,local police department,"city 250,000 thru 499,999",38,286238.0,"no, it is not covered by another agency",413.0,161.0,574.0,0.0,0.0
2,AK00101,AK0010100,2,20,anchorage,alaska,AK,2020,local police department,"city 250,000 thru 499,999",38,286388.0,"no, it is not covered by another agency",425.0,167.0,592.0,0.0,372.0


In [6]:
len(leoka["ori9"].unique())

22033

In [7]:
# count sheriff's offices and local police departments
sheriffs = leoka[leoka["agency_type"] == "sheriffs office"]
police = leoka[leoka["agency_type"] == "local police department"]

print(sheriffs["ori"].nunique(), police["ori"].nunique())

3063 14242


In [8]:
"""
This code calculates the ratio of unique 'ori9' values for sheriff's offices with a population greater than 10,000 
to the total number of unique 'ori9' values for all sheriff's offices.
"""

(
    leoka
    .loc[lambda x: x.agency_type == "sheriffs office"] 
    .loc[lambda x: x.population > 10000]
    ["ori9"].nunique()

/ leoka.loc[ lambda x: x.agency_type == "sheriffs office"]["ori9"].nunique()
)

0.7432211695524339

In [9]:
# check that agency types do not change over time
assert((
    leoka
    .groupby("ori")
    .agency_type
    .nunique()
    .max()
) == 1)

In [10]:
# get a lookup for ORI and type
type_lookup = (
    leoka
    .loc[
        lambda x: ~x.agency_type.isin(["special jurisdiction", "state law enforcement agency", "federal", "constable/marshal", np.nan])
    ]
    .groupby(
        ["ori", "ori9", "year"]
    ).agg(
        agency_type=("agency_type", "first"),
        agency_name = ("agency_name", "first"),
        population_group = ("population_group", "first")
    )
    .reset_index()
)

type_lookup.to_csv("../outputs/leoka_ori_type_lookup.csv", index=False)
type_lookup.head()

Unnamed: 0,ori,ori9,year,agency_type,agency_name,population_group
0,AK00101,AK0010100,1960,local police department,anchorage,"city 25,000 thru 49,999"
1,AK00101,AK0010100,1961,local police department,anchorage,"city 25,000 thru 49,999"
2,AK00101,AK0010100,1962,local police department,anchorage,"city 25,000 thru 49,999"
3,AK00101,AK0010100,1963,local police department,anchorage,"city 50,000 thru 99,999"
4,AK00101,AK0010100,1964,local police department,anchorage,"city 50,000 thru 99,999"


In [11]:
# get agency counts and staff by state for every year
num_agencies = (
    leoka
    .groupby( ["year", "state_abb", "agency_type"] )
    .agg(
        agencies = ("ori", "nunique"),
        officers = ("total_employees_officers", "sum"),
        total_staff = ("total_employees_total", "sum")
    )
    .reset_index()
)

num_agencies.to_csv("../outputs/agency_staff_all_years.csv", index = False)

num_agencies.head(10)

Unnamed: 0,year,state_abb,agency_type,agencies,officers,total_staff
0,1960,AK,local police department,6,110.0,138.0
1,1960,AK,state law enforcement agency,1,0.0,0.0
2,1960,AL,local police department,106,1815.0,2023.0
3,1960,AL,sheriffs office,67,0.0,0.0
4,1960,AR,local police department,66,731.0,780.0
5,1960,AR,sheriffs office,75,0.0,0.0
6,1960,AZ,local police department,30,1066.0,1224.0
7,1960,AZ,sheriffs office,14,0.0,0.0
8,1960,CA,local police department,354,15949.0,18934.0
9,1960,CA,sheriffs office,58,0.0,0.0


In [12]:
# group by year and type, summing the agency type population
agency_population = (
    leoka
    .groupby( ["year", "state_abb", "agency_type"])
    .agg(
        agency_population=("population", "sum"),
    )
    .reset_index()
    .pivot(
        index=["year", "state_abb"],
        columns="agency_type",
        values="agency_population"
    )
    .fillna(0)
    .assign(
        total_population=lambda x: x.sum(axis=1),
        other_pop = lambda x: x["federal"] + x["constable/marshal"] + x["special jurisdiction"] + x["state law enforcement agency"],
        percent_sheriff_pop=lambda x: (x["sheriffs office"]/x["total_population"] * 100).round(1),
        percent_police_pop=lambda x: (x["local police department"]/x["total_population"] * 100).round(1),
    )
    .sort_values("percent_sheriff_pop", ascending=False)
    .drop(columns=["federal", "constable/marshal", "special jurisdiction", "state law enforcement agency"])
    .reset_index()
    # rename index
    .pipe(lambda x: x.rename_axis(None, axis=1))
)

# Table of agency population by year and state
agency_population.to_csv("../outputs/agency_population.csv", index = False)

agency_population.head()

Unnamed: 0,year,state_abb,local police department,sheriffs office,total_population,other_pop,percent_sheriff_pop,percent_police_pop
0,1971,SC,842786.0,1823483.0,2683742.0,17473.0,67.9,31.4
1,2021,WV,576898.0,1207480.0,1784378.0,0.0,67.7,32.3
2,2022,WV,579590.0,1195365.0,1774955.0,0.0,67.3,32.7
3,2020,WV,590854.0,1205328.0,1796182.0,0.0,67.1,32.9
4,2019,WV,597473.0,1205093.0,1802566.0,0.0,66.9,33.1


## Arrests


In [13]:
usecols = [
    "ori",
    "ori9",
    "state_abb",
    "year",
    "number_of_months_missing",
    "agency_type",
    "actual_all_crimes",
]

In [14]:
arrests = (
    pyreadr
    .read_r(
        # jacob kaplan's concatenated files
        f"../data/offenses_known_yearly_1960_2022.rds",
    )
    [None]
    [usecols]
)

# lookup for ori to give arrests, year and type
# arrests[["ori", "year", "agency_type", "actual_all_crimes"]].to_csv("../outputs/arrests_ori_lookup.csv", index=False)
arrests.head(1)

Unnamed: 0,ori,ori9,state_abb,year,number_of_months_missing,agency_type,actual_all_crimes
0,AK00101,AK0010100,AK,2022.0,0.0,local police department,15517.0


In [15]:
type_and_year = (
    arrests
    .pivot_table(
        index = "year",
        columns = "agency_type",
        values = "actual_all_crimes",
        aggfunc = "sum"
    )
    .reset_index()
    .rename_axis(None, axis=1)
    # .to_csv("../outputs/arrests_lookup_2022.csv", index=False)
)

# annual arrests by type
# type_and_year.to_csv("../outputs/annual_arrests_lookup.csv", index=False)
type_and_year.head()

Unnamed: 0,year,constable/marshal,federal,local police department,sheriffs office,special jurisdiction,state law enforcement agency
0,1960.0,419.0,,2421538.0,399575.0,2028.0,45711.0
1,1961.0,496.0,,2694533.0,439181.0,2760.0,74682.0
2,1962.0,483.0,,2683280.0,394278.0,2916.0,38579.0
3,1963.0,595.0,4.0,3194299.0,485087.0,3407.0,31055.0
4,1964.0,650.0,0.0,3591606.0,498924.0,3880.0,77492.0


In [16]:
# codebook https://www.icpsr.umich.edu/web/NACJD/studies/38799/variables
arrests_by_state = (
    arrests
    .pivot_table(
        index=["year", "state_abb"],
        columns="agency_type",
        values="actual_all_crimes",
        aggfunc="sum"
    )
    .reset_index()
    .assign(
        other = lambda x: x["federal"] + x["constable/marshal"] + x["special jurisdiction"] + x["state law enforcement agency"],
        year = lambda x: x.year.astype(int).astype(str),
    )
    .drop(columns=["federal", "constable/marshal", "special jurisdiction", "state law enforcement agency"])
    .fillna(0)
    .rename_axis(None, axis=1)
)

# annual arrests by state and type
arrests_by_state.to_csv("../outputs/arrests_by_state.csv", index = False)
arrests_by_state.head()

Unnamed: 0,year,state_abb,local police department,sheriffs office,other
0,1960,AK,2400.0,0.0,0.0
1,1960,AL,27850.0,4549.0,0.0
2,1960,AR,9241.0,3049.0,0.0
3,1960,AZ,32953.0,5449.0,0.0
4,1960,CA,410865.0,123944.0,0.0


In [17]:
# codebook https://www.icpsr.umich.edu/web/NACJD/studies/38799/variables
arrests_by_year = (
    arrests_by_state
    .drop(columns="state_abb")
    .groupby("year")
    .sum()
)

# annual arrests by type of agency
arrests_by_year.to_csv("../outputs/arrests_by_year.csv")
arrests_by_year.head()

Unnamed: 0_level_0,local police department,sheriffs office,other
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1960,2421538.0,399575.0,0.0
1961,2694533.0,439181.0,0.0
1962,2683280.0,394278.0,0.0
1963,3194299.0,485087.0,0.0
1964,3591606.0,498924.0,0.0


In [18]:
# # get a subset of medium sized agencies
# medium_agencies = (
#     leoka
#     .loc[lambda x: x.population > 10000]
#     .loc[lambda x: x.population < 100000]
#     .loc[lambda x: x.agency_type.isin(["sheriffs office", "local police department"])]
#     [["year", "ori", "population"]]
# )

# medium_agencies.head()

# # population for sheriffs and local police departments with populations between 10,000 and 100,000
# medium_agencies.to_csv("../outputs/medium_agencies.csv", index=False)

---

---

---