# Analysis of Chicago Police Homicide Clearance Data


In [1]:
import re
import pandas as pd
from src import schemas


## Data import and cleaning


In [2]:
rename_columns = lambda df: df.rename(
    columns=lambda col: re.sub(
        r"\s+", "_", re.sub(r"[^A-z]+", " ", col.strip()).lower().strip()
    )
)

status = (
    pd.read_excel(
        "input/18731-P744333_Homicide_status_2010-YTD.xlsx",
        sheet_name=1,
        parse_dates=["Injury Date", "Death Date", "CompStat Date", "Date Cleared"],
    )
    .pipe(rename_columns)
    .replace(
        {
            "cleared_i": {"Y": True, "N": False},
            "victim_sex": {"M": "male", "F": "female", "X": "unknown/other"},
        }
    )
    .rename(columns={"homicide": "homicide_no", "rd": "case_no"})
    .assign(
        incident_year=lambda df: df.compstat_date.dt.year,
        clearance_year=lambda df: df.date_cleared.dt.year,
    )
    .drop(["victim_sex", "victim_age"], axis=1)
    .drop_duplicates()
    .pipe(schemas.status_schema)
)

victims = (
    pd.read_csv(
        "input/Violence_Reduction_-_Victims_of_Homicides_and_Non-Fatal_Shootings.csv",
        parse_dates=["DATE"],
    )
    .pipe(rename_columns)
    .query("victimization_primary == 'HOMICIDE'")
    .rename(columns={"race": "victim_race", "case_number": "case_no"})[
        ["case_no", "unique_id", "victim_race"]
    ]
    .replace(
        {
            "victim_race": {
                "BLK": "black",
                "WHI": "white",
                "WWH": "hispanic",
                "WBH": "hispanic",
                "API": "api",
                "I": "indian",
                "UNKNOWN": "unknown",
            }
        }
    )
    .pipe(schemas.victims_schema)
)


## Analysis


### Overall Clearance Rates

The number of homicides nearly doubled over about a decade. In 2010, there were 520, and in 2020, there were 842.


In [3]:
total_incidents = (
    status.groupby("incident_year")
    .homicide_no.nunique()
    .to_frame("total_incidents")
    .query("incident_year >= 2010")
)
total_incidents


Unnamed: 0_level_0,total_incidents
incident_year,Unnamed: 1_level_1
2010,441
2011,446
2012,510
2013,425
2014,424
2015,493
2016,780
2017,660
2018,579
2019,500


Chicago PD's overall clearance rate appears to be relatively good for the city with the most homicides of any in the country -- 46% in 2020, which is just a few points below the national average.


In [4]:
overall_clearance = (
    total_incidents.join(
        status.query("cleared_i == True")
        .groupby("clearance_year")
        .case_no.nunique()
        .to_frame("total_clearances")
    )
    .assign(clearance_rate=lambda df: df.total_clearances / df.total_incidents)
    .query("incident_year >= 2010")
)

overall_clearance


Unnamed: 0_level_0,total_incidents,total_clearances,clearance_rate
incident_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010.0,441,206,0.46712
2011.0,446,201,0.450673
2012.0,510,178,0.34902
2013.0,425,224,0.527059
2014.0,424,210,0.495283
2015.0,493,210,0.425963
2016.0,780,207,0.265385
2017.0,660,212,0.321212
2018.0,579,247,0.426598
2019.0,500,253,0.506


But when you remove exceptional clearances, the picture is much worse. In 2020, only about a quarter of homicides were cleared by arrest

In [5]:
arrest = total_incidents.join(
    status.query("(cleared_i == True) & (cleared_exceptionally_by.isna())")
    .groupby("clearance_year")
    .case_no.nunique()
    .to_frame("total_clearances")
).assign(clearance_rate=lambda df: df.total_clearances / df.total_incidents)

arrest


Unnamed: 0_level_0,total_incidents,total_clearances,clearance_rate
incident_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010.0,441,133,0.301587
2011.0,446,155,0.347534
2012.0,510,139,0.272549
2013.0,425,161,0.378824
2014.0,424,171,0.403302
2015.0,493,143,0.290061
2016.0,780,170,0.217949
2017.0,660,134,0.20303
2018.0,579,140,0.241796
2019.0,500,109,0.218


The share of clearances that were made by arrest is also decreasing. In 2013, more than 80% of clearances were by arrest, while in 2020 it was less than 50%.

In [6]:
def add_multiindex_level(df, colname):
    df.columns = pd.MultiIndex.from_tuples([(colname, c) for c in df.columns])
    return df


merge_cols = ["incident_year", "total_incidents"]

compare_df = (
    overall_clearance.reset_index()
    .set_index(merge_cols)
    .pipe(add_multiindex_level, "all clearances")
    .merge(
        arrest.reset_index()
        .set_index(merge_cols)
        .pipe(add_multiindex_level, "arrest only"),
        on=merge_cols,
    )
    .assign(
        pct_clearances_by_arrest=lambda df: df[("arrest only", "total_clearances")]
        / df[("all clearances", "total_clearances")]
    )
)

compare_df


Unnamed: 0_level_0,Unnamed: 1_level_0,all clearances,all clearances,arrest only,arrest only,pct_clearances_by_arrest
Unnamed: 0_level_1,Unnamed: 1_level_1,total_clearances,clearance_rate,total_clearances,clearance_rate,Unnamed: 6_level_1
incident_year,total_incidents,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2010.0,441,206,0.46712,133,0.301587,0.645631
2011.0,446,201,0.450673,155,0.347534,0.771144
2012.0,510,178,0.34902,139,0.272549,0.780899
2013.0,425,224,0.527059,161,0.378824,0.71875
2014.0,424,210,0.495283,171,0.403302,0.814286
2015.0,493,210,0.425963,143,0.290061,0.680952
2016.0,780,207,0.265385,170,0.217949,0.821256
2017.0,660,212,0.321212,134,0.20303,0.632075
2018.0,579,247,0.426598,140,0.241796,0.566802
2019.0,500,253,0.506,109,0.218,0.43083


At the same time, the share of clearances marked "bar to prosecute" is increasing rapidly. In 2016, it was less than 10 percent. In 2021, it was 34.

In [7]:
# exceptional clearance comparison
ec_compare = (
    status.query("cleared_i == True")
    .assign(
        cleared_exceptionally_by=(
            lambda df: df.cleared_exceptionally_by.fillna("arrest").str.lower()
        )
    )
    .rename(columns={"cleared_exceptionally_by": "clearance_description"})
    .pipe(
        lambda df: pd.crosstab(
            index=df.clearance_year,
            columns=df.clearance_description,
            values=df.case_no,
            aggfunc="nunique",
        )
    )
    .pipe(lambda df: df.div(df.sum(axis=1), axis=0))
)

ec_compare


clearance_description,arrest,bar to prosecute,death of offender
clearance_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010.0,0.645631,0.252427,0.101942
2011.0,0.771144,0.174129,0.054726
2012.0,0.780899,0.123596,0.095506
2013.0,0.71875,0.21875,0.0625
2014.0,0.814286,0.12381,0.061905
2015.0,0.677725,0.175355,0.146919
2016.0,0.821256,0.10628,0.072464
2017.0,0.632075,0.221698,0.146226
2018.0,0.566802,0.323887,0.109312
2019.0,0.429134,0.440945,0.129921


### Clearance Rates by Race

In [8]:
# dataframe containing one row per incident and the race of the victims.
# if there are multiple victims of mixed races, consider those separately
victim_race_df = (
    status.assign(
        unique_id=lambda df: df.assign(
            victim_index=lambda df1: df1.groupby("case_no").cumcount() + 1
        ).apply(lambda row: f"HOM-{row.case_no}-#{row.victim_index}", axis=1)
    )
    .merge(victims, how="left", on="unique_id")
    .groupby(
        [
            "unique_id",
            "incident_year",
            "injury_date",
            "death_date",
            "compstat_date",
            "cleared_i",
            "date_cleared",
            "clearance_year",
            "cleared_exceptionally_by",
        ],
        dropna=False,
    )
    .victim_race.apply(lambda vals: list(set(vals)))
    .to_frame("victim_race")
    .assign(
        victim_race=lambda df: df.victim_race.apply(
            lambda races: "no data"
            if pd.isna(races)
            else races[0]
            if len(races) == 1
            else "multiple"
        )
    )
    .reset_index()
)
victim_race_df


Unnamed: 0,unique_id,incident_year,injury_date,death_date,compstat_date,cleared_i,date_cleared,clearance_year,cleared_exceptionally_by,victim_race
0,HOM-A000067-#1,1979,1979-01-01 00:10:00,1979-06-01 15:35:00,1979-06-01 15:35:00,True,2016-09-21,2016.0,BAR TO PROSECUTE,no data
1,HOM-A020847-#1,1979,1979-01-20 23:15:00,1979-01-20 23:15:00,1979-01-20 23:15:00,True,2021-08-18,2021.0,BAR TO PROSECUTE,no data
2,HOM-A032861-#1,1962,1962-02-01 22:25:00,1962-02-01 23:52:00,1962-02-01 23:52:00,True,2013-06-26,2013.0,BAR TO PROSECUTE,no data
3,HOM-A075689-#1,1979,1979-03-03 23:45:00,1979-03-04 16:05:00,1979-03-04 16:05:00,True,2011-09-26,2011.0,DEATH OF OFFENDER,no data
4,HOM-A165086-#1,1996,1996-03-06 23:05:00,1996-03-06 23:05:00,1996-03-06 23:05:00,True,2013-06-12,2013.0,DEATH OF OFFENDER,black
...,...,...,...,...,...,...,...,...,...,...
7495,HOM-Z528496-#1,1995,1995-11-05 15:00:00,1995-11-05 18:00:00,1995-11-05 18:00:00,True,2016-09-21,2016.0,BAR TO PROSECUTE,black
7496,HOM-Z532086-#1,1995,1995-11-10 05:24:00,1995-11-10 05:24:00,1995-11-10 05:24:00,True,2021-09-15,2021.0,BAR TO PROSECUTE,hispanic
7497,HOM-Z532086-#2,1995,1995-11-10 05:24:00,1995-11-10 05:24:00,1995-11-10 05:24:00,True,2021-09-15,2021.0,BAR TO PROSECUTE,hispanic
7498,HOM-Z569626-#1,1995,1995-11-13 22:55:00,1995-11-13 23:20:00,1995-11-13 23:20:00,True,2010-08-31,2010.0,,black


Like we see with the national trends, the clearance rate for Black victims of homicide is much lower than it is for white victims

In [9]:
race_comparison = (
    victim_race_df.assign(
        clearance_type=lambda df: df.apply(
            lambda row: "not_cleared"
            if row.cleared_i == False
            else "arrest"
            if pd.isna(row.cleared_exceptionally_by)
            else row.cleared_exceptionally_by.lower(),
            axis=1,
        )
    )
    .pipe(
        lambda df: pd.crosstab(
            index=df.victim_race,
            columns=df.clearance_type,
            values=df.unique_id,
            aggfunc="nunique",
        )
    )
    .assign(
        all_cleared=lambda df: df[[c for c in df.columns if c != "not_cleared"]].sum(
            axis=1
        )
    )
    .pipe(
        lambda df: df.div(
            df[[c for c in df.columns if c != "all_cleared"]].sum(axis=1), axis=0
        )
    )
)

race_comparison


clearance_type,arrest,bar to prosecute,death of offender,not_cleared,all_cleared
victim_race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
api,0.578947,0.026316,0.105263,0.289474,0.710526
black,0.260775,0.097716,0.046673,0.594836,0.405164
hispanic,0.311808,0.119926,0.0369,0.531365,0.468635
indian,0.5,,,0.5,0.5
no data,0.20764,0.100881,0.062684,0.628795,0.371205
unknown,0.285714,0.071429,0.071429,0.571429,0.428571
white,0.464052,0.107843,0.071895,0.356209,0.643791


The clearance rate for Black victims has remained consistently low, never exceeding 55%. 

In [10]:
get_race_annual = lambda query_string: (
    pd.crosstab(
        index=victim_race_df.incident_year,
        columns=victim_race_df.victim_race,
        values=victim_race_df.unique_id,
        aggfunc="nunique",
    )
    .pipe(lambda df: df[df.index > 2010])
    .pipe(add_multiindex_level, "incidents")
    .join(
        victim_race_df.query(query_string)
        .pipe(
            lambda df: pd.crosstab(
                index=df.clearance_year,
                columns=df.victim_race,
                values=df.unique_id,
                aggfunc="nunique",
            )
        )
        .pipe(lambda df: df[df.index > 2010])
        .pipe(add_multiindex_level, "clearances")
    )
    .pipe(
        lambda df: df.join(
            df["clearances"]
            .div(df["incidents"])
            .pipe(add_multiindex_level, "clearance_rate")
        )
    )["clearance_rate"]
)

race_annual = get_race_annual("cleared_i == True")
race_annual


Unnamed: 0_level_0,api,black,hispanic,indian,no data,unknown,white
incident_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011.0,1.0,0.477509,0.544118,,0.462687,,0.666667
2012.0,0.4,0.381818,0.296296,,0.327869,,0.566667
2013.0,,0.532203,0.647887,,0.404255,,1.0
2014.0,0.25,0.525547,0.5,,0.421053,,0.518519
2015.0,4.0,0.434251,0.478261,,0.424658,0.333333,0.75
2016.0,1.0,0.259887,0.313043,,0.293478,0.333333,0.540541
2017.0,2.0,0.338983,0.34375,,0.444444,,0.473684
2018.0,0.75,0.391521,0.635135,,0.402985,,0.757576
2019.0,,0.477011,0.730159,,0.470588,,1.05
2020.0,0.666667,0.465863,0.448598,,0.385714,,0.653846


But it's lower by about the same amount as other races

In [11]:
race_nonexc_annual = get_race_annual(
    "cleared_i == True & cleared_exceptionally_by.isna()"
)
race_nonexc_annual.subtract(race_annual).mean()


api        -0.316667
black      -0.153813
hispanic   -0.172591
indian           NaN
no data    -0.187339
unknown     0.000000
white      -0.206396
dtype: float64

People of color also take longer for their cases to be closed

In [12]:
days_to_closure = (
    victim_race_df.assign(time_to_closure=lambda df: df.date_cleared - df.injury_date)
    .groupby("victim_race")
    .time_to_closure.mean()
    .dt.round("d")
    .to_frame("avg_days_to_closure")
    .query("index != 'no data'")
    .query("index != 'unknown'")
)
days_to_closure


Unnamed: 0_level_0,avg_days_to_closure
victim_race,Unnamed: 1_level_1
api,400 days
black,901 days
hispanic,798 days
indian,663 days
white,531 days


## Output

In [13]:
with pd.ExcelWriter("output/chicago_police_clearance_rate.xlsx") as writer:
    compare_df.to_excel(writer, sheet_name="Clearance rates")
    ec_compare.to_excel(writer, sheet_name="Share of clearances by type")
    race_comparison.to_excel(writer, sheet_name="By race")
    race_annual.to_excel(writer, sheet_name="By race - annual")
    days_to_closure.to_excel(writer, sheet_name="By race - time to close")
