In [108]:
import os
import pandas as pd
import numpy as np
import math

In [109]:
data_dir = os.getcwd().replace("/notebooks", "/data/interim")
interim_csv_name = "step1_combined_police_data.csv"
interim_csv_path = os.path.join(data_dir, interim_csv_name)

interim_df = pd.read_csv(interim_csv_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [110]:
interim_df.columns

Index(['dept', 'city', 'state', 'address_or_intersection', 'charge_code',
       'cross_street', 'date', 'encounter_type', 'incident_id',
       'incident_reason', 'latitude', 'longitude', 'number_of_suspects',
       'number_officers_involved', 'officer_age', 'officer_death',
       'officer_ethnicity', 'officer_gender', 'officer_id', 'officer_injured',
       'officer_injuries_detail', 'officer_race', 'officer_yrs_experience',
       'reason_for_force', 'reason_for_force_detailed', 'search_conducted',
       'search_conducted_person', 'search_conducted_vehicle', 'search_reason',
       'subject_age', 'subject_death', 'subject_description', 'subject_gender',
       'subject_id', 'subject_injured', 'subject_injuries_detail',
       'subject_priors', 'subject_race', 'subject_weapon_possessed',
       'suspected_crime_type', 'suspected_crime_type_detailed',
       'type_of_force_used', 'weapon_or_tool_used'],
      dtype='object')

In [158]:
interim_df["subject_injured"].value_counts()

-1.0        8641
0.0         8502
FALSE       6316
TRUE        3745
0            602
-1           397
Yes           43
No             8
Yes, Yes       2
No, No         1
Name: subject_injured, dtype: int64

In [144]:
def standardize_categorical_column(df, col_name):
    
    col_val_dict = {
        "reason_for_force" : {
            "Tensed" : ["Tensed", "TENSED", "tensed"],
            "Necessary to effect arrest/detention" : ["NECESSARY TO EFFECT ARREST / DETENTION", "Barricaded Person", "Arrest", "Detention/Frisk"],
            "Commission of Crime" : ["Commission of Crime", "COMMISSION OF CRIME", "Property Destruction", "commission of crime"],
            "Commission of Crime - Assault of Officer" : ["Assaulted Officer", "Assaulting Officer(s)"],
            "Commission of Crime - Assault of Police Animal" : ["Assaulting Police Horse", "Assaulting Police K9"],
            "Commission of Crime - Assault of Person(s)" : ["Assaulting Citizen(s)", "Assault to Other Person"],
            "Fleeing" : ["Fled on Foot", "Fleeing"],
            "Fleeing in Vehicle / Vehicle Pursuit" : ["NECESSARY TO IMMOBILIZE A VEHICLE PURSUIT (PIT)", "Fled in Vehicle"],
            "Non-compliance" : ["Verbal Non-Compliance", "Non-Compliant", "verbal non-compliance"],
            "Aggressive/Combative Suspect" : ["Active Aggression", "Combative Suspect"],
            "Restraint" : ["TO RESTRAIN FOR SUBJECTS SAFETY", "NECESSARY TO DEFEND ANOTHER", "IN CUSTODY, MAINTAINING CONTROL"],
            "Self-Defense or Defense of Citizen" : ["NECESSARY TO DEFEND REPORTING OFFICER", "Danger to self or others"],
            "Weapon Display / Prevent Violence" : ["Weapon Display", "TO PREVENT A VIOLENT FELONY"],
            "Other" : ["Aggressive Animal", "OTHER (DOCUMENT IN SUPPLEMENT)", "Other", "Crowd Disbursement", "Unspecified"]
        },
        "officer_race" : {
            "Black" : ["B(Black)", "Black", "B"],
            "White" :["W(White)", "White", "W","W, W", "W, W, W", "W, W, W, W, W", "W,W"],
            "Hispanic" : ["H(Hispanic)", "Hispanic"],
            "Asian" : ["A(Asian or Pacific Islander)", "Asian"],
            "Middle Eastern" : ["M(Middle Eastern or East Indian)"],
            "American Indian" : ["I(American Indian or Alaskan Native)", "American Ind"],
            "Other" : ["Other", "O"],
            "Multiple Races" : ["W, B", "W, B, W", "B, W, W", "O,W", "W,W, W, W, W, B, W, W, B, W, W", "W, O, W", "B, W, W, W", "W, B, B, W"],
            np.nan : ["NO DATA ENTERED", "UNKNOWN"]
        }, 
        "incident_reason" : {
            "Crowd or Emergency Response" : ["ERG Incident", "Call for Cover", "Crowd Control"],
            "Canine Request" : ["Request For PSD"],
            "Viewed Incident" : ["Crime in Progress", "VIEWED OFFENSE", "CRIMINAL VIOLATION"],
            "Traffic or Pedestrian Stop" : ["TRAFFIC STOP", "MOTOR VEHICLE VIOLATION", "Pedestrian Stop", "Traffic Stop"],
            "Off Duty" : ["Off-Duty Incident", "Off-Duty Employment"],
            "Response to Call" : ["RADIO CALL", "CITIZEN ASSIST", "Call For Service", "Assisting a Citizen", "DISPATCHED CALL", "Service Call"],
            "Assisting other law enforcement" : ["SWAT Incident", "Assisting Other Agency"],
            "Investigation" : ["INVESTIGATIVE", "Investigation", "Suspicious Activity"],
            "Other" : ["OTHER (SPECIFY)", "Other ( In Narrative)", "OTHER", "Accidental Discharge"],
            "Warrant" : ["Warrant Service", "Warrant Execution", "WARRANT SERVICE", "Search Warrant"],
            "Arrest" : ["Arrest", "Self-Initiated Activity", "TACTICAL OPERATION"],
            np.nan : ["Not Applicable", np.nan, "-"]
        },
        # https://static1.squarespace.com/static/5086f19ce4b0ad16ff15598d/t/56a2569205caa7ee9f29e6a2/1453479570208/rule323.pdf
        "encounter_type": {
            "Frisk" : ["F"],
            "Frisk, Stop" : ["FS"],
            "Intelligence" : ["I"],
            "Intelligence, Observation" : ["IO"],
            "Intelligence, Observation, Frisk" : ["IOF"],
            "Intelligence, Observation, Frisk, Stop" : ["IOFS"],
            "Observation" : ["O"],
            "Observation, Frisk" : ["OF"],
            "Observation, Stop" : ["OS"],
            "Observation, Frisk, Stop": ["OFS"],
            "Intelligence, Frisk" : ["IF"],
            "Intelligence, Frisk, Stop" : ["IFS"],
            "Intelligence, Stop" : ["IS"],
            "Intelligence, Observation, Stop" : ["IOS"],
            "Probable Cause" : ["P"],
            "Probable Cause, Frisk" : ["PF"],
            "Probable Cause, Intelligence" : ["PI"],
            "Probable Cause, Observation" : ["PO"],
            "Probable Cause, Intelligence, Frisk" : ["PIF"],
            "Probable Cause, Intelligence, Observation" : ["PIO"],
            "Probable Cause, Intelligence, Stop" : ["PIS"],
            "Probable Cause, Observation, Frisk" : ["POF"],
            "Probable Cause, Intelligence, Observation, Frisk" : ["PIOF"],
            "Probable Cause, Intelligence, Observation, Stop" : ["PIOS"],
            "Probable Cause, Intelligence, Observation, Frisk, Stop" : ["PIOFS"],    
            "Stop" : ["S"]
        }
    }
    
    #if col_name == "officer_gender":
        
    col_vals = col_val_dict[col_name]
    for k, v in col_vals.items():
        df.loc[df[col_name].isin(v), col_name] = k

    return df  

In [145]:
def standardize_numerical_column(df, col_name):

    def convert_to_list(row):
        if isinstance(row, str):
            row_list = row.split(",")
            return_val = [float(num) for num in row_list]
        elif math.isnan(row):
            return_val = row
        elif isinstance(row, float) or isinstance(row, int):
            return_val = [row]

        return return_val
        
    def get_avg_of_list(row):
        if isinstance(row, list):
            return_val = np.mean(row)
        elif math.isnan(row):
            return_val = row
        return return_val
    
    col_as_list = df[col_name].apply(convert_to_list)
    avg_series = col_as_list.apply(get_avg_of_list)
    
    df[col_name] = avg_series
    
    return df

def standardize_boolean_column(df, col_name):
    def convert_to_list(row):
        if isinstance(row, str):
            row_list = row.split(",")
            return_val = [word for word in row_list]
        elif math.isnan(row):
            return_val = row
        elif isinstance(row, float) or isinstance(row, int):
            return_val = [row]

        return return_val
    
    def get_bool(row):
        if isinstance(row, list):
            return_val = True if 'YES' in [string.upper() for string in row] else False
        elif math.isnan(row):
            return_val = row
        return return_val

    col_as_list = df[col_name].apply(convert_to_list)
    bool_series = col_as_list.apply(get_bool)
    df[col_name] = bool_series
    
    return df

In [146]:
print(interim_df["officer_race"].head())

0    Black
1    White
2    Black
3    Black
4    Black
Name: officer_race, dtype: object


In [147]:
# new_df = standardize_categorical_column(interim_df, "incident_reason")
# new_df = standardize_categorical_column(interim_df, "encounter_type")
# new_df = standardize_numerical_column(interim_df, "officer_yrs_experience")
# new_df = standardize_numerical_column(interim_df, "officer_age")
# new_df = standardize_boolean_column(interim_df, "officer_death")
# new_df = standardize_boolean_column(interim_df, "officer_injured")
new_df = standardize_categorical_column(interim_df, "reason_for_force")

In [148]:
new_df["reason_for_force"].unique()

array([nan, 'Fleeing', 'Aggressive/Combative Suspect', 'Resisting Arrest',
       'Non-compliance', 'Commission of Crime - Assault of Person(s)',
       'Commission of Crime - Assault of Officer', 'Commission of Crime',
       'Tensed', 'Fleeing in Vehicle / Vehicle Pursuit', 'Other',
       'Commission of Crime - Assault of Police Animal',
       'Necessary to effect arrest/detention', 'Restraint',
       'Self-Defense or Defense of Citizen',
       'Weapon Display / Prevent Violence'], dtype=object)

In [127]:
new_df["officer_race"].value_counts()

Black              90642
White              44531
Hispanic           20750
Asian               1379
Middle Eastern       462
Other                139
American Indian       77
Multiple Races         9
Name: officer_race, dtype: int64