In [15]:
## Public Records Request Notes
## First PRR received on 3/29/2022 for data that spans 2018 - 3/29/2022
## Second PRR received on 11/22/2022 for data that spans 3/29/2022 - 11/22/2022

In [16]:
import pandas as pd
import re
from lib import standardize_item_numbers

In [17]:
def clean():
    """Load the police-report and RTCC extracts and join them on item number.

    Both CSVs are run through ``standardize_item_numbers`` for the
    ``item_number`` column before the join (presumably to put the keys in a
    common format -- the helper lives in ``lib``; confirm there). Every RTCC
    row is tagged with ``rtcc_footage_requested = "yes"``.

    Returns:
        The result of merging the RTCC frame (left) with the police-report
        frame (right) on ``item_number``.
    """
    reports = pd.read_csv("../data/police_reports/electronic_police_report_2018_2022.csv")
    reports = standardize_item_numbers(reports, ["item_number"])

    rtcc = pd.read_csv("../data/real_time_crime_center/rtcc.csv")
    rtcc = standardize_item_numbers(rtcc, ["item_number"])
    rtcc["rtcc_footage_requested"] = "yes"

    # No `how=` is given, so this is pandas' default inner join: only item
    # numbers present in both files survive.
    return rtcc.merge(reports, on="item_number")

In [18]:
def drop_rows_missing_offender_race(df):
    """Return the rows of ``df`` that carry a usable ``offender_race``.

    The race label is normalised (missing -> "", lower-cased, stripped, the
    text "unknown" removed) and rows whose label ends up empty are dropped.
    Other categories such as "hispanic", "asian" and "amer. ind." are kept.

    The caller's frame is not modified; the normalised column exists only in
    the returned frame.

    Args:
        df: DataFrame with an ``offender_race`` column.

    Returns:
        A new DataFrame with the normalised ``offender_race`` column and the
        original index labels of the surviving rows.
    """
    race = (
        df.offender_race.fillna("")
        .str.lower()
        .str.strip()
        .str.replace("unknown", "", regex=False)
    )
    # assign() works on a copy, so the input frame (often a filtered slice)
    # is never written to.
    cleaned = df.assign(offender_race=race)
    return cleaned[cleaned.offender_race != ""]

def drop_rows_missing_victim_race(df):
    """Return the rows of ``df`` that carry a usable ``victim_race``.

    The race label is normalised (missing -> "", lower-cased, stripped) and
    the texts "unknown" and "native hawaiian or other pacific islander" are
    blanked; rows whose label ends up empty are dropped. Other categories
    such as "hispanic", "asian" and "amer. ind." are kept.

    The caller's frame is not modified; the normalised column exists only in
    the returned frame.

    Args:
        df: DataFrame with a ``victim_race`` column.

    Returns:
        A new DataFrame with the normalised ``victim_race`` column and the
        original index labels of the surviving rows.
    """
    race = (
        df.victim_race.fillna("")
        .str.lower()
        .str.strip()
        .str.replace("unknown", "", regex=False)
        .str.replace("native hawaiian or other pacific islander", "", regex=False)
    )
    # assign() works on a copy, so the input frame is never written to.
    cleaned = df.assign(victim_race=race)
    return cleaned[cleaned.victim_race != ""]


def filter_offender_gender(df):
    """Return the rows of ``df`` that carry a usable ``offender_gender``.

    The gender label is normalised (missing -> "", lower-cased, stripped, the
    text "unknown" removed) and rows whose label ends up empty are dropped.

    Missing values are filled *before* the ``.str`` accessor is used so that
    a column containing only NaN (float dtype) yields an empty result instead
    of raising. The caller's frame is not modified.

    Args:
        df: DataFrame with an ``offender_gender`` column.

    Returns:
        A new DataFrame with the normalised ``offender_gender`` column and
        the original index labels of the surviving rows.
    """
    gender = (
        df.offender_gender.fillna("")
        .str.lower()
        .str.strip()
        .str.replace("unknown", "", regex=False)
    )
    cleaned = df.assign(offender_gender=gender)
    return cleaned[cleaned.offender_gender != ""]



def filter_victim_gender(df):
    """Return the rows of ``df`` that carry a usable ``victim_gender``.

    The gender label is normalised (missing -> "", lower-cased, stripped, the
    text "unknown" removed) and rows whose label ends up empty are dropped.

    The label is read from ``victim_gender`` itself; ``offender_gender`` is
    neither consulted nor changed. The caller's frame is not modified.

    Args:
        df: DataFrame with a ``victim_gender`` column.

    Returns:
        A new DataFrame with the normalised ``victim_gender`` column and the
        original index labels of the surviving rows.
    """
    gender = (
        df.victim_gender.fillna("")
        .str.lower()
        .str.strip()
        .str.replace("unknown", "", regex=False)
    )
    cleaned = df.assign(victim_gender=gender)
    return cleaned[cleaned.victim_gender != ""]


def filter_arrested(df):
    """Return only the rows of ``df`` whose offender status is "arrested".

    ``offenderstatus`` is normalised (missing -> "", lower-cased, stripped)
    and compared against the exact text "arrested". Missing values are
    filled before the ``.str`` accessor is used so an all-NaN column yields
    an empty result instead of raising. The caller's frame is not modified.

    Args:
        df: DataFrame with an ``offenderstatus`` column.

    Returns:
        A new DataFrame with the normalised ``offenderstatus`` column and the
        original index labels of the surviving rows.
    """
    status = df.offenderstatus.fillna("").str.lower().str.strip()
    cleaned = df.assign(offenderstatus=status)
    # Matching "arrested" already excludes blank statuses, so no separate
    # empty-string filter is needed.
    return cleaned[cleaned.offenderstatus == "arrested"]


def drop_rows_missing_data(df):
    """Keep only the rows of ``df`` whose ``year`` is populated.

    A row counts as missing when ``year`` is null or the empty string.
    """
    year_filled = df.year.fillna("")
    has_year = year_filled != ""
    return df[has_year]
  
  
def drop_rows_missing_charge_desc(df):
    """Return the rows of ``df`` that have a non-empty ``charge_description``.

    The description is normalised (missing -> "", lower-cased, stripped) and
    rows whose description ends up empty are dropped. Missing values are
    filled before the ``.str`` accessor is used so an all-NaN column yields
    an empty result instead of raising. The caller's frame is not modified.

    Args:
        df: DataFrame with a ``charge_description`` column.

    Returns:
        A new DataFrame with the normalised ``charge_description`` column and
        the original index labels of the surviving rows.
    """
    description = df.charge_description.fillna("").str.lower().str.strip()
    cleaned = df.assign(charge_description=description)
    return cleaned[cleaned.charge_description != ""]

In [19]:
# Build the merged RTCC / police-report frame and keep the rows flagged as
# having RTCC footage requested.
# NOTE(review): clean() tags every RTCC row "yes" before merging, so this
# filter looks like a no-op -- confirm whether it is still needed.
df = clean()
df = df.loc[df["rtcc_footage_requested"] == "yes"]
df.shape

  dfa = pd.read_csv("../data/police_reports/electronic_police_report_2018_2022.csv")


(22742, 28)

In [20]:
# Inspect the merged schema before any row filtering.
df.columns

Index(['item_number', 'Signal Code', 'rtcc_footage_requested', 'Unnamed: 0',
       'district', 'location', 'disposition', 'signal_type',
       'signal_description', 'occurred_date_time', 'charge_code',
       'charge_description', 'offender_race', 'offender_gender',
       'offender_age', 'offender_number', 'person_type', 'victim_race',
       'victim_gender', 'victim_age', 'victim_number', 'victim_fatal_status',
       'hate_crime', 'report_type', 'year', 'offenderid', 'offenderstatus',
       'persontype'],
      dtype='object')

In [21]:
# Drop rows without a usable offender race, then collapse rows that share the
# same offender race / gender / age within an item number.
df = (
    df.pipe(drop_rows_missing_offender_race)
    .drop_duplicates(
        subset=["offender_race", "offender_gender", "offender_age", "item_number"]
    )
)

In [22]:
# Re-check the schema after the race filter and de-duplication.
df.columns

Index(['item_number', 'Signal Code', 'rtcc_footage_requested', 'Unnamed: 0',
       'district', 'location', 'disposition', 'signal_type',
       'signal_description', 'occurred_date_time', 'charge_code',
       'charge_description', 'offender_race', 'offender_gender',
       'offender_age', 'offender_number', 'person_type', 'victim_race',
       'victim_gender', 'victim_age', 'victim_number', 'victim_fatal_status',
       'hate_crime', 'report_type', 'year', 'offenderid', 'offenderstatus',
       'persontype'],
      dtype='object')

In [23]:
# Row/column count after the race filter and de-duplication.
df.shape

(4765, 28)

In [24]:
# Render ages as text and blank out the "nan" that astype(str) produces for
# missing values, then drop rows with no age.
# The column is replaced via assign() rather than `df.loc[:, col] = ...`:
# writing strings into the existing column through .loc is an
# incompatible-dtype setitem when that column is numeric (it appears to be
# float here -- ages render as "21.0"), which pandas has deprecated since 2.1.
df = df.assign(
    offender_age=df.offender_age.astype(str).str.replace("nan", "", regex=False)
)
# fillna("") also catches values that are still null after the conversion.
df = df[df.offender_age.fillna("") != ""]
df.shape

(2787, 28)

In [25]:
# Distinct age values; these are strings at this point because the previous
# cell converted the column with astype(str).
df.offender_age.unique()

array(['21.0', '23.0', '19.0', '22.0', '25.0', '43.0', '16.0', '36.0',
       '30.0', '41.0', '33.0', '40.0', '24.0', '17.0', '28.0', '35.0',
       '31.0', '20.0', '58.0', '18.0', '32.0', '29.0', '44.0', '15.0',
       '39.0', '38.0', '27.0', '26.0', '34.0', '48.0', '45.0', '50.0',
       '63.0', '42.0', '47.0', '52.0', '55.0', '53.0', '37.0', '60.0',
       '46.0', '56.0', '14.0', '51.0', '13.0', '49.0', '57.0', '61.0',
       '69.0', '66.0', '67.0', '68.0', '59.0', '54.0', '62.0', '64.0',
       '70.0', '11.0', '12.0', '65.0', '2.0', '83.0', '10.0', '71.0',
       '75.0'], dtype=object)

In [26]:
# Number of rows with a recorded offender age (the denominator used below).
df.offender_age.count()

2787

In [27]:
# Offenders aged 10 through 18 inclusive. Ages are text such as "10.0", so
# the membership list is built in the same format.
# NOTE(review): the band includes 18-year-olds -- confirm that matches the
# intended definition of "youth".
youth = df[df.offender_age.isin([f"{age}.0" for age in range(10, 19)])]
youth.offender_age.count()

443

In [28]:
# Youth share of all offenders with a recorded age, as a percentage.
youth.offender_age.count() / df.offender_age.count() * 100

15.895227843559384

In [29]:
# Race breakdown of the youth subset, as proportions of that subset
# (normalize=True) rather than raw counts.
youth.offender_race.value_counts(normalize=True)

black       0.979684
white       0.015801
hispanic    0.004515
Name: offender_race, dtype: float64