In [21]:
## Public Records Request Notes
## First PRR received on 3/29/2022 for data that spans 2018 - 3/29/2022
## Second PPR received on 11/22/2022 for data that spans 3/29/2022 - 11/22/2022

In [22]:
import pandas as pd
import re

In [23]:
def standardize_item_no(df):
    df.loc[:, "item_number"] = df.item_number.str.lower().str.strip()
    return df

def clean():
    df1 = pd.read_csv("../data/electronic_police_report_2018.csv")
    df2 = pd.read_csv("../data/electronic_police_report_2019.csv")
    df3 = pd.read_csv("../data/electronic_police_report_2020.csv")
    df4 = pd.read_csv("../data/electronic_police_report_2021.csv")
    df5 = pd.read_csv("../data/electronic_police_report_2022.csv")

    df1["year"] = 2018
    df2["year"] = 2019
    df3["year"] = 2020
    df4["year"] = 2021
    df5["year"] = 2022

    dfs = [df1, df2, df3, df4, df5]
    dfa = pd.concat(dfs, join="outer")

    dfa = dfa.pipe(standardize_item_no)
    
    dfb = pd.read_csv("../data/rtcc.csv")
    dfb = dfb.pipe(standardize_item_no)
    dfb["rtcc_footage_requested"] = "yes"

    df = pd.merge(dfb, dfa, on="item_number")
    return dfb

In [24]:
def drop_rows_missing_offender_race(df):
    df.loc[:, "offender_race"] = df.offender_race.fillna("").str.lower().str.strip()\
    .str.replace("unknown", "", regex=False)\
    # .str.replace("hispanic", "", regex=False)\
    # .str.replace("asian", "", regex=False)\
    # .str.replace(r"amer\. ind\.", "", regex=True)
    return df[~((df.offender_race == ""))]

def drop_rows_missing_victim_race(df):
    df.loc[:, "victim_race"] = df.victim_race.fillna("").str.lower().str.strip()\
    .str.replace("unknown", "", regex=False)\
    .str.replace(r"native hawaiian or other pacific islander", "", regex=False)
    # .str.replace("hispanic", "", regex=False)\
    # .str.replace("asian", "", regex=False)\
    # .str.replace(r"amer\. ind\.", "", regex=True)
    return df[~((df.victim_race == ""))]


def filter_offender_gender(df):
    df.loc[:, "offender_gender"] = (
        df.offender_gender.str.lower().str.strip().fillna("").str.replace("unknown", "", regex=False)
    )
    return df[~((df.offender_gender == ""))]



def filter_victim_gender(df):
    df.loc[:, "victim_gender"] = (
        df.offender_gender.str.lower().str.strip().fillna("").str.replace("unknown", "", regex=False)
    )
    return df[~((df.victim_gender == ""))]


def filter_arrested(df):
    df.loc[:, "offenderstatus"] = df.offenderstatus.str.lower().str.strip().fillna("")
    df = df[df.offenderstatus.isin(["arrested"])]
    return df[~((df.offenderstatus == ""))]


def drop_rows_missing_data(df):
    return df[~((df.year.fillna("") == ""))]
  
  
def drop_rows_missing_charge_desc(df):
    df.loc[:, "charge_description"] = df.charge_description.str.lower().str.strip().fillna("")
    return df[~((df.charge_description == ""))]

In [25]:
df = clean()
df = df[df.rtcc_footage_requested.isin(["yes"])]
df.shape

  if await self.run_code(code, result, async_=asy):


(21615, 3)

In [26]:
df = df.pipe(drop_rows_missing_offender_race).drop_duplicates(subset=["offender_race", "offender_gender", "item_number"])

AttributeError: 'DataFrame' object has no attribute 'offender_race'

In [None]:
df.columns

Index(['item_number', 'Signal Code', 'rtcc_footage_requested', 'district',
       'location', 'disposition', 'signal_type', 'signal_description',
       'occurred_date_time', 'charge_code', 'charge_description',
       'offender_race', 'offender_gender', 'offender_age', 'offender_number',
       'person_type', 'victim_race', 'victim_gender', 'victim_age',
       'victim_number', 'victim_fatal_status', 'hate_crime', 'report_type',
       'year', 'offenderid', 'offenderstatus', 'persontype'],
      dtype='object')

In [None]:
df.shape

(8215, 27)

In [None]:
df.loc[:, "offender_age"] = df.offender_age.astype(str).str.replace("nan", "", regex=False)
df = df[~((df.offender_age.fillna("") ==""))]
df.shape

(2567, 27)

In [None]:
df.offender_age.unique()

array(['21.0', '23.0', '22.0', '25.0', '43.0', '16.0', '36.0', '30.0',
       '41.0', '19.0', '33.0', '40.0', '24.0', '17.0', '28.0', '35.0',
       '31.0', '20.0', '58.0', '18.0', '32.0', '44.0', '15.0', '39.0',
       '26.0', '38.0', '27.0', '34.0', '45.0', '50.0', '63.0', '56.0',
       '42.0', '29.0', '47.0', '52.0', '48.0', '55.0', '53.0', '37.0',
       '60.0', '46.0', '14.0', '51.0', '49.0', '57.0', '13.0', '61.0',
       '69.0', '66.0', '67.0', '68.0', '59.0', '54.0', '62.0', '64.0',
       '70.0', '11.0', '12.0', '65.0', '2.0', '83.0', '10.0', '71.0',
       '75.0'], dtype=object)

In [None]:
df.offender_age.count().sum()

2567

In [None]:
youth = df[df.offender_age.isin(["10.0", "11.0", "12.0", "13.0", "14.0", "15.0",  "16.0", "17.0", "18.0",])]
youth.offender_age.count().sum()

372

In [None]:
(youth.offender_age.count().sum()/df.offender_age.count().sum())*100 

14.49162446435528

In [None]:
youth.offender_race.value_counts(normalize=True)

BLACK       0.975806
WHITE       0.016129
HISPANIC    0.005376
UNKNOWN     0.002688
Name: offender_race, dtype: float64