In [1]:
#import relevant packages for analysis

from __future__ import print_function

import os
import sys

import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

# Relative path to the parent directory; the data paths in this notebook are built from it
PROJ_ROOT = os.pardir

In [2]:
# Load the watermark extension and print the author, date/time, Python/IPython versions
# and the installed numpy/pandas versions
%load_ext watermark
%watermark -a "Bryan Dickinson" -d -t -v -p numpy,pandas

Bryan Dickinson 2019-08-07 11:23:18 

CPython 3.7.3
IPython 7.7.0

numpy 1.16.4
pandas 0.25.0


# Step 1: Import the dataset, explore and ready the data for analysis

In [3]:
# Build the path to the raw CSV (<project root>/data/Terry_Stops.csv)
terry_data = os.path.join(PROJ_ROOT, 'data', 'Terry_Stops.csv')

# Load the raw file into a pandas DataFrame
df = pd.read_csv(terry_data)

In [4]:
# Preview the first two rows of the raw data
df.head(2)

Unnamed: 0,Subject Age Group,Subject ID,GO / SC Num,Terry Stop ID,Stop Resolution,Weapon Type,Officer ID,Officer YOB,Officer Gender,Officer Race,...,Reported Time,Initial Call Type,Final Call Type,Call Type,Officer Squad,Arrest Flag,Frisk Flag,Precinct,Sector,Beat
0,18 - 25,,20150000202641,11662,-,,1128,1971.0,M,Hispanic or Latino,...,18:23:00.0000000,ROBBERY - IP/JO (INCLUDES STRONG ARM),--ROBBERY - STRONG ARM,911,SOUTH PCT 1ST W - ROBERT,N,N,South,R,R3
1,46 - 55,,20150000224628,12260,-,,1690,1980.0,M,White,...,19:39:00.0000000,ASLT - IP/JO - PERSON SHOT OR SHOT AT,--HOMICIDE,911,EAST PCT 2ND W - BEATS,N,N,East,C,C3


I will not need all of these columns for my analysis. I will inspect the columns, select the ones to keep, and clean them.

In [5]:
# Number of rows and columns in the raw data
df.shape

(34521, 23)

In [6]:
# List the raw column names
df.columns

Index(['Subject Age Group', 'Subject ID', 'GO / SC Num', 'Terry Stop ID',
       'Stop Resolution', 'Weapon Type', 'Officer ID', 'Officer YOB',
       'Officer Gender', 'Officer Race', 'Subject Perceived Race',
       'Subject Perceived Gender', 'Reported Date', 'Reported Time',
       'Initial Call Type', 'Final Call Type', 'Call Type', 'Officer Squad',
       'Arrest Flag', 'Frisk Flag', 'Precinct', 'Sector', 'Beat'],
      dtype='object')

In [7]:
# Keep only the columns needed for the analysis; 'GO / SC Num', 'Terry Stop ID',
# 'Reported Time' and 'Final Call Type' are left out
keep_columns = [
    'Subject Age Group', 'Subject ID', 'Stop Resolution', 'Weapon Type',
    'Officer ID', 'Officer YOB', 'Officer Gender', 'Officer Race',
    'Subject Perceived Race', 'Subject Perceived Gender',
    'Reported Date', 'Initial Call Type', 'Call Type',
    'Officer Squad', 'Arrest Flag', 'Frisk Flag',
    'Precinct', 'Sector', 'Beat',
]
df = df[keep_columns]



In [8]:
# Rename the columns to snake_case (no spaces or capital letters).
# An explicit old -> new mapping is used instead of assigning a positional list to
# df.columns, so a new name can never land on the wrong column if the selection
# or its order changes.
# Note: 'Officer YOB' is named 'officer_age' here but still holds the year of birth
# until it is converted to an age further down in the notebook.
column_names = {
    'Subject Age Group': 'subject_age',
    'Subject ID': 'subject_id',
    'Stop Resolution': 'stop_resolution',
    'Weapon Type': 'weapon_type',
    'Officer ID': 'officer_id',
    'Officer YOB': 'officer_age',
    'Officer Gender': 'officer_gender',
    'Officer Race': 'officer_race',
    'Subject Perceived Race': 'subject_race',
    'Subject Perceived Gender': 'subject_gender',
    'Reported Date': 'date',
    'Initial Call Type': 'initial_call_type',
    'Call Type': 'call_type',
    'Officer Squad': 'officer_squad',
    'Arrest Flag': 'arrest',
    'Frisk Flag': 'frisk',
    'Precinct': 'precinct',
    'Sector': 'sector',
    'Beat': 'beat',
}

df = df.rename(columns=column_names)

In [9]:
# Show the dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34521 entries, 0 to 34520
Data columns (total 19 columns):
subject_age          34521 non-null object
subject_id           33285 non-null float64
stop_resolution      34521 non-null object
weapon_type          34521 non-null object
officer_id           34521 non-null int64
officer_age          34519 non-null float64
officer_gender       34521 non-null object
officer_race         34521 non-null object
subject_race         34521 non-null object
subject_gender       34521 non-null object
date                 34521 non-null object
initial_call_type    21968 non-null object
call_type            21968 non-null object
officer_squad        34153 non-null object
arrest               34521 non-null object
frisk                34521 non-null object
precinct             21968 non-null object
sector               21968 non-null object
beat                 21968 non-null object
dtypes: float64(2), int64(1), object(16)
memory usage: 5.0+ MB


In [10]:
# Change the data type of the category columns and parse the report date.
# The date is parsed with pd.to_datetime instead of astype('datetime64'): the unit-less
# 'datetime64' dtype string is rejected by newer pandas releases (2.x), while
# pd.to_datetime works across versions.
category_columns = ['subject_age', 'officer_gender', 'officer_race', 'subject_race',
                    'subject_gender', 'arrest', 'frisk', 'precinct', 'sector', 'beat']

df = df.astype({column: 'category' for column in category_columns})
df['date'] = pd.to_datetime(df['date'])

In [11]:
# Treat the '-' placeholder as missing data: swap it for NaN everywhere in the frame.
# (In the category columns '-' stays listed as a category, with a count of 0.)
df = df.replace('-', np.nan)

## Explore the values of the columns

In [12]:
# The subject age group has missing values (the '-' placeholders replaced with NaN above).
# They are left in: these are age groups as originally 'perceived' by officers,
# and the missing values will not affect a portion of the analysis.
df['subject_age'].value_counts(dropna=False)

26 - 35         11497
18 - 25          7457
36 - 45          6985
46 - 55          4324
1 - 17           1670
56 and Above     1573
NaN              1015
-                   0
Name: subject_age, dtype: int64

In [13]:
# Three least frequent values of the officer year-of-birth column, NaN included
df['officer_age'].value_counts(ascending=True, dropna=False).head(3)

1951.0    1
NaN       2
1946.0    2
Name: officer_age, dtype: int64

In [14]:
# Remove the 2 NaNs from the officer_age column by filling them with the mean value,
# truncated to a whole year. At this point 'officer_age' still holds the officer's
# year of birth; it is converted to an age in the next step.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on df.officer_age: the in-place form acts on an intermediate
# Series and is not guaranteed to update df (it has no effect under pandas Copy-on-Write).
mean_birth_year = int(df['officer_age'].mean())
df['officer_age'] = df['officer_age'].fillna(mean_birth_year)

In [15]:
# Turn the officer's year of birth (stored in 'officer_age') into the officer's age
# at the time of the report: report year minus year of birth.
# NOTE: this overwrites the column it reads, so running the cell a second time
# would subtract the report year again.
report_year = df['date'].dt.year
df['officer_age'] = report_year - df['officer_age']

In [16]:
# Summary statistics of the computed officer age
df['officer_age'].describe()

count    34521.000000
mean        35.137945
std          8.625263
min         21.000000
25%         29.000000
50%         33.000000
75%         40.000000
max        118.000000
Name: officer_age, dtype: float64

In [17]:
# Inspect the rows at the top of the officer age range (age above 70)
df.loc[df['officer_age'] > 70]

Unnamed: 0,subject_age,subject_id,stop_resolution,weapon_type,officer_id,officer_age,officer_gender,officer_race,subject_race,subject_gender,date,initial_call_type,call_type,officer_squad,arrest,frisk,precinct,sector,beat
1820,18 - 25,6989.0,Arrest with GO or Supplemental,,-9,118.0,N,Unknown,White,Male,2018-02-07,,,,Y,N,,,
23956,36 - 45,3196.0,Street Check,,-9,116.0,N,Unknown,White,Male,2016-09-14,,,,N,N,,,
27618,18 - 25,18244.0,Street Check,,-9,117.0,N,Unknown,Unknown,Female,2017-06-06,,,,N,N,,,
31095,36 - 45,24954.0,Street Check,,-9,116.0,N,Unknown,White,Female,2016-01-02,,,,N,N,,,
31185,18 - 25,24821.0,Street Check,,-9,115.0,N,Unknown,Black,Male,2015-05-19,,,,N,N,,,


In [18]:
# Drop the rows whose computed officer age is above 70. In this data those are the
# 5 rows shown above (ages 115-118, officer_id -9), which have multiple columns
# with NaN or unknown data.
implausible_age_rows = df.index[df['officer_age'] > 70]
df = df.drop(index=implausible_age_rows)

In [19]:
# Check officer_id for further negative placeholder values like the -9 seen above
# (none remain: the minimum is positive)
df['officer_id'].describe()

count    34516.000000
mean      1848.543284
std        784.092783
min          7.000000
25%       1585.000000
50%       1805.000000
75%       2156.000000
max       5158.000000
Name: officer_id, dtype: float64

In [20]:
# Distinct officer gender values present in the data
df['officer_gender'].unique()

[M, F]
Categories (2, object): [M, F]

In [21]:
# There are no NaNs, but there are 'Not Specified' and 'Unknown' values. These rows are
# kept in the dataset ('Unknown' is merged into 'Not Specified' further down).
df['officer_race'].value_counts(dropna=False, ascending=False)

White                            26858
Hispanic or Latino                1771
Two or More Races                 1588
Asian                             1413
Black or African American         1384
Not Specified                      845
Nat Hawaiian/Oth Pac Islander      369
American Indian/Alaska Native      286
Unknown                              2
Name: officer_race, dtype: int64

In [22]:
# Show the rows where the officer race is recorded as 'Unknown'
df.loc[df['officer_race'] == 'Unknown']

Unnamed: 0,subject_age,subject_id,stop_resolution,weapon_type,officer_id,officer_age,officer_gender,officer_race,subject_race,subject_gender,date,initial_call_type,call_type,officer_squad,arrest,frisk,precinct,sector,beat
13504,56 and Above,6864.0,GO Report,,4766,37.0,M,Unknown,Hispanic,Male,2018-07-10,UNKNOWN - ANI/ALI - WRLS PHNS (INCL OPEN LINE),"TELEPHONE OTHER, NOT 911",TRAINING - FIELD TRAINING SQUAD,N,Y,North,B,B2
29071,26 - 35,6490.0,Street Check,,4766,37.0,M,Unknown,White,Female,2018-05-21,,,TRAINING - FIELD TRAINING SQUAD,N,N,,,


In [23]:
# Show the first rows where the officer race is recorded as 'Not Specified'
df.loc[df['officer_race'] == 'Not Specified'].head()

Unnamed: 0,subject_age,subject_id,stop_resolution,weapon_type,officer_id,officer_age,officer_gender,officer_race,subject_race,subject_gender,date,initial_call_type,call_type,officer_squad,arrest,frisk,precinct,sector,beat
68,18 - 25,1806.0,,,1824,27.0,M,Not Specified,Unknown,Male,2015-07-18,,,WEST PCT 1ST W - KING/QUEEN,N,N,,,
224,46 - 55,13405.0,,,2169,29.0,M,Not Specified,Black,Male,2017-04-22,ASLT - IP/JO - PERSON SHOT OR SHOT AT,"TELEPHONE OTHER, NOT 911",EAST PCT 3RD W - E/G RELIEF,N,Y,East,G,G2
230,36 - 45,23667.0,,,1047,49.0,M,Not Specified,White,Male,2018-09-20,ASLT - IP/JO - WITH OR W/O WPNS (NO SHOOTINGS),911,WEST PCT 2ND W - K/Q RELIEF,N,N,West,K,K1
241,36 - 45,15030.0,,,2201,40.0,M,Not Specified,Unknown,Male,2019-04-29,,,SOUTHWEST PCT 2ND W - WILLIAM,N,N,,,
346,1 - 17,,Arrest with GO or Supplemental,Handgun,1576,34.0,M,Not Specified,Black,Male,2016-01-05,UNKNOWN - ANI/ALI - LANDLINE (INCLUDES OPEN LINE),911,SOUTH PCT OPS - NIGHT ACT,Y,Y,South,S,S3


In [24]:
# Replace the 'Unknown' officer race values with 'Not Specified' (officer_race column only)
officer_race_fix = {'officer_race': {'Unknown': 'Not Specified'}}
df = df.replace(officer_race_fix)

In [25]:
# List the distinct subject race values in the dataset
df['subject_race'].unique()

[Black, White, NaN, Multi-Racial, Unknown, Asian, Hispanic, American Indian / Alaskan Native, Other]
Categories (8, object): [Black, White, Multi-Racial, Unknown, Asian, Hispanic, American Indian / Alaskan Native, Other]

In [26]:
# Count the rows per subject race, NaN included
df['subject_race'].value_counts(dropna=False)

White                               17226
Black                               10486
Hispanic                             1684
Unknown                              1569
American Indian / Alaskan Native     1094
Asian                                1030
Multi-Racial                          809
NaN                                   466
Other                                 152
-                                       0
Name: subject_race, dtype: int64

In [27]:
# Number of rows with a missing subject race
int(df['subject_race'].isna().sum())

466

In [28]:
# Drop the rows whose subject_race is missing (466 rows here). Only subject_race is
# checked; the original note describes these rows as having no subject data
# (age, race, gender).
missing_race_rows = df.index[df['subject_race'].isna()]
df = df.drop(index=missing_race_rows)



In [29]:
# Summary of the report dates (count, distinct days, most frequent day, first and last date)
df['date'].describe()

count                   34050
unique                   1514
top       2015-10-01 00:00:00
freq                       91
first     2015-03-15 00:00:00
last      2019-05-07 00:00:00
Name: date, dtype: object

In [30]:
# Distinct call type values
df['call_type'].unique()

array(['911', 'ONVIEW', nan, 'TELEPHONE OTHER, NOT 911',
       'ALARM CALL (NOT POLICE ALARM)', 'TEXT MESSAGE',
       'SCHEDULED EVENT (RECURRING)'], dtype=object)

In [31]:
# Count the rows per initial call type, NaN included
df['initial_call_type'].value_counts(dropna=False)

NaN                                               12380
SUSPICIOUS PERSON, VEHICLE OR INCIDENT             1899
DISTURBANCE, MISCELLANEOUS/OTHER                   1670
SUSPICIOUS STOP - OFFICER INITIATED ONVIEW         1619
ASLT - IP/JO - WITH OR W/O WPNS (NO SHOOTINGS)     1378
                                                  ...  
MISSING - ADULT                                       1
ESCAPE - PRISONER                                     1
ALARM - RESIDENTIAL - SILENT/AUD PANIC/DURESS         1
ANIMAL, REPORT - BITE                                 1
DEMONSTRATIONS                                        1
Name: initial_call_type, Length: 157, dtype: int64

In [32]:
# Number of distinct initial call types (nunique does not count NaN)
df['initial_call_type'].nunique()

156

In [33]:
# Count the rows per call type, NaN included
df['call_type'].value_counts(dropna=False)

911                              13989
NaN                              12380
ONVIEW                            5309
TELEPHONE OTHER, NOT 911          2212
ALARM CALL (NOT POLICE ALARM)      158
TEXT MESSAGE                         1
SCHEDULED EVENT (RECURRING)          1
Name: call_type, dtype: int64

In [34]:
# Distinct values of the frisk flag
df['frisk'].unique()

[N, NaN, Y]
Categories (2, object): [N, Y]

In [35]:
# Count the rows per frisk flag value, NaN included
df['frisk'].value_counts(dropna=False)

N      26324
Y       7311
NaN      415
-          0
Name: frisk, dtype: int64

In [36]:
# Distinct values of the arrest flag
df['arrest'].unique()

[N, Y]
Categories (2, object): [N, Y]

In [37]:
# Per-column replacement table: in both flag columns 'Y' becomes 1 and 'N' becomes 0
flag_codes = {'Y': 1, 'N': 0}
clean_flags = {column: dict(flag_codes) for column in ('frisk', 'arrest')}

# Swap the string categories for 1s and 0s
df = df.replace(clean_flags)

In [38]:
# Number of rows and columns after the cleaning steps above
df.shape

(34050, 19)

In [39]:
# Change the layout of the columns (date, officer fields, subject fields, stop outcome
# and call fields, location fields) and save the cleaned file to csv.
df = df[['date','officer_id','officer_age', 'officer_race', 'officer_gender','officer_squad',
               'subject_id','subject_age','subject_race','subject_gender',
              'stop_resolution','weapon_type','initial_call_type','call_type','arrest', 'frisk',
              'precinct', 'sector', 'beat' ]]



# Create the path for the new file
terry_data_clean = os.path.join(PROJ_ROOT,
                         'data', 'processed',
                         'Terry_Stops_Clean.csv')

# to_csv does not create missing directories, so make sure data/processed exists
# before writing (no error if it is already there)
os.makedirs(os.path.dirname(terry_data_clean), exist_ok=True)

# index=False keeps the row index out of the saved file
df.to_csv(terry_data_clean, index = False)