In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
pd.set_option('max_columns', 120)
pd.set_option('max_colwidth', 5000)
%matplotlib inline

# inspired by 
# https://github.com/mbadi/Stop_and_Frisk_Classification-/blob/master/NYPD_Stop-and-Frisk.ipynb
# https://github.com/alexamanpreet/Machine-Learning-using-DNN-on-NYPD-s-Stop-and-Frisk/blob/master/Machine%20Learning%20using%20DNN%20on%20NYPD's%20%20Stop-and-Frisk.ipynb

### 

- pct PRECINCT OF STOP (FROM 1 TO 123)
- inout		WAS STOP INSIDE OR OUTSIDE
- perobs		PERIOD OF OBSERVATION (MMM)
- perstop		PERIOD OF STOP (MMM)
- explnstp		DID OFFICER EXPLAIN REASON FOR STOP ?
- arstmade		WAS AN ARREST MADE ?
- sumissue		WAS A SUMMONS ISSUED ?
- offunif		WAS OFFICER IN UNIFORM ?
- frisked		WAS SUSPECT FRISKED ?
- searched		WAS SUSPECT SEARCHED ?
- contrabn		WAS CONTRABAND FOUND ON SUSPECT ?
- pistol		WAS A PISTOL FOUND ON SUSPECT ?
- rf_attir		REASON FOR FRISK - INAPPROPRIATE ATTIRE FOR SEASON
- sex		SUSPECT'S SEX
- race		SUSPECT'S RACE
- age		SUSPECT'S AGE

##
- A - ASIAN/PACIFIC ISLANDER
- B - BLACK
- I - AMERICAN INDIAN/ALASKAN NATIVE
- P - BLACK-HISPANIC
- Q - WHITE-HISPANIC
- W - WHITE
- U - UNKNOWN
- Z - OTHER

In [3]:
## READ DATA 
df = pd.read_csv('../data/sqf_2013_2016.csv')
df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(272606, 112)

##  Preprocessing 

In [4]:
## Remove columns that have many missing values
df = df.dropna(thresh=len(df)/2, axis=1) 
df.shape

(272606, 112)

In [5]:
# Keep these columns 
colnames = ['age','sex','race', 'perobs','perstop',
            'explnstp','sumissue','inout', 'rf_attir', 'searched', 
            'frisked', 'offunif', 'pistol','contrabn','arstmade']
df = df[colnames]
df.shape

(272606, 15)

In [8]:
## Remove Null values 
df = df.replace('(Null)', np.nan)
df = df.replace('**', np.nan)
df = df.replace(' ', np.nan)
df = df.dropna()
df.shape

(261379, 15)

In [9]:
### Encoding YES or NO
le = LabelEncoder()
#convert the categorical columns into numeric
df['sex'] = le.fit_transform(df['sex'])
df['explnstp'] = le.fit_transform(df['explnstp'])
df['sumissue'] = le.fit_transform(df['sumissue'])
df['inout'] = le.fit_transform(df['inout'])
df['rf_attir'] = le.fit_transform(df['rf_attir'])
df['searched'] = le.fit_transform(df['searched'])
df['frisked'] = le.fit_transform(df['frisked'])
df['offunif'] = le.fit_transform(df['offunif'])
df['pistol'] = le.fit_transform(df['pistol'])
df['contrabn'] = le.fit_transform(df['contrabn'])
df['arstmade'] = le.fit_transform(df['arstmade'])

In [10]:
df.shape

(261379, 15)

In [11]:
df.head()

Unnamed: 0,age,sex,race,perobs,perstop,explnstp,sumissue,inout,rf_attir,searched,frisked,offunif,pistol,contrabn,arstmade
0,41,1,B,6,1,1,0,0,2,0,1,0,1,0,0
1,32,1,A,15,3,1,0,0,1,0,0,0,1,0,0
2,20,1,B,10,5,1,0,0,1,1,1,0,1,0,0
3,57,1,B,1,1,1,0,1,1,0,0,1,1,0,0
4,31,1,B,5,2,1,0,1,1,0,0,1,1,0,0
