This week we are tasked with exploring linear and logistic regression models to predict an outcome. Based on our initial exploratory analysis, we will perform feature extraction and analysis to determine which features are relevant for building our model.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import tensorflow as tf

In [5]:
# Path to the statewide stops CSV, relative to the notebook's working directory.
openpolicing_path = "../ma_statewide_2020_04_01.csv"

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
data = pd.read_csv(filepath_or_buffer=openpolicing_path, low_memory=False)

In [3]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,raw_row_number,date,location,county_name,subject_age,subject_race,subject_sex,type,arrest_made,citation_issued,...,contraband_weapons,contraband_alcohol,contraband_other,frisk_performed,search_conducted,search_basis,reason_for_stop,vehicle_type,vehicle_registration_state,raw_Race
0,1,2007-06-06,MIDDLEBOROUGH,Plymouth County,33.0,white,male,vehicular,False,True,...,,False,,,False,,Speed,Passenger,MA,White
1,2,2007-06-07,SEEKONK,Bristol County,36.0,white,male,vehicular,False,False,...,False,False,False,False,True,other,,Commercial,MA,White
2,3,2007-06-07,MEDFORD,Middlesex County,56.0,white,female,vehicular,False,False,...,,False,,,False,,,Passenger,MA,White
3,4,2007-06-07,MEDFORD,Middlesex County,37.0,white,male,vehicular,False,False,...,,False,,,False,,,Commercial,MA,White
4,5,2007-06-07,EVERETT,Middlesex County,22.0,hispanic,female,vehicular,False,True,...,,False,,,False,,,Commercial,MA,Hispanic


We check the data types to see whether appropriate types are being used, and convert them where they are not.

In [4]:
# List every column with its inferred dtype to spot the ones that need conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 24 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   raw_row_number              int64  
 1   date                        object 
 2   location                    object 
 3   county_name                 object 
 4   subject_age                 float64
 5   subject_race                object 
 6   subject_sex                 object 
 7   type                        object 
 8   arrest_made                 object 
 9   citation_issued             object 
 11  outcome                     object 
 12  contraband_found            object 
 13  contraband_drugs            object 
 14  contraband_weapons          object 
 15  contraband_alcohol          bool   
 16  contraband_other            object 
 17  frisk_performed             object 
 18  search_conducted            bool   
 19  search_basis                object 
 20  reason_for_stop      

We convert the date from object to date type and extract time for future analyses.

In [6]:
# Parse the `YYYY-MM-DD` date strings once into datetime64 values.
parsed_date = pd.to_datetime(data["date"], format="%Y-%m-%d")

# Extract the time-of-day component with the vectorised `.dt` accessor
# (replaces a Python-level loop over every row).
# NOTE(review): the format above contains no time fields, so every value
# extracted here is midnight — confirm whether a real stop time is available
# before relying on this column. The saved output of the next cell (January
# dates, 00:06:00 times) looks like it came from a `%M` (minute) directive and
# appears stale relative to this code; re-run to refresh it.
data["time"] = parsed_date.dt.time

# Store the date back as plain `datetime.date` objects.
data["date"] = parsed_date.dt.date

In [6]:
# Preview the frame again to check the converted `date` and the new `time` column.
data.head()

Unnamed: 0,raw_row_number,date,location,county_name,subject_age,subject_race,subject_sex,type,arrest_made,citation_issued,...,contraband_alcohol,contraband_other,frisk_performed,search_conducted,search_basis,reason_for_stop,vehicle_type,vehicle_registration_state,raw_Race,time
0,1,2007-01-06,MIDDLEBOROUGH,Plymouth County,33.0,white,male,vehicular,False,True,...,False,,,False,,Speed,Passenger,MA,White,00:06:00
1,2,2007-01-07,SEEKONK,Bristol County,36.0,white,male,vehicular,False,False,...,False,False,False,True,other,,Commercial,MA,White,00:06:00
2,3,2007-01-07,MEDFORD,Middlesex County,56.0,white,female,vehicular,False,False,...,False,,,False,,,Passenger,MA,White,00:06:00
3,4,2007-01-07,MEDFORD,Middlesex County,37.0,white,male,vehicular,False,False,...,False,,,False,,,Commercial,MA,White,00:06:00
4,5,2007-01-07,EVERETT,Middlesex County,22.0,hispanic,female,vehicular,False,True,...,False,,,False,,,Commercial,MA,Hispanic,00:06:00


In [7]:
# Converting race, sex, location etc. to categorical.
# The positional selection is resolved to column labels first, and each column
# is then replaced outright. Writing through `.iloc[:, cols] = ...` sets values
# in place on pandas >= 2.0, which would leave the original `object` dtype
# unchanged.
# NOTE(review): position 8 (`arrest_made`) is outside the selection and stays
# `object` — confirm that is intended.
categorical_cols = data.columns[np.r_[2:4, 5:8, 9:12, 19:23]]
for col in categorical_cols:
    data[col] = data[col].astype("category")

In [8]:
# Re-inspect the dtypes to confirm which columns are now `category`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 25 columns):
 #   Column                      Dtype   
---  ------                      -----   
 0   raw_row_number              int64   
 1   date                        object  
 2   location                    category
 3   county_name                 category
 4   subject_age                 float64 
 5   subject_race                category
 6   subject_sex                 category
 7   type                        category
 8   arrest_made                 object  
 9   citation_issued             category
 11  outcome                     category
 12  contraband_found            object  
 13  contraband_drugs            object  
 14  contraband_weapons          object  
 15  contraband_alcohol          bool    
 16  contraband_other            object  
 17  frisk_performed             object  
 18  search_conducted            bool    
 19  search_basis                category
 20  

In [9]:
# Percentage of missing entries in each column.
data.isna().sum().mul(100).div(len(data))

raw_row_number                 0.000000
date                           0.000000
location                       0.195127
county_name                    0.195127
subject_age                    4.625146
subject_race                   0.048709
subject_sex                    0.457316
type                           0.000000
arrest_made                    0.026813
citation_issued                0.026813
outcome                        0.200747
contraband_found              98.368673
contraband_drugs              98.368673
contraband_weapons            98.368673
contraband_alcohol             0.000000
contraband_other              98.368673
frisk_performed               98.400843
search_conducted               0.000000
search_basis                  98.506281
reason_for_stop               48.579432
vehicle_type                   0.145277
vehicle_registration_state     0.287275
raw_Race                       0.048709
time                           0.000000
dtype: float64

<!-- We will use the KNN Imputer for imputing values into the Age and Sex variables for better precision -->

In [None]:
# Impute the missing values.
# NOTE(review): this draft is disabled. `subject_` is a placeholder, not a
# column of the frame (the columns are `subject_age`, `subject_race` and
# `subject_sex`), so a concrete column must be chosen before enabling it.
# from sklearn.impute import SimpleImputer

# # impute missing data
# my_imputer = SimpleImputer(strategy='most_frequent')
# data["subject_"] = pd.DataFrame(my_imputer.fit_transform(data["subject_"]))

In [8]:
# Keep only stops with a specific, known race: drop "other", "unknown" and
# missing values.
race = data["subject_race"]
excluded_race = race.isin(["other", "unknown"]) | race.isna()

# `~` is the boolean-mask negation operator. `.copy()` makes the filtered frame
# independent, so later column assignments do not raise SettingWithCopyWarning.
data = data[~excluded_race].copy()
data["subject_race"].unique()

[white, hispanic, black, asian/pacific islander]
Categories (4, object): [white, hispanic, black, asian/pacific islander]

Encoding categorical variables for future analyses

In [None]:
#### The code below does not work. Error: Input contains NaN

In [None]:
## One hot encoding race and other categorical variables
# NOTE(review): disabled draft — the preceding note records that it failed with
# "Input contains NaN". The notebook uses `pd.get_dummies` in a later cell instead.
# from sklearn.preprocessing import OneHotEncoder
# ohe = OneHotEncoder(categories='auto')
# df = pd.DataFrame(data)
# feature_arr = ohe.fit_transform(df[['subject_race','arrest_made','citation_issued','outcome','contraband_found','contraband_drugs','contraband_weapons','contraband_alcohol','contraband_other']]).toarray()
# feature_labels = ohe.categories_
# feature_labels = np.array(feature_labels).ravel()

# features = pd.DataFrame(feature_arr, columns=feature_labels)

We use the `get_dummies` function to convert the categorical variables into one-hot-encoded indicator variables.

In [14]:
# Expand the selected categorical columns into indicator (dummy) columns.
df = pd.get_dummies(
    data[
        [
            "subject_race",
            "arrest_made",
            "citation_issued",
            "outcome",
            "contraband_found",
            "contraband_drugs",
            "contraband_weapons",
            "contraband_alcohol",
            "contraband_other",
            "search_basis",
            "reason_for_stop",
        ]
    ]
)

We create a new binary variable that tells us whether the stop occurred before or after sunset.

In [56]:
from suntime import Sun, SunTimeException

# Single reference location applied to every stop.
latitude = 42.407211
longitude = -71.382439

sun = Sun(latitude, longitude)

# Get sunset in UTC.
# For a fixed location the sunset time depends only on the calendar date, so it
# is computed once per distinct date and then looked up for every row, instead
# of being recomputed for each of the millions of stops.
# (Sunrise could be built the same way with `sun.get_sunrise_time`.)
sunset_by_date = {day: sun.get_sunset_time(day) for day in data["date"].unique()}
sunset = [sunset_by_date[day] for day in data["date"]]

In [69]:
sunset = pd.to_datetime(sunset)

# `data["time"]` holds `datetime.time` objects, which `pd.to_datetime` rejects
# ("<class 'datetime.time'> is not convertible to datetime"). Convert through
# the HH:MM:SS string form instead. With this format the date part defaults to
# 1900-01-01, so only the time-of-day of the result is meaningful.
# The dtype guard keeps the cell safe to re-run after the column is converted.
if not pd.api.types.is_datetime64_any_dtype(data["time"]):
    data["time"] = pd.to_datetime(data["time"].astype(str), format="%H:%M:%S")

In [None]:
# Creating binomial variable for whether the stop was before or after sunset
# NOTE(review): disabled draft. As written, the nested loops would reassign the
# whole `is_sunset` column on every iteration, so only the last comparison would
# survive. The flags would also be the strings "True"/"False" rather than
# booleans, and the `is_sunset` list is never used.
# is_sunset=[]
# for i in range(len(sunset)):
#     for value in data["time"]:
#         data["is_sunset"]=np.where(value.time()>sunset[i].time(),"True","False")




In [105]:
# For each distinct `time` value, take the mean of each flag column,
# skipping missing entries.
group_agg = data.groupby("time").agg(
    dict.fromkeys(
        ["search_conducted", "frisk_performed", "contraband_found"],
        lambda x: x.mean(skipna=True),
    )
)

In [None]:
# Relabel the aggregated columns as rates (mutates `group_agg` in place).
group_agg.rename(
    {
        "search_conducted": "search_rate",
        "frisk_performed": "frisk_rate",
        "contraband_found": "success_rate",
    },
    axis="columns",
    inplace=True,
)

# Number of rows in the aggregated table.
print(group_agg.shape[0])