This week we are tasked with exploring Linear and Logistic Regression models to predict an outcome. Based on our initial Exploratory analysis, we will perform feature extraction and analysis to determine what relevant features can be used to build our model

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
#Importing data
openpolicing_path="ma_statewide_2020_04_01.csv"

data=pd.read_csv(openpolicing_path,low_memory=False)

In [5]:
data.head()

Unnamed: 0,raw_row_number,date,location,county_name,subject_age,subject_race,subject_sex,type,arrest_made,citation_issued,...,contraband_weapons,contraband_alcohol,contraband_other,frisk_performed,search_conducted,search_basis,reason_for_stop,vehicle_type,vehicle_registration_state,raw_Race
0,1,2007-06-06,MIDDLEBOROUGH,Plymouth County,33.0,white,male,vehicular,False,True,...,,False,,,False,,Speed,Passenger,MA,White
1,2,2007-06-07,SEEKONK,Bristol County,36.0,white,male,vehicular,False,False,...,False,False,False,False,True,other,,Commercial,MA,White
2,3,2007-06-07,MEDFORD,Middlesex County,56.0,white,female,vehicular,False,False,...,,False,,,False,,,Passenger,MA,White
3,4,2007-06-07,MEDFORD,Middlesex County,37.0,white,male,vehicular,False,False,...,,False,,,False,,,Commercial,MA,White
4,5,2007-06-07,EVERETT,Middlesex County,22.0,hispanic,female,vehicular,False,True,...,,False,,,False,,,Commercial,MA,Hispanic


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 24 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   raw_row_number              int64  
 1   date                        object 
 2   location                    object 
 3   county_name                 object 
 4   subject_age                 float64
 5   subject_race                object 
 6   subject_sex                 object 
 7   type                        object 
 8   arrest_made                 object 
 9   citation_issued             object 
 11  outcome                     object 
 12  contraband_found            object 
 13  contraband_drugs            object 
 14  contraband_weapons          object 
 15  contraband_alcohol          bool   
 16  contraband_other            object 
 17  frisk_performed             object 
 18  search_conducted            bool   
 19  search_basis                object 
 20  reason_for_stop      

In [16]:
##Convert date to datetype
import datetime
data.date
data["date"] = pd.to_datetime(data.date, format="%Y-%M-%d")
#Extract time
data['time'] = [d.time() for d in data['date']]

In [17]:
data

Unnamed: 0,raw_row_number,date,location,county_name,subject_age,subject_race,subject_sex,type,arrest_made,citation_issued,...,contraband_alcohol,contraband_other,frisk_performed,search_conducted,search_basis,reason_for_stop,vehicle_type,vehicle_registration_state,raw_Race,time
0,1,2007-01-06 00:06:00,MIDDLEBOROUGH,Plymouth County,33.0,white,male,vehicular,False,True,...,False,,,False,,Speed,Passenger,MA,White,00:06:00
1,2,2007-01-07 00:06:00,SEEKONK,Bristol County,36.0,white,male,vehicular,False,False,...,False,False,False,True,other,,Commercial,MA,White,00:06:00
2,3,2007-01-07 00:06:00,MEDFORD,Middlesex County,56.0,white,female,vehicular,False,False,...,False,,,False,,,Passenger,MA,White,00:06:00
3,4,2007-01-07 00:06:00,MEDFORD,Middlesex County,37.0,white,male,vehicular,False,False,...,False,,,False,,,Commercial,MA,White,00:06:00
4,5,2007-01-07 00:06:00,EVERETT,Middlesex County,22.0,hispanic,female,vehicular,False,True,...,False,,,False,,,Commercial,MA,Hispanic,00:06:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3416233,3418294,2015-01-22 00:12:00,DARTMOUTH,Bristol County,25.0,white,male,vehicular,False,True,...,False,,,False,,Speed,Passenger,MA,White,00:12:00
3416234,3418295,2015-01-27 00:12:00,FALL RIVER,Bristol County,52.0,asian/pacific islander,female,vehicular,False,True,...,False,,,False,,Speed,Passenger,MA,Asian or Pacific Islander,00:12:00
3416235,3418296,2015-01-06 00:01:00,NORTHAMPTON,Hampshire County,41.0,white,female,vehicular,False,False,...,False,,,False,,,Passenger,MA,White,00:01:00
3416236,3418297,2015-01-06 00:01:00,HOLYOKE,Hampden County,26.0,white,male,vehicular,False,True,...,False,,,False,,,Passenger,MA,White,00:01:00


In [None]:
#Convering race, sex, location etc to categorical
data.iloc[:,np.r_[2:4,5:8,9:12,19:23]]=data.iloc[:,np.r_[2:4,5:8,9:12,19:23]].astype("category")

In [None]:
data.info()

In [38]:
#Checking proportion of missing values
data.isnull().sum()*100/len(data)

raw_row_number                 0.000000
date                           0.000000
location                       0.192881
county_name                    0.192881
subject_age                    4.251496
subject_race                   0.000000
subject_sex                    0.066616
type                           0.000000
arrest_made                    0.026546
citation_issued                0.026546
outcome                        0.198019
contraband_found              98.365711
contraband_drugs              98.365711
contraband_weapons            98.365711
contraband_alcohol             0.000000
contraband_other              98.365711
frisk_performed               98.396125
search_conducted               0.000000
search_basis                  98.501956
reason_for_stop               48.285615
vehicle_type                   0.139641
vehicle_registration_state     0.270423
raw_Race                       0.000000
time                           0.000000
rise                           0.000000


<!-- We will use the KNN Imputer for imputing values into the Age and Sex variables for better precision -->

We impute null values using an MICE imputer from fancyimpute, a popular algorithm.
Check imputation algorithms here:
https://www.kaggle.com/residentmario/simple-techniques-for-missing-data-imputation

# Please fix below code(s) to impute values for age and sex, the missing values are < 1% for sex so we can remove them for more precision for age it is 4.6%

In [None]:
# from sklearn.impute import KNNImputer

# imputer = KNNImputer(n_neighbors=2, weights="uniform")

# data["subject_age"]= imputer.fit_transform(data[['subject_age']])
# # data['subject_sex'] = imputer.fit_transform(data[['subject_sex']])
# data["subject_age"].isnull().sum()
# print(data['subject_age'])
# # print(data['subject_sex'])

In [None]:
# Impute the missing values.

# from fancyimpute import MICE
# trans = MICE(verbose=False)
# data["subject_sex"] = trans.complete(data["subject_sex"])


In [35]:
#Removing null, and unknown values from race
data=data[-data["subject_race"].isin(["other","unknown",np.nan])]
data["subject_race"].unique()

array(['white', 'hispanic', 'black', 'asian/pacific islander'],
      dtype=object)

Encoding categorical variables for future analyses

In [None]:
# One hot encoding race and other categorical variables
import sklearn.preprocessing.LabelEncoder
label_encoder = LabelEncoder()

df['subject_race'] = label_encoder.fit_transform(df['subject_race'])
df['arrest_made'] = label_encoder.fit_transform(df['arrest_made'])
df['citation_issued'] = label_encoder.fit_transform(df['citation_issued'])
df['outcome'] = label_encoder.fit_transform(df['outcome'])
df['contraband_found'] = label_encoder.fit_transform(df['contraband_found'])
df['contraband_drugs'] = label_encoder.fit_transform(df['contraband_drugs'])
df['contraband_weapons'] = label_encoder.fit_transform(df['contraband_weapons'])
df['contraband_alcohol'] = label_encoder.fit_transform(df['contraband_alcohol'])
df['contraband_other'] = label_encoder.fit_transform(df['contraband_other'])

In [34]:
from suntime import Sun, SunTimeException

latitude = 42.407211
longitude = -71.382439

sun = Sun(latitude, longitude)

# Get sunrise and sunset in UTC
sunrise=[]
sunset=[]
for value in data["date"]:
    sunrise.append(sun.get_sunrise_time(value))
    sunset.append(sun.get_sunset_time(value))
    format(value.strftime('%H:%M'))
                  
data["sunrise"]=sunrise
data["sunset"]=sunset
print(data)

         raw_row_number       date       location       county_name  \
0                     1 2007-06-06  MIDDLEBOROUGH   Plymouth County   
1                     2 2007-06-07        SEEKONK    Bristol County   
2                     3 2007-06-07        MEDFORD  Middlesex County   
3                     4 2007-06-07        MEDFORD  Middlesex County   
4                     5 2007-06-07        EVERETT  Middlesex County   
...                 ...        ...            ...               ...   
3416233         3418294 2015-12-22      DARTMOUTH    Bristol County   
3416234         3418295 2015-12-27     FALL RIVER    Bristol County   
3416235         3418296 2015-01-06    NORTHAMPTON  Hampshire County   
3416236         3418297 2015-01-06        HOLYOKE    Hampden County   
3416237         3418298 2015-01-06        HOLYOKE    Hampden County   

         subject_age            subject_race subject_sex       type  \
0               33.0                   white        male  vehicular   
1    

We now create new values for search, success rates

In [41]:
group_agg = data.groupby(["date"]).agg({
  "search_conducted" : lambda x: x.mean(skipna = True), 
  "frisk_performed" : lambda x: x.mean(skipna = True), 
  "contraband_found" : lambda x: x.mean(skipna = True)
  })

In [42]:
group_agg.rename(columns={"search_conducted" : 'search_rate', "frisk_performed" : 'frisk_rate',"contraband_found":'success_rate'}, inplace=True)

print(group_agg)

            search_rate  frisk_rate  success_rate
date                                             
2007-01-01     0.039474         0.5      0.333333
2007-01-02     0.021739         0.0      1.000000
2007-01-03     0.000000         NaN           NaN
2007-01-04     0.000000         NaN           NaN
2007-01-05     0.024390         0.0      0.000000
...                 ...         ...           ...
2015-12-27     0.009747         0.4      0.600000
2015-12-28     0.008997         0.0      0.571429
2015-12-29     0.008380         0.0      0.666667
2015-12-30     0.003106         0.0      0.000000
2015-12-31     0.008902         0.0      1.000000

[3287 rows x 3 columns]
