This week we are tasked with exploring Linear and Logistic Regression models to predict an outcome. Based on our initial Exploratory analysis, we will perform feature extraction and analysis to determine what relevant features can be used to build our model

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
#Importing data
openpolicing_path="ma_statewide_2020_04_01.csv"

data=pd.read_csv(openpolicing_path,low_memory=False)

In [40]:
data.head()

Unnamed: 0,raw_row_number,date,location,county_name,subject_age,subject_race,subject_sex,type,arrest_made,citation_issued,warning_issued,outcome,contraband_found,contraband_drugs,contraband_weapons,contraband_alcohol,contraband_other,frisk_performed,search_conducted,search_basis,reason_for_stop,vehicle_type,vehicle_registration_state,raw_Race
0,1,2007-06-06,MIDDLEBOROUGH,Plymouth County,33.0,white,male,vehicular,False,True,False,citation,,,,False,,,False,,Speed,Passenger,MA,White
1,2,2007-06-07,SEEKONK,Bristol County,36.0,white,male,vehicular,False,False,True,warning,False,False,False,False,False,False,True,other,,Commercial,MA,White
2,3,2007-06-07,MEDFORD,Middlesex County,56.0,white,female,vehicular,False,False,True,warning,,,,False,,,False,,,Passenger,MA,White
3,4,2007-06-07,MEDFORD,Middlesex County,37.0,white,male,vehicular,False,False,True,warning,,,,False,,,False,,,Commercial,MA,White
4,5,2007-06-07,EVERETT,Middlesex County,22.0,hispanic,female,vehicular,False,True,False,citation,,,,False,,,False,,,Commercial,MA,Hispanic


In [28]:
data.info()

data.contraband_found()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 24 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   raw_row_number              int64  
 1   date                        object 
 2   location                    object 
 3   county_name                 object 
 4   subject_age                 float64
 5   subject_race                object 
 6   subject_sex                 object 
 7   type                        object 
 8   arrest_made                 object 
 9   citation_issued             object 
 11  outcome                     object 
 12  contraband_found            object 
 13  contraband_drugs            object 
 14  contraband_weapons          object 
 15  contraband_alcohol          bool   
 16  contraband_other            object 
 17  frisk_performed             object 
 18  search_conducted            bool   
 19  search_basis                object 
 20  reason_for_stop      

vehicular    3416238
Name: type, dtype: int64

In [70]:
#Convert date to datetype
data.date
data["date"] = pd.to_datetime(data.date, format="%Y-%M-%d ")
# data["year"] = data.date.dt.year
data['time'] = [d.time() for d in data['date']]

data['date'] = pd.to_datetime(data['date']).dt.date

In [30]:
#Convering race, sex, location etc to categorical
data.iloc[:,np.r_[2:4,5:8,9:12,19:23]]=data.iloc[:,np.r_[2:4,5:8,9:12,19:23]].astype("category")



In [71]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 26 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   raw_row_number              int64  
 1   date                        object 
 2   location                    object 
 3   county_name                 object 
 4   subject_age                 float64
 5   subject_race                object 
 6   subject_sex                 object 
 7   type                        object 
 8   arrest_made                 object 
 9   citation_issued             object 
 11  outcome                     object 
 12  contraband_found            object 
 13  contraband_drugs            object 
 14  contraband_weapons          object 
 15  contraband_alcohol          bool   
 16  contraband_other            object 
 17  frisk_performed             object 
 18  search_conducted            bool   
 19  search_basis                object 
 20  reason_for_stop      

In [32]:
#Checking proportion of missing values
data.isnull().sum()*100/len(data)

raw_row_number                 0.000000
date                           0.000000
location                       0.195127
county_name                    0.195127
subject_age                    4.625146
subject_race                   0.048709
subject_sex                    0.457316
type                           0.000000
arrest_made                    0.026813
citation_issued                0.026813
outcome                        0.200747
contraband_found              98.368673
contraband_drugs              98.368673
contraband_weapons            98.368673
contraband_alcohol             0.000000
contraband_other              98.368673
frisk_performed               98.400843
search_conducted               0.000000
search_basis                  98.506281
reason_for_stop               48.579432
vehicle_type                   0.145277
vehicle_registration_state     0.287275
raw_Race                       0.048709
dtype: float64

<!-- We will use the KNN Imputer for imputing values into the Age and Sex variables for better precision -->

We impute null values using an MICE imputer from fancyimpute, a popular algorithm.
Check imputation algorithms here:
https://www.kaggle.com/residentmario/simple-techniques-for-missing-data-imputation

In [None]:
# from sklearn.impute import KNNImputer

# imputer = KNNImputer(n_neighbors=2, weights="uniform")

# data["subject_age"]= imputer.fit_transform(data[['subject_age']])
# # data['subject_sex'] = imputer.fit_transform(data[['subject_sex']])
# data["subject_age"].isnull().sum()
# print(data['subject_age'])
# # print(data['subject_sex'])

In [19]:
# Impute the missing values.

# from fancyimpute import MICE
# trans = MICE(verbose=False)
# data["subject_sex"] = trans.complete(data["subject_sex"])


0          False
1          False
2          False
3          False
4          False
           ...  
3416233    False
3416234    False
3416235    False
3416236    False
3416237    False
Name: subject_race, Length: 3386549, dtype: bool

In [33]:
#Removing null, and unknown values from race
data=data[-data["subject_race"].isin(["other","unknown",np.nan])]
data["subject_race"].unique()

[white, hispanic, black, asian/pacific islander]
Categories (4, object): [white, hispanic, black, asian/pacific islander]

We now create new values for search, success rates

In [79]:
group_agg = data.groupby(["date"]).agg({
  "search_conducted" : lambda x: x.mean(skipna = True), 
  "frisk_performed" : lambda x: x.mean(skipna = True), 
  "contraband_found" : lambda x: x.mean(skipna = True)
  })

In [92]:
group_agg.rename(columns={"search_conducted" : 'search_rate', "frisk_performed" : 'frisk_rate',"contraband_found":'success_rate'}, inplace=True)

print(group_agg)

            search_rate  frisk_rate  success_rate
date                                             
2007-01-01     0.028747    0.052885      0.440171
2007-01-02     0.043081    0.043011      0.372308
2007-01-03     0.038253    0.045455      0.436782
2007-01-04     0.038511    0.067568      0.365314
2007-01-05     0.030079    0.038278      0.320833
...                 ...         ...           ...
2015-01-27     0.007410    0.070588      0.547619
2015-01-28     0.011166    0.056452      0.464000
2015-01-29     0.007691    0.051282      0.461538
2015-01-30     0.006047    0.046154      0.307692
2015-01-31     0.007370    0.000000      0.510638

[279 rows x 3 columns]


In [None]:
# One hot encoding race and other categorical variables
import sklearn.preprocessing.LabelEncoder
label_encoder = LabelEncoder()

df['subject_race'] = label_encoder.fit_transform(df['subject_race'])
df['arrest_made'] = label_encoder.fit_transform(df['arrest_made'])
df['citation_issued'] = label_encoder.fit_transform(df['citation_issued'])
df['outcome'] = label_encoder.fit_transform(df['outcome'])
df['contraband_found'] = label_encoder.fit_transform(df['contraband_found'])
df['contraband_drugs'] = label_encoder.fit_transform(df['contraband_drugs'])
df['contraband_weapons'] = label_encoder.fit_transform(df['contraband_weapons'])
df['contraband_alcohol'] = label_encoder.fit_transform(df['contraband_alcohol'])
df['contraband_other'] = label_encoder.fit_transform(df['contraband_other'])

In [None]:
# #get sunset time
# from astral import Astral
# import datetime
# a = Astral()
# a.solar_depression = 'civil'
# city = a["Boston"]
# sun = city.sun(date=datetime.date(group_agg["date"]), local=True)

In [99]:
# import ephem

# def sunup(lat, long, time):
#     o = ephem.Observer()
#     o.long = long
#     o.lat = lat
#     o.date = time
#     s = ephem.Sun()
#     s.compute(o)
#     return s.alt > 0

# def daytime(datefield, feature):
#     date = str(feature.attribute(datefield))
#     xy = feature.geometry().asPoint()
#     return ["Night","Day"][sunup(str(xy[1]),str(xy[0]),date)]

# daytime(data["date"])