This week we are tasked with exploring Linear and Logistic Regression models to predict an outcome. Based on our initial Exploratory analysis, we will perform feature extraction and analysis to determine what relevant features can be used to build our model

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
#Importing data
openpolicing_path="ma_statewide_2020_04_01.csv"

data=pd.read_csv(openpolicing_path,low_memory=False)

In [None]:
data.head(15)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 24 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   raw_row_number              int64  
 1   date                        object 
 2   location                    object 
 3   county_name                 object 
 4   subject_age                 float64
 5   subject_race                object 
 6   subject_sex                 object 
 7   type                        object 
 8   arrest_made                 object 
 9   citation_issued             object 
 11  outcome                     object 
 12  contraband_found            object 
 13  contraband_drugs            object 
 14  contraband_weapons          object 
 15  contraband_alcohol          bool   
 16  contraband_other            object 
 17  frisk_performed             object 
 18  search_conducted            bool   
 19  search_basis                object 
 20  reason_for_stop      

In [8]:
#Convert date to datetype
data.date
data["date"] = pd.to_datetime(data.date, format="%Y-%M-%d")
data["year"] = data.date.dt.year


In [9]:
#Convering race, sex, location etc to categorical
data.iloc[:,np.r_[2:5,5:15,16:18,19:23]]=data.iloc[:,np.r_[2:5,5:15,16:18,19:23]].astype("category")

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3416238 entries, 0 to 3416237
Data columns (total 25 columns):
 #   Column                      Dtype         
---  ------                      -----         
 0   raw_row_number              int64         
 1   date                        datetime64[ns]
 2   location                    category      
 3   county_name                 category      
 4   subject_age                 category      
 5   subject_race                category      
 6   subject_sex                 category      
 7   type                        category      
 8   arrest_made                 category      
 9   citation_issued             category      
 11  outcome                     category      
 12  contraband_found            category      
 13  contraband_drugs            category      
 14  contraband_weapons          category      
 15  contraband_alcohol          bool          
 16  contraband_other            category      
 17  frisk_performed   

In [11]:
#Checking proportion of missing values
data.isnull().sum()*100/len(data)

raw_row_number                 0.000000
date                           0.000000
location                       0.195127
county_name                    0.195127
subject_age                    4.625146
subject_race                   0.048709
subject_sex                    0.457316
type                           0.000000
arrest_made                    0.026813
citation_issued                0.026813
outcome                        0.200747
contraband_found              98.368673
contraband_drugs              98.368673
contraband_weapons            98.368673
contraband_alcohol             0.000000
contraband_other              98.368673
frisk_performed               98.400843
search_conducted               0.000000
search_basis                  98.506281
reason_for_stop               48.579432
vehicle_type                   0.145277
vehicle_registration_state     0.287275
raw_Race                       0.048709
year                           0.000000
dtype: float64

<!-- We will use the KNN Imputer for imputing values into the Age and Sex variables for better precision -->

We impute null values using an MICE imputer from fancyimpute, a popular algorithm.

In [None]:
# from sklearn.impute import KNNImputer

# imputer = KNNImputer(n_neighbors=2, weights="uniform")

# data["subject_age"]= imputer.fit_transform(data[['subject_age']])
# # data['subject_sex'] = imputer.fit_transform(data[['subject_sex']])
# data["subject_age"].isnull().sum()
# print(data['subject_age'])
# # print(data['subject_sex'])

In [23]:
# Impute the missing values.

# from fancyimpute import MICE
# trans = MICE(verbose=False)
# data["subject_sex"] = trans.complete(data["subject_sex"])


ImportError: dlopen(/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/_cvxcore.cpython-37m-darwin.so, 2): Library not loaded: @rpath/libc++.1.dylib
  Referenced from: /Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/_cvxcore.cpython-37m-darwin.so
  Reason: image not found

In [13]:
#Removing null, and unknown values from race
data=data[-data["subject_race"].isin(["other","unknown",np.nan])]
data["subject_race"].unique()

[white, hispanic, black, asian/pacific islander]
Categories (4, object): [white, hispanic, black, asian/pacific islander]

We now create new values for search, success rates

In [19]:
# group_agg = data.groupby(["date"]).agg({
#   "search_conducted" : ["mean"], 
#   "frisk_performed" : ["mean"], 
#   "contraband_found" : ["mean"]
#   })

# group_agg.columns = ["_".join(x) for x in group_agg.columns.ravel()]

# You can also pass multiple functions to aggregate the same column e.g:
# group_agg = data.groupby(["group1", "group2"]).agg({"" : ["mean", "std", "sum"]})

DataError: No numeric types to aggregate

In [None]:
# # One hot encoding race and other categorical variables
# import sklearn.preprocessing.LabelEncoder
# label_encoder = LabelEncoder()

# df['subject_race'] = label_encoder.fit_transform(df['subject_race'])
# df['arrest_made'] = label_encoder.fit_transform(df['arrest_made'])
# df['citation_issued'] = label_encoder.fit_transform(df['citation_issued'])
# df['outcome'] = label_encoder.fit_transform(df['outcome'])
# df['contraband_found'] = label_encoder.fit_transform(df['contraband_found'])
# df['contraband_drugs'] = label_encoder.fit_transform(df['contraband_drugs'])
# df['contraband_weapons'] = label_encoder.fit_transform(df['contraband_weapons'])
# df['contraband_alcohol'] = label_encoder.fit_transform(df['contraband_alcohol'])
# df['contraband_other'] = label_encoder.fit_transform(df['contraband_other'])

In [None]:
data["subject_race"].unique()
#data["subject_race"].value_counts()/len(data["subject_race"])