## Implementation of Tree Based Models

In [13]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [14]:
#Importing data
openpolicing_path="C:/Users/SwetaMankala/Desktop/Assignments/EAI6000/ma_statewide_2020.csv"

data=pd.read_csv(openpolicing_path,low_memory=False)
print('The shape of the dataset is:', data.shape)

The shape of the dataset is: (3416238, 24)


In [15]:
print(data.columns)

Index(['raw_row_number', 'date', 'location', 'county_name', 'subject_age',
       'subject_race', 'subject_sex', 'type', 'arrest_made', 'citation_issued',
       'contraband_weapons', 'contraband_alcohol', 'contraband_other',
       'frisk_performed', 'search_conducted', 'search_basis',
       'reason_for_stop', 'vehicle_type', 'vehicle_registration_state',
       'raw_Race'],
      dtype='object')


In [16]:
#numerical features
numerical_features = ['subject_age']

#categorical features
categorical_features = ['subject_sex', 'type', 'arrest_made', 'citation_issued', 'warning_issued',
                       'outcome', 'contraband_found', 'contraband_drugs', 'contraband_weapons', 'contraband_alcohol',
                       'contraband_other', 'frisk_performed', 'search_conducted', 'search_basis', 'reason_for_stop',
                       'vehicle_type', 'vehicle_registration_state', 'raw_Race']

#text features
text_features = ['location', 'county_name']

model_features = numerical_features + categorical_features + text_features
model_target = 'subject_race'

print('Model Features:', model_features)
print('Model Target:', model_target)

Model Target: subject_race


In [17]:
data[model_target].value_counts()

white                     2543612
black                      353548
hispanic                   340271
asian/pacific islander     167735
other                       11072
Name: subject_race, dtype: int64

In [20]:
data[categorical_features + text_features] = data[categorical_features + text_features].astype('str')

In [21]:
from sklearn.model_selection import train_test_split

dataset, test_data = train_test_split(data, test_size=0.1, shuffle=True, random_state=23)

In [22]:
train_data, val_data = train_test_split(dataset, test_size=0.7, shuffle=True, random_state=23)

### Data Pipeline

In [24]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier

numerical_processor = Pipeline([
    ('num_scaler', MinMaxScaler())
])

categorical_processor = Pipeline([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'))
])

text_processor0 = Pipeline([
    ('text_vect0', CountVectorizer(binary=True, max_features=50))
])

text_processor1 = Pipeline([
    ('text_vect1', CountVectorizer(binary=True, max_features=50))
])

data_preprocessor = ColumnTransformer([
    ('numeric', numerical_processor, numerical_features),
    ('categoric', categorical_processor, categorical_features),
    ('text_pro0', text_processor0, text_features[0]),
    ('text_pro1', text_processor1, text_features[1])
])

pipeline = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('dt', DecisionTreeClassifier())
])

from sklearn import set_config
set_config(display='diagram')
pipeline

### Training the Model

In [25]:
X_train = train_data[model_features]
y_train = train_data[model_target]

pipeline.fit(X_train, y_train)

In [28]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

train_predictions = pipeline.predict(X_train)

print(confusion_matrix(y_train, train_predictions))
print(classification_report(y_train, train_predictions))
print('Accuracy Score:', accuracy_score(y_train, train_predictions))

[[ 45231      2      0      0    138]
 [     5  95279      0      0    265]
 [     6      9  91700      0    271]
 [     0      0      0   2995      7]
 [    20     59     41      2 686354]]
                        precision    recall  f1-score   support

asian/pacific islander       1.00      1.00      1.00     45371
                 black       1.00      1.00      1.00     95549
              hispanic       1.00      1.00      1.00     91986
                 other       1.00      1.00      1.00      3002
                 white       1.00      1.00      1.00    686476

              accuracy                           1.00    922384
             macro avg       1.00      1.00      1.00    922384
          weighted avg       1.00      1.00      1.00    922384

Accuracy Score: 0.9991055785876598


In [29]:
X_val = val_data[model_features]
y_val = val_data[model_target]

val_predictions = pipeline.predict(X_val)

print(confusion_matrix(y_val, val_predictions))
print(classification_report(y_val, val_predictions))
print('Accuracy Score:', accuracy_score(y_val, val_predictions))

[[ 104960      39      27       1     485]
 [     34  221623      77       1    1033]
 [     38      88  213383       2    1045]
 [      2       1       1    6819      41]
 [    254     532     521      15 1601208]]
                        precision    recall  f1-score   support

asian/pacific islander       1.00      0.99      1.00    105512
                 black       1.00      0.99      1.00    222768
              hispanic       1.00      0.99      1.00    214556
                 other       1.00      0.99      1.00      6864
                 white       1.00      1.00      1.00   1602530

              accuracy                           1.00   2152230
             macro avg       1.00      1.00      1.00   2152230
          weighted avg       1.00      1.00      1.00   2152230

Accuracy Score: 0.9980313442336554


In [36]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

param_grid = {'dt_max_depth': [10, 20, 30],
              'dt_min_samples_leaf':[1, 2, 5],
              'dt_min_samples_split':[10, 20, 30]
             }

grid_search = GridSearchCV(pipeline, 
                           param_grid,
                           cv = 5,
                           verbose = 1,
                           n_jobs = -1)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


ValueError: Invalid parameter dt_max_depth for estimator Pipeline(steps=[('data_preprocessing',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('num_scaler',
                                                                   MinMaxScaler())]),
                                                  ['subject_age']),
                                                 ('categoric',
                                                  Pipeline(steps=[('cat_encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['subject_sex', 'type',
                                                   'arrest_made',
                                                   'citation_issued',
                                                   'warning_issued', 'outcome',
                                                   'contraband_found',
                                                   'contraband_drugs'...
                                                   'search_conducted',
                                                   'search_basis',
                                                   'reason_for_stop',
                                                   'vehicle_type',
                                                   'vehicle_registration_state',
                                                   'raw_Race']),
                                                 ('text_pro0',
                                                  Pipeline(steps=[('text_vect0',
                                                                   CountVectorizer(binary=True,
                                                                                   max_features=50))]),
                                                  'location'),
                                                 ('text_pro1',
                                                  Pipeline(steps=[('text_vect1',
                                                                   CountVectorizer(binary=True,
                                                                                   max_features=50))]),
                                                  'county_name')])),
                ('dt', DecisionTreeClassifier())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
estimator.get_params().keys()