In [221]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [129]:
unclean_df = pd.read_excel('sqf-2018.xlsx')

In [130]:
unclean_df = unclean_df.drop(['DEMEANOR_CODE', 'STOP_FRISK_ID', 'STOP_FRISK_DATE', 'Stop Frisk Time', 'YEAR2', 
                              'ISSUING_OFFICER_COMMAND_CODE', 'SUPERVISING_OFFICER_COMMAND_CODE', 'LOCATION_IN_OUT_CODE', 
                              'RECORD_STATUS_CODE', 'JURISDICTION_CODE', 'OFFICER_NOT_EXPLAINED_STOP_DESCRIPTION', 
                              'SUMMONS_OFFENSE_DESCRIPTION', 'DEMEANOR_OF_PERSON_STOPPED', 'SUSPECT_OTHER_DESCRIPTION',
                              'STOP_LOCATION_APARTMENT', 'STOP_LOCATION_ZIP_CODE', 'STOP_LOCATION_FULL_ADDRESS',
                              'STOP_LOCATION_PREMISES_NAME', 'STOP_LOCATION_STREET_NAME', 'STOP_LOCATION_X', 
                              'STOP_LOCATION_Y'], axis = 1)

In [131]:
unclean_df.SUSPECT_REPORTED_AGE.replace('(null)', '-15', inplace = True)
unclean_df.SUSPECT_REPORTED_AGE = unclean_df.SUSPECT_REPORTED_AGE.astype(float)
cut_labels = ['unknown', '0_17', '18_30', 'over_30']
cut_bins = [-16, -1, 17, 30, 100]
unclean_df['age_bin'] = pd.cut(unclean_df.SUSPECT_REPORTED_AGE, bins = cut_bins, labels = cut_labels)
unclean_df.age_bin = unclean_df.age_bin.astype(str)

In [132]:
unclean_df = unclean_df.drop('SUSPECT_REPORTED_AGE', axis = 1)

In [108]:
unclean_df.STOP_LOCATION_PRECINCT = unclean_df.STOP_LOCATION_PRECINCT.astype(str)

In [109]:
unclean_df = unclean_df[unclean_df['SUSPECT_HEIGHT'] != '(null)']
unclean_df.SUSPECT_HEIGHT = unclean_df.SUSPECT_HEIGHT.astype(float)

In [110]:
unclean_df = unclean_df[unclean_df['SUSPECT_WEIGHT'] != '(null)']
unclean_df.SUSPECT_WEIGHT = unclean_df.SUSPECT_WEIGHT.astype(float)

In [133]:
unclean_df.shape

(11008, 62)

In [112]:
# unclean_df.to_csv('cleaned_df.csv')

In [113]:
X.head()

Unnamed: 0,MONTH2,DAY2,STOP_WAS_INITIATED,ISSUING_OFFICER_RANK,SUPERVISING_OFFICER_RANK,SUPERVISING_ACTION_CORRESPONDING_ACTIVITY_LOG_ENTRY_REVIEWED,JURISDICTION_DESCRIPTION,OBSERVED_DURATION_MINUTES,SUSPECTED_CRIME_DESCRIPTION,STOP_DURATION_MINUTES,...,SUSPECT_HEIGHT,SUSPECT_WEIGHT,SUSPECT_BODY_BUILD_TYPE,SUSPECT_EYE_COLOR,SUSPECT_HAIR_COLOR,STOP_LOCATION_PRECINCT,STOP_LOCATION_SECTOR_CODE,STOP_LOCATION_PATROL_BORO_NAME,STOP_LOCATION_BORO_NAME,age_bin
0,January,Monday,Based on C/W on Scene,POM,SGT,Y,PSB,0,MENACING,18,...,5.1,170.0,MED,BRO,BLK,1,G,PBMS,MANHATTAN,unknown
1,January,Monday,Based on Radio Run,POM,SGT,N,PSB,1,CPW,15,...,6.1,250.0,HEA,BRO,BLK,34,C,PBMN,MANHATTAN,18_30
2,January,Monday,Based on Radio Run,POM,SGT,Y,Housing,0,GRAND LARCENY,10,...,5.5,150.0,THN,BRO,BLD,43,B,PBBX,BRONX,over_30
3,January,Monday,Based on Radio Run,POM,SGT,Y,PSB,2,ROBBERY,15,...,5.1,160.0,MED,BRO,BLK,63,B,PBBS,BROOKLYN,over_30
4,January,Monday,Based on Radio Run,POM,SGT,Y,PSB,2,ROBBERY,15,...,5.11,230.0,MED,BRO,BLK,63,B,PBBS,BROOKLYN,over_30


In [114]:
X = unclean_df.drop('SUSPECT_ARRESTED_FLAG', axis = 1)
y = unclean_df.SUSPECT_ARRESTED_FLAG

In [122]:
# X.STOP_LOCATION_PRECINCT.value_counts()
X = X.drop('STOP_LOCATION_PRECINCT', axis = 1)

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 345)

In [124]:
enc = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()
X_train_num = X_train[['OBSERVED_DURATION_MINUTES', 'STOP_DURATION_MINUTES', 'SUSPECT_HEIGHT', 'SUSPECT_WEIGHT']]
X_train_cat = X_train.drop(['OBSERVED_DURATION_MINUTES', 'STOP_DURATION_MINUTES', 'SUSPECT_HEIGHT', 'SUSPECT_WEIGHT'], axis = 1)

In [125]:
X_train_cat.info() # converted age bins and stop location precinct to strings

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8379 entries, 2604 to 3828
Data columns (total 56 columns):
MONTH2                                                          8379 non-null object
DAY2                                                            8379 non-null object
STOP_WAS_INITIATED                                              8379 non-null object
ISSUING_OFFICER_RANK                                            8379 non-null object
SUPERVISING_OFFICER_RANK                                        8379 non-null object
SUPERVISING_ACTION_CORRESPONDING_ACTIVITY_LOG_ENTRY_REVIEWED    8379 non-null object
JURISDICTION_DESCRIPTION                                        8379 non-null object
SUSPECTED_CRIME_DESCRIPTION                                     8379 non-null object
OFFICER_EXPLAINED_STOP_FLAG                                     8379 non-null object
OTHER_PERSON_STOPPED_FLAG                                       8379 non-null object
SUSPECT_ARREST_OFFENSE                  

In [136]:
# X_train_cat.columns
X_train_cat.SUSPECTS_ACTIONS_PROXIMITY_TO_SCENE_FLAG.unique()

array(['(null)', 'Y'], dtype=object)

In [234]:
X_train_enc = enc.fit_transform(X_train_cat)
names = enc.get_feature_names(X_train_cat.columns)

X_train_scale = scaler.fit_transform(X_train_num)

In [87]:
X_train_cat

Unnamed: 0,MONTH2,DAY2,STOP_WAS_INITIATED,ISSUING_OFFICER_RANK,SUPERVISING_OFFICER_RANK,SUPERVISING_ACTION_CORRESPONDING_ACTIVITY_LOG_ENTRY_REVIEWED,JURISDICTION_DESCRIPTION,SUSPECTED_CRIME_DESCRIPTION,OFFICER_EXPLAINED_STOP_FLAG,OTHER_PERSON_STOPPED_FLAG,...,SUSPECT_SEX,SUSPECT_RACE_DESCRIPTION,SUSPECT_BODY_BUILD_TYPE,SUSPECT_EYE_COLOR,SUSPECT_HAIR_COLOR,STOP_LOCATION_PRECINCT,STOP_LOCATION_SECTOR_CODE,STOP_LOCATION_PATROL_BORO_NAME,STOP_LOCATION_BORO_NAME,age_bin
2604,March,Tuesday,Based on Radio Run,POM,SGT,Y,PSB,ROBBERY,Y,N,...,MALE,WHITE HISPANIC,MED,BRO,BLK,104,(null),PBQN,QUEENS,over_30
9472,November,Friday,Based on Self Initiated,POM,LT,Y,(null),CPW,Y,N,...,MALE,BLACK,THN,BRO,BLK,40,A,PBBX,BRONX,18_30
8221,September,Thursday,Based on Self Initiated,POM,LT,N,Housing,CRIMINAL POSSESSION OF MARIHUANA,Y,Y,...,MALE,BLACK HISPANIC,THN,BRO,BLK,43,A,PBBX,BRONX,0_17
9632,November,Wednesday,Based on Self Initiated,POM,SSA,Y,PSB,CRIMINAL POSSESSION OF MARIHUANA,Y,Y,...,MALE,WHITE HISPANIC,THN,BRO,BLK,103,A,PBQS,QUEENS,18_30
9261,October,Friday,Based on Radio Run,POM,SGT,Y,(null),ASSAULT,Y,Y,...,MALE,WHITE HISPANIC,MED,BRO,BLK,106,B,PBQS,QUEENS,over_30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7072,August,Wednesday,Based on Radio Run,POM,LT,Y,(null),ROBBERY,Y,N,...,MALE,BLACK,MED,BRO,BLK,43,C,PBBX,BRONX,18_30
437,January,Friday,Based on Self Initiated,POM,SGT,Y,PSB,ROBBERY,Y,Y,...,MALE,WHITE HISPANIC,THN,BRO,BLK,32,C,PBMN,MANHATTAN,18_30
2390,March,Wednesday,Based on Self Initiated,POM,LT,Y,PSB,ROBBERY,Y,N,...,MALE,BLACK,THN,BRO,BLK,28,B,PBMN,MANHATTAN,over_30
7113,August,Thursday,Based on C/W on Scene,POM,SGT,N,(null),THEFT OF SERVICES,Y,Y,...,MALE,BLACK,THN,BLK,BLK,75,D,PBBN,BROOKLYN,18_30


## Second Try 
Rather than try and use what is giving me an error.  I am going to import the csv.

In [236]:
df = pd.read_csv('cleaned_df.csv', index_col = 0)

In [141]:
df.head()

Unnamed: 0,MONTH2,DAY2,STOP_WAS_INITIATED,ISSUING_OFFICER_RANK,SUPERVISING_OFFICER_RANK,SUPERVISING_ACTION_CORRESPONDING_ACTIVITY_LOG_ENTRY_REVIEWED,JURISDICTION_DESCRIPTION,OBSERVED_DURATION_MINUTES,SUSPECTED_CRIME_DESCRIPTION,STOP_DURATION_MINUTES,...,SUSPECT_HEIGHT,SUSPECT_WEIGHT,SUSPECT_BODY_BUILD_TYPE,SUSPECT_EYE_COLOR,SUSPECT_HAIR_COLOR,STOP_LOCATION_PRECINCT,STOP_LOCATION_SECTOR_CODE,STOP_LOCATION_PATROL_BORO_NAME,STOP_LOCATION_BORO_NAME,age_bin
0,January,Monday,Based on C/W on Scene,POM,SGT,Y,PSB,0,MENACING,18,...,5.1,170.0,MED,BRO,BLK,1,G,PBMS,MANHATTAN,unknown
1,January,Monday,Based on Radio Run,POM,SGT,N,PSB,1,CPW,15,...,6.1,250.0,HEA,BRO,BLK,34,C,PBMN,MANHATTAN,18_30
2,January,Monday,Based on Radio Run,POM,SGT,Y,Housing,0,GRAND LARCENY,10,...,5.5,150.0,THN,BRO,BLD,43,B,PBBX,BRONX,over_30
3,January,Monday,Based on Radio Run,POM,SGT,Y,PSB,2,ROBBERY,15,...,5.1,160.0,MED,BRO,BLK,63,B,PBBS,BROOKLYN,over_30
4,January,Monday,Based on Radio Run,POM,SGT,Y,PSB,2,ROBBERY,15,...,5.11,230.0,MED,BRO,BLK,63,B,PBBS,BROOKLYN,over_30


In [261]:
X = df.drop(['SUSPECT_ARRESTED_FLAG', 'SUSPECT_ARREST_OFFENSE', 'SEARCH_BASIS_INCIDENTAL_TO_ARREST_FLAG'], axis = 1)
y = df.SUSPECT_ARRESTED_FLAG

In [335]:
X.head()
X.shape

(10474, 59)

In [263]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 345)

In [264]:
enc = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()
X_train_num = X_train[['OBSERVED_DURATION_MINUTES', 'STOP_DURATION_MINUTES', 'SUSPECT_HEIGHT', 'SUSPECT_WEIGHT']]
X_train_cat = X_train.drop(['OBSERVED_DURATION_MINUTES', 'STOP_DURATION_MINUTES', 'SUSPECT_HEIGHT', 'SUSPECT_WEIGHT'], axis = 1)

In [265]:
# X_train_cat.info()

In [266]:
X_train_enc = enc.fit_transform(X_train_cat).toarray()
names = enc.get_feature_names(X_train_cat.columns)

X_train_scale = scaler.fit_transform(X_train_num)

In [267]:
X_train_enc_df = pd.DataFrame(X_train_enc, columns = names)
X_train_scale_df = pd.DataFrame(X_train_scale, columns = X_train_num.columns)

In [268]:
X_train_new = pd.concat([X_train_enc_df, X_train_scale_df], axis = 1)

In [269]:
X_train_new.head()

Unnamed: 0,MONTH2_April,MONTH2_August,MONTH2_December,MONTH2_February,MONTH2_January,MONTH2_July,MONTH2_June,MONTH2_March,MONTH2_May,MONTH2_November,...,STOP_LOCATION_BORO_NAME_QUEENS,STOP_LOCATION_BORO_NAME_STATEN ISLAND,age_bin_0_17,age_bin_18_30,age_bin_over_30,age_bin_unknown,OBSERVED_DURATION_MINUTES,STOP_DURATION_MINUTES,SUSPECT_HEIGHT,SUSPECT_WEIGHT
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,-0.019219,0.535113,0.413704,0.040105
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.020987,-0.596034,0.154967,-0.703404
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.020987,-0.081876,0.413704,-0.554702
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,-0.018335,-0.081876,0.413704,-0.554702
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,-0.020987,-0.338955,-0.362507,-0.554702


In [270]:
X_test_num = X_test[['OBSERVED_DURATION_MINUTES', 'STOP_DURATION_MINUTES', 'SUSPECT_HEIGHT', 'SUSPECT_WEIGHT']]
X_test_cat = X_test.drop(['OBSERVED_DURATION_MINUTES', 'STOP_DURATION_MINUTES', 'SUSPECT_HEIGHT', 'SUSPECT_WEIGHT'], axis = 1)

In [271]:
X_test_enc = enc.transform(X_test_cat).toarray()
X_test_scale = scaler.transform(X_test_num)

In [272]:
X_test_enc_df = pd.DataFrame(X_test_enc, columns = names)
X_test_scale_df = pd.DataFrame(X_test_scale, columns = X_test_num.columns)

In [273]:
X_test_new = pd.concat([X_test_enc_df, X_test_scale_df], axis = 1)

In [274]:
X_test_new.head()

Unnamed: 0,MONTH2_April,MONTH2_August,MONTH2_December,MONTH2_February,MONTH2_January,MONTH2_July,MONTH2_June,MONTH2_March,MONTH2_May,MONTH2_November,...,STOP_LOCATION_BORO_NAME_QUEENS,STOP_LOCATION_BORO_NAME_STATEN ISLAND,age_bin_0_17,age_bin_18_30,age_bin_over_30,age_bin_unknown,OBSERVED_DURATION_MINUTES,STOP_DURATION_MINUTES,SUSPECT_HEIGHT,SUSPECT_WEIGHT
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.020987,-0.493202,0.672441,-0.257299
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.017451,-0.338955,-0.10377,0.337508
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.017451,-0.081876,0.154967,0.040105
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.019219,-0.081876,0.931178,0.932315
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.020987,-0.184708,-0.10377,-0.852106


## Random Forest 

In [275]:
rf_clf = RandomForestClassifier()
rf_param_grid = {'n_estimators': [10, 30, 70, 100],
                'criterion': ['gini', 'entropy'],
                'max_depth': [1, 5, 10],
                'min_samples_split': [3, 10, 15],
                'min_samples_leaf': [3, 10, 15]}

In [276]:
rf_grid_search = GridSearchCV(rf_clf, rf_param_grid, cv = 5)
rf_grid_search.fit(X_train_new.values, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [277]:
rf_grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 3,
 'min_samples_split': 3,
 'n_estimators': 100}

In [278]:
rf_param_grid_2 = {'n_estimators': [100, 120, 140],
                'criterion': ['gini'],
                'max_depth': [10, 15, 20],
                'min_samples_split': [2, 3, 4],
                'min_samples_leaf': [2, 3, 4]}

In [279]:
rf_grid_search = GridSearchCV(rf_clf, rf_param_grid_2, cv = 5)
rf_grid_search.fit(X_train_new.values, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [280]:
rf_grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 20,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 120}

In [281]:
rf_param_grid_3 = {'n_estimators': [115, 120, 125],
                'criterion': ['gini'],
                'max_depth': [19, 20, 21],
                'min_samples_split': [4, 5, 6, 7],
                'min_samples_leaf': [2]}

In [282]:
rf_grid_search = GridSearchCV(rf_clf, rf_param_grid_3, cv = 5)
rf_grid_search.fit(X_train_new.values, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [283]:
rf_grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 21,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 115}

In [284]:
rf_param_grid_4 = {'n_estimators': [113, 114, 115, 116, 117],
                'criterion': ['gini'],
                'max_depth': [21, 22, 23, 24, 25],
                'min_samples_split': [4],
                'min_samples_leaf': [2]}

In [285]:
rf_grid_search = GridSearchCV(rf_clf, rf_param_grid_4, cv = 5)
rf_grid_search.fit(X_train_new.values, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [286]:
rf_grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 25,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 115}

In [287]:
rf_param_grid_5 = {'n_estimators': [115],
                'criterion': ['gini'],
                'max_depth': [25, 26, 27, 28, 29],
                'min_samples_split': [4],
                'min_samples_leaf': [2]}

In [288]:
rf_grid_search = GridSearchCV(rf_clf, rf_param_grid_5, cv = 5)
rf_grid_search.fit(X_train_new.values, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [289]:
rf_grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 25,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 115}

In [209]:
# rf_param_grid_6 = {'n_estimators': [104, 105, 106, 107, 108, 109],
#                 'criterion': ['gini'],
#                 'max_depth': [15],
#                 'min_samples_split': [2],
#                 'min_samples_leaf': [2]}

In [290]:
# rf_grid_search = GridSearchCV(rf_clf, rf_param_grid_6, cv = 5)
# rf_grid_search.fit(X_train_new.values, y_train)

In [211]:
# rf_grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 15,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 108}

In [291]:
final_rf = RandomForestClassifier(criterion = 'gini', max_depth = 25, min_samples_leaf = 2, 
                                  min_samples_split = 4, n_estimators = 115)

In [292]:
final_rf.fit(X_train_new, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=25, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=115,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [301]:
pred_train = final_rf.predict(X_train_new)
print(confusion_matrix(y_train, pred_train))
print(classification_report(y_train, pred_train))

[[5873   72]
 [ 440 1994]]
              precision    recall  f1-score   support

           N       0.93      0.99      0.96      5945
           Y       0.97      0.82      0.89      2434

    accuracy                           0.94      8379
   macro avg       0.95      0.90      0.92      8379
weighted avg       0.94      0.94      0.94      8379



In [293]:
pred = final_rf.predict(X_test_new)

In [337]:
print(confusion_matrix(y_test, pred))
print((classification_report(y_test, pred)))
# print(accuracy_score(y_test, pred))

[[1455   49]
 [ 190  401]]
              precision    recall  f1-score   support

           N       0.88      0.97      0.92      1504
           Y       0.89      0.68      0.77       591

    accuracy                           0.89      2095
   macro avg       0.89      0.82      0.85      2095
weighted avg       0.89      0.89      0.88      2095



## Where there were problems (before fixing them)

In [259]:
pd.DataFrame([X_train_new.columns, final_rf.feature_importances_])
tup = list(zip(X_train_new.columns, final_rf.feature_importances_))

In [260]:
sorted(tup, key=lambda x: x[1], reverse=True)

[('SUSPECT_ARREST_OFFENSE_(null)', 0.3415588609258147),
 ('SEARCH_BASIS_INCIDENTAL_TO_ARREST_FLAG_Y', 0.13969766628754968),
 ('SEARCH_BASIS_INCIDENTAL_TO_ARREST_FLAG_(null)', 0.07583738578343616),
 ('SEARCHED_FLAG_Y', 0.05826291879799349),
 ('SEARCHED_FLAG_N', 0.05583826476289579),
 ('SUSPECT_ARREST_OFFENSE_ROBBERY', 0.03518327550907764),
 ('SUSPECT_ARREST_OFFENSE_CPW', 0.03095686654373609),
 ('SUSPECT_ARREST_OFFENSE_ASSAULT', 0.023827616897578462),
 ('SUSPECT_ARREST_OFFENSE_PETIT LARCENY', 0.02181017793653937),
 ('OTHER_CONTRABAND_FLAG_Y', 0.01537912366354247),
 ('SUSPECT_ARREST_OFFENSE_OTHER', 0.0143391483008295),
 ('SUSPECT_ARREST_OFFENSE_CRIMINAL TRESPASS', 0.013800837396248943),
 ('OTHER_CONTRABAND_FLAG_N', 0.011589431304800543),
 ('WEAPON_FOUND_FLAG_N', 0.011126038541071711),
 ('WEAPON_FOUND_FLAG_Y', 0.008200073197293422),
 ('FIREARM_FLAG_(null)', 0.007467821570562883),
 ('SUSPECT_ARREST_OFFENSE_BURGLARY', 0.006542519782790947),
 ('SUSPECT_ARREST_OFFENSE_GRAND LARCENY', 0.0057538

## XGBoost 

In [329]:
xgb_clf = XGBClassifier()
param_grid = {'learning_rate': [0.01],
             'max_depth': [2, 5, 10],
             'min_child_weight': [1, 5, 10],
             'subsample': [0.5, 0.7],
             'n_estimators': [10, 50, 100]}

In [330]:
grid_clf = GridSearchCV(xgb_clf, param_grid, cv = 5)
grid_clf.fit(X_train_new, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'learning_rate': [0.01], 'max_depth': [2, 5, 10],
                         'min_child_weight': [1, 5, 10],
                         'n_estimat

In [331]:
grid_clf.best_params_

{'learning_rate': 0.01,
 'max_depth': 10,
 'min_child_weight': 1,
 'n_estimators': 50,
 'subsample': 0.7}

In [333]:
final_xgb = XGBClassifier(learning_rate = 0.01, max_depth = 10, min_child_weight = 1, n_estimators = 50, subsample = 0.7)
final_xgb.fit(X_train_new, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=50, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.7, verbosity=1)

In [334]:
xgb_train_pred = final_xgb.predict(X_train_new)
print(confusion_matrix(y_train, xgb_train_pred))
print(classification_report(y_train, xgb_train_pred))

[[5797  148]
 [ 575 1859]]
              precision    recall  f1-score   support

           N       0.91      0.98      0.94      5945
           Y       0.93      0.76      0.84      2434

    accuracy                           0.91      8379
   macro avg       0.92      0.87      0.89      8379
weighted avg       0.91      0.91      0.91      8379



## Support Vector Machine

In [303]:
svc = SVC(kernel = 'rbf', gamma = 'auto', max_iter = 10000)
param_grid = {'C': [0.01, 0.05, 0.1, 0.15]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [0.01, 0.05, 0.1, 0.15]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [305]:
svc_grid.best_params_

{'C': 0.15}

In [306]:
param_grid_2 = {'C': [0.15, .4, .7, 1]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_2)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [0.15, 0.4, 0.7, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [307]:
svc_grid.best_params_

{'C': 1}

In [308]:
param_grid_3 = {'C': [1, 5, 10, 15]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_2)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [0.15, 0.4, 0.7, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [309]:
svc_grid.best_params_

{'C': 1}

In [310]:
param_grid_4 = {'C': [.8, .9, 1, 1.1, 1.2]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_4)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [0.8, 0.9, 1, 1.1, 1.2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [311]:
svc_grid.best_params_

{'C': 1.2}

In [312]:
param_grid_5 = {'C': [1.9, 1.2, 1.3, 1.4]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_5)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [1.9, 1.2, 1.3, 1.4]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [313]:
svc_grid.best_params_

{'C': 1.9}

In [314]:
param_grid_6 = {'C': [1.7, 1.8, 1.9, 2, 2.5]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_6)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [1.7, 1.8, 1.9, 2, 2.5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [315]:
svc_grid.best_params_

{'C': 2.5}

In [316]:
param_grid_7 = {'C': [2.4, 2.5, 2.6, 2.7]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_7)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [2.4, 2.5, 2.6, 2.7]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [317]:
svc_grid.best_params_

{'C': 2.7}

In [318]:
param_grid_8 = {'C': [2.7, 2.8, 2.9, 3]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_8)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [2.7, 2.8, 2.9, 3]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [319]:
svc_grid.best_params_

{'C': 3}

In [320]:
param_grid_9 = {'C': [3, 3.2, 3.5, 4]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_9)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None, param_grid={'C': [3, 3.2, 3.5, 4]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [321]:
svc_grid.best_params_

{'C': 4}

In [322]:
param_grid_10 = {'C': [3.6, 3.7, 3.8, 3.9, 4, 4.1, 4.2]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_10)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [3.6, 3.7, 3.8, 3.9, 4, 4.1, 4.2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [323]:
svc_grid.best_params_

{'C': 4.2}

In [324]:
param_grid_11 = {'C': [4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5]}
svc_grid = GridSearchCV(estimator = svc, param_grid = param_grid_11)
svc_grid.fit(X_train_new, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='rbf', max_iter=10000,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [325]:
svc_grid.best_params_

{'C': 4.5}

In [326]:
final_svc = SVC(C = 4.5, kernel = 'rbf', gamma = 'auto', max_iter = 10000)
final_svc.fit(X_train_new, y_train)
predict_train = final_svc.predict(X_train_new)
pred = final_svc.predict(X_test_new)



In [327]:
print(confusion_matrix(y_train, predict_train))
print(classification_report(y_train, predict_train))

[[5746  199]
 [ 741 1693]]
              precision    recall  f1-score   support

           N       0.89      0.97      0.92      5945
           Y       0.89      0.70      0.78      2434

    accuracy                           0.89      8379
   macro avg       0.89      0.83      0.85      8379
weighted avg       0.89      0.89      0.88      8379



In [328]:
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[1455   49]
 [ 190  401]]
              precision    recall  f1-score   support

           N       0.88      0.97      0.92      1504
           Y       0.89      0.68      0.77       591

    accuracy                           0.89      2095
   macro avg       0.89      0.82      0.85      2095
weighted avg       0.89      0.89      0.88      2095

