#### Import all necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


from sklearn import preprocessing

# Silence all library warnings so they do not clutter the cell outputs.
warnings.filterwarnings("ignore")

# Lift pandas' display limits so that frames are rendered without truncation.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Apply seaborn's default styling to subsequent plots.
sns.set()

#### Import both the train and test datasets

In [2]:
# Read the training and test sets from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("Train.csv", "Test.csv"))


In [3]:
# Preview the first five rows of the training set.
train_df.head(n=5)

Unnamed: 0,INCIDENT_ID,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15,MULTIPLE_OFFENSE
0,CR_102659,4-Jul-04,0,36,34,2,1,5,6,1,6,1,174,1.0,92,29,36,0
1,CR_189752,18-Jul-17,1,37,37,0,0,11,17,1,6,1,236,1.0,103,142,34,1
2,CR_184637,15-Mar-17,0,3,2,3,5,1,0,2,3,1,174,1.0,110,93,34,1
3,CR_139071,13-Feb-09,0,33,32,2,1,7,1,1,6,1,249,1.0,72,29,34,1
4,CR_109335,13-Apr-05,0,33,32,2,1,8,3,0,5,1,174,0.0,112,29,43,1


In [4]:
# Preview the first five rows of the test set.
test_df.head(n=5)

Unnamed: 0,INCIDENT_ID,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,CR_195453,01-FEB-18,0,30,35,7,3,6,4,0,5,1,174,,72,119,23
1,CR_103520,05-MAR-04,0,44,44,1,3,7,1,4,6,1,316,0.0,12,29,34
2,CR_196089,27-JAN-18,0,34,33,3,5,2,7,3,0,1,316,1.0,72,0,34
3,CR_112195,18-AUG-06,7,3,2,3,5,9,8,0,5,1,174,1.0,112,87,34
4,CR_149832,31-OCT-11,0,7,8,7,3,2,7,1,5,1,174,0.0,112,93,43


In [5]:
# (rows, columns) of the training set.
train_df.shape

(23856, 18)

In [6]:
# (rows, columns) of the test set.
test_df.shape

(15903, 17)

In [7]:
# Percentage of missing values in each column of the training set.
train_df.isna().mean() * 100

INCIDENT_ID         0.000000
DATE                0.000000
X_1                 0.000000
X_2                 0.000000
X_3                 0.000000
X_4                 0.000000
X_5                 0.000000
X_6                 0.000000
X_7                 0.000000
X_8                 0.000000
X_9                 0.000000
X_10                0.000000
X_11                0.000000
X_12                0.762911
X_13                0.000000
X_14                0.000000
X_15                0.000000
MULTIPLE_OFFENSE    0.000000
dtype: float64

In [8]:
# Percentage of missing values in each column of the test set.
test_df.isna().mean() * 100

INCIDENT_ID    0.000000
DATE           0.000000
X_1            0.000000
X_2            0.000000
X_3            0.000000
X_4            0.000000
X_5            0.000000
X_6            0.000000
X_7            0.000000
X_8            0.000000
X_9            0.000000
X_10           0.000000
X_11           0.000000
X_12           0.798591
X_13           0.000000
X_14           0.000000
X_15           0.000000
dtype: float64

#### Check distribution of the variable that has missing value (X_12)

In [9]:
# Summary statistics of X_12 in the training set.
train_df.loc[:, "X_12"].describe()

count    23674.000000
mean         0.974064
std          1.167725
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         90.000000
Name: X_12, dtype: float64

In [10]:
# Summary statistics of X_12 in the test set.
test_df.loc[:, "X_12"].describe()

count    15776.000000
mean         0.972236
std          0.876669
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max         40.000000
Name: X_12, dtype: float64

In [11]:
# Impute missing X_12 values with the most frequent value observed in the
# TRAINING data. The same training-derived value is applied to the test set,
# so both frames are imputed consistently and no test-set statistic feeds
# into the preprocessing.
x12_mode = train_df["X_12"].mode()[0]
train_df["X_12"] = train_df["X_12"].fillna(x12_mode)
test_df["X_12"] = test_df["X_12"].fillna(x12_mode)

In [12]:
# Sanity check: count of remaining nulls per column in the training set.
train_df.isna().sum()

INCIDENT_ID         0
DATE                0
X_1                 0
X_2                 0
X_3                 0
X_4                 0
X_5                 0
X_6                 0
X_7                 0
X_8                 0
X_9                 0
X_10                0
X_11                0
X_12                0
X_13                0
X_14                0
X_15                0
MULTIPLE_OFFENSE    0
dtype: int64

In [13]:
# Sanity check: count of remaining nulls per column in the test set.
test_df.isna().sum()

INCIDENT_ID    0
DATE           0
X_1            0
X_2            0
X_3            0
X_4            0
X_5            0
X_6            0
X_7            0
X_8            0
X_9            0
X_10           0
X_11           0
X_12           0
X_13           0
X_14           0
X_15           0
dtype: int64

In [14]:
# Convert the X_12 column from float to integer.
# A vectorised astype replaces the per-element Python lambda, and bracket
# assignment is used so the column is always addressed explicitly by name.
train_df["X_12"] = train_df["X_12"].astype("int64")
test_df["X_12"] = test_df["X_12"].astype("int64")

### Extract the X_train, X_test and y_train

In [15]:
# Target vector: the MULTIPLE_OFFENSE label column of the training set.
y_train = train_df["MULTIPLE_OFFENSE"]

# Feature matrices: the identifier and date columns are dropped from both
# frames; the label is additionally dropped from the training features.
X_train = train_df.drop(columns=["INCIDENT_ID", "DATE", "MULTIPLE_OFFENSE"])
X_test = test_df.drop(columns=["INCIDENT_ID", "DATE"])


### Normalization 

In [16]:
# Standardise the features. The scaler is fitted on the training features
# only, and the same fitted transform is then applied to the test features.
# NOTE(review): the grid-search and prediction cells further down are run on
# X_train / X_test, not on these scaled arrays -- confirm that is intended.
scaler = preprocessing.StandardScaler()
norm_X_train = scaler.fit_transform(X_train)
norm_X_test = scaler.transform(X_test)

In [18]:
# Inspect the standardised training features (a NumPy array).
norm_X_train

array([[-0.33602492,  0.73548947,  0.61861178, ...,  0.24505213,
        -1.00872781,  0.30229601],
       [ 0.35856051,  0.80110665,  0.81683078, ...,  0.64365126,
         1.60118818,  0.06382199],
       [-0.33602492, -1.42987743, -1.4957242 , ...,  0.89730526,
         0.46945469,  0.06382199],
       ...,
       [-0.33602492,  0.0137005 ,  0.02395479, ..., -2.72632324,
        -1.00872781, -1.84397015],
       [-0.33602492,  0.932341  ,  0.94897678, ...,  0.96977783,
         0.70042071,  1.13695506],
       [ 4.52607309,  0.73548947,  0.61861178, ...,  0.24505213,
        -1.00872781,  0.06382199]])

In [17]:
# Inspect the standardised test features (a NumPy array).
norm_X_test

array([[-0.33602492,  0.3417864 ,  0.68468478, ..., -0.47967357,
         1.06996634, -1.2477851 ],
       [-0.33602492,  1.2604269 ,  1.27934178, ..., -2.65385067,
        -1.00872781,  0.06382199],
       [-0.33602492,  0.60425511,  0.55253878, ..., -0.47967357,
        -1.67852926,  0.06382199],
       ...,
       [-0.33602492,  0.07931768,  0.15610078, ..., -0.47967357,
        -0.24653996,  0.06382199],
       [-0.33602492, -0.24876821, -0.10819121, ...,  0.24505213,
        -0.24653996,  0.06382199],
       [-0.33602492, -1.36426025, -1.3635782 , ..., -0.47967357,
        -1.00872781,  0.06382199]])

#### Using GridSearchCV for hyperparameter tuning

In [20]:
# Hyperparameter grid: row subsampling and per-tree column subsampling,
# each tried at 0.6, 0.7, 0.8 and 0.9 (16 combinations in total).
parameters = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}

# Base estimator. Its subsample / colsample_bytree values are overridden by
# the grid during the search. `n_jobs` and `random_state` are used instead of
# the deprecated `nthread` and `seed` aliases.
xgb_estimator = XGBClassifier(
    learning_rate=0.1,
    n_estimators=177,
    max_depth=5,
    min_child_weight=1,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=4,
    scale_pos_weight=1,
    random_state=27,
)

# 5-fold cross-validated grid search scored by ROC AUC.
# The `iid` argument is no longer passed: it was removed from GridSearchCV in
# scikit-learn 0.24 and supplying it raises a TypeError. Current versions
# average fold scores without weighting, which is what `iid=False` requested.
Grid_Search_Result = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10,
)

# Fit on the training set.
Grid_Search_Result.fit(X_train, y_train)

# Return the full CV results, the best parameter set and the best mean CV score.
Grid_Search_Result.cv_results_, Grid_Search_Result.best_params_, Grid_Search_Result.best_score_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  4.0min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  4.4min
[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed:  5.6min finished


({'mean_fit_time': array([31.99351587, 27.3818469 , 26.58274593, 21.70592895, 12.60359254,
          9.33759937,  9.22299991, 11.48419971,  9.27000065, 10.69899888,
         10.65820041,  9.2715991 , 12.42820053, 10.31639981, 17.84040322,
         27.91481128]),
  'std_fit_time': array([0.75429889, 3.57369656, 3.05932488, 3.78759236, 2.46622674,
         2.0612105 , 2.07434284, 1.96333975, 1.46998619, 1.65026544,
         1.67748436, 1.29432579, 1.92916755, 2.18395174, 4.07079708,
         3.3943987 ]),
  'mean_score_time': array([0.13019962, 0.17340121, 0.14200058, 0.10699997, 0.05120053,
         0.04440002, 0.0454    , 0.04259996, 0.05100083, 0.05260005,
         0.04579973, 0.03980031, 0.05360031, 0.05740047, 0.07960038,
         0.10779896]),
  'std_score_time': array([0.01558646, 0.01334919, 0.02745983, 0.00987945, 0.00614519,
         0.0033824 , 0.00816312, 0.00427056, 0.00819789, 0.01024932,
         0.00757381, 0.00193916, 0.00808943, 0.00781308, 0.03392096,
         0.023198

#### Generate predictions

In [30]:
# Predict labels for the test features with the estimator that the grid
# search refitted on the full training set using the best parameters.
predictions = Grid_Search_Result.best_estimator_.predict(X_test)

#### Build the submission dataframe and export it to CSV

In [32]:
# Pair each test INCIDENT_ID with its predicted MULTIPLE_OFFENSE value.
submission_df = test_df[['INCIDENT_ID']].assign(MULTIPLE_OFFENSE=predictions)

# Write the submission file without the dataframe index.
submission_df.to_csv('final_submission.csv', index=False)


### End of file