In [1]:
# Suppress all warnings for the rest of the session: the 'ignore' action is
# installed with no category or module filter, so it applies to every warning.

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Importing the required libraries and packages.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
# Raise pandas' display limits (250 columns, 500 rows, 1000-character width)
# so that wide frames are rendered in full in the cell outputs.
pd.set_option('display.max_columns', 250)
pd.set_option('display.max_rows', 500)
pd.set_option('display.width', 1000)

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.neighbors import KNeighborsClassifier

In [3]:
# Load the dataset from 'test.csv' in the working directory and preview its
# first 10 rows.

df = pd.read_csv('test.csv')
df.head(10)

Unnamed: 0,INCIDENT_ID,DATE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,CR_195453,01-FEB-18,0,30,35,7,3,6,4,0,5,1,174,,72,119,23
1,CR_103520,05-MAR-04,0,44,44,1,3,7,1,4,6,1,316,0.0,12,29,34
2,CR_196089,27-JAN-18,0,34,33,3,5,2,7,3,0,1,316,1.0,72,0,34
3,CR_112195,18-AUG-06,7,3,2,3,5,9,8,0,5,1,174,1.0,112,87,34
4,CR_149832,31-OCT-11,0,7,8,7,3,2,7,1,5,1,174,0.0,112,93,43
5,CR_81654,25-MAR-01,0,47,48,7,3,4,2,1,6,1,0,0.0,34,29,34
6,CR_139009,13-JUL-09,0,33,32,2,1,6,4,0,5,1,174,0.0,103,103,43
7,CR_6108,14-SEP-92,0,21,23,4,1,5,6,0,5,1,249,1.0,92,93,34
8,CR_62283,31-MAR-99,0,36,34,2,1,1,0,0,5,1,174,0.0,92,93,48
9,CR_5710,13-MAR-92,0,16,15,0,0,1,0,3,6,7,316,7.0,72,29,34


In [4]:
# Dimensions of the dataframe as (rows, columns).

df.shape

(15903, 17)

In [5]:
# Per-column missing-value report: absolute count and share of rows (in %),
# each sorted from most to fewest missing values.

null_counts = df.isnull().sum()
n_rows = len(df)

total = null_counts.sort_values(ascending=False)
percentage = (null_counts / n_rows * 100).sort_values(ascending=False)

pd.concat([total, percentage], axis=1, keys=['Total', 'Percentage'])

Unnamed: 0,Total,Percentage
X_12,127,0.798591
X_15,0,0.0
X_6,0,0.0
DATE,0,0.0
X_1,0,0.0
X_2,0,0.0
X_3,0,0.0
X_4,0,0.0
X_5,0,0.0
X_7,0,0.0


In [6]:
# Frequency of each distinct value in X_12 (value_counts skips NaN by default).
df['X_12'].value_counts()

1.0     10530
0.0      3346
2.0      1381
3.0       321
4.0       100
5.0        42
6.0        23
7.0        10
8.0         9
10.0        4
9.0         3
11.0        2
20.0        1
40.0        1
14.0        1
12.0        1
18.0        1
Name: X_12, dtype: int64

In [7]:
# Replace missing X_12 entries with 1.0.
# NOTE(review): 1.0 is hard-coded — presumably chosen as the most frequent
# value of the column; confirm it matches what was used for the training data.
df['X_12']=df['X_12'].fillna(1.0)

In [8]:
# Keep only the 15 feature columns X_1 ... X_15 (INCIDENT_ID and DATE are left
# behind in df) and preview the result.
feature_cols = ['X_' + str(i) for i in range(1, 16)]
data = df[feature_cols]
data.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,0,30,35,7,3,6,4,0,5,1,174,1.0,72,119,23
1,0,44,44,1,3,7,1,4,6,1,316,0.0,12,29,34
2,0,34,33,3,5,2,7,3,0,1,316,1.0,72,0,34
3,7,3,2,3,5,9,8,0,5,1,174,1.0,112,87,34
4,0,7,8,7,3,2,7,1,5,1,174,0.0,112,93,43


In [9]:
# Standardise every feature column with a default StandardScaler (zero mean,
# unit variance per column) and preview the first 10 rows of the resulting
# NumPy array.
# NOTE(review): the scaler is fitted on this file's own rows. If a model was
# trained on features scaled with training-set statistics, that fitted scaler
# should be reused here instead — confirm against the training notebook.
scaler=StandardScaler()
data_scl=scaler.fit_transform(data)
data_scl[:10]

array([[-0.33149102,  0.3465487 ,  0.68916296,  0.91302246,  0.28091895,
        -0.01907214, -0.22379571, -0.67068633,  0.0663745 , -0.28655103,
        -0.35394606,  0.03154371, -0.47981008,  1.07699829, -1.25362   ],
       [-0.33149102,  1.26588593,  1.28412657, -1.10412952,  0.28091895,
         0.20559766, -1.00212125,  2.0481321 ,  0.79384081, -0.28655103,
         1.17121607, -1.11375218, -2.66223712, -0.99497464,  0.06944597],
       [-0.33149102,  0.60921648,  0.55694882, -0.43174553,  1.29960648,
        -0.91775132,  0.55452982,  1.3684275 , -3.57095705, -0.28655103,
         1.17121607,  0.03154371, -0.47981008, -1.66261037,  0.06944597],
       [ 4.62577813, -1.4264588 , -1.49237028, -0.43174553,  1.29960648,
         0.65493724,  0.81397167, -0.67068633,  0.0663745 , -0.28655103,
        -0.35394606,  0.03154371,  0.97514128,  0.3402968 ,  0.06944597],
       [-0.33149102, -1.16379103, -1.09572787,  0.91302246,  0.28091895,
        -0.91775132,  0.55452982,  0.00901828, 

In [10]:
# Wrap the scaled array back into a DataFrame, restoring the feature column
# names. Note that data_scl is rebound here from a NumPy array to a DataFrame.
data_scl=pd.DataFrame(data_scl,columns=data.columns)
data_scl.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,-0.331491,0.346549,0.689163,0.913022,0.280919,-0.019072,-0.223796,-0.670686,0.066374,-0.286551,-0.353946,0.031544,-0.47981,1.076998,-1.25362
1,-0.331491,1.265886,1.284127,-1.10413,0.280919,0.205598,-1.002121,2.048132,0.793841,-0.286551,1.171216,-1.113752,-2.662237,-0.994975,0.069446
2,-0.331491,0.609216,0.556949,-0.431746,1.299606,-0.917751,0.55453,1.368427,-3.570957,-0.286551,1.171216,0.031544,-0.47981,-1.66261,0.069446
3,4.625778,-1.426459,-1.49237,-0.431746,1.299606,0.654937,0.813972,-0.670686,0.066374,-0.286551,-0.353946,0.031544,0.975141,0.340297,0.069446
4,-0.331491,-1.163791,-1.095728,0.913022,0.280919,-0.917751,0.55453,0.009018,0.066374,-0.286551,-0.353946,-1.113752,0.975141,0.478428,1.151954


In [11]:
# Skewness of every scaled feature, sorted from most right-skewed to most
# left-skewed. `skew` is scipy.stats.skew, imported with the other libraries
# at the top of the notebook so all dependencies are declared in one place.
# dropna() is applied per column because skew() would otherwise propagate NaN.

skewed_feats = data_scl.apply(lambda col: skew(col.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skewed Features': skewed_feats})
skewness

Unnamed: 0,Skewed Features
X_10,13.27363
X_8,9.686202
X_12,9.086284
X_1,3.855509
X_6,0.984939
X_7,0.802731
X_14,0.26476
X_4,0.192462
X_5,0.173052
X_3,-0.079028


In [12]:
# Collect the names of the strongly skewed features, split by direction:
#   skewed_feats1 -> skewness above 1
#   skewed_feats2 -> skewness below -1

skewed_feats1 = skewed_feats.loc[skewed_feats.gt(1)].index
skewed_feats2 = skewed_feats.loc[skewed_feats.lt(-1)].index

In [13]:
# Features whose skewness is greater than 1.
skewed_feats1

Index(['X_10', 'X_8', 'X_12', 'X_1'], dtype='object')

In [14]:
# Features whose skewness is less than -1.
skewed_feats2

Index(['X_13', 'X_9', 'X_15'], dtype='object')

In [15]:
# Put the skewed features on a log scale. The standardised values can be
# negative, so a constant shift is added first to keep the argument of the
# logarithm positive: the result is log1p(x + 150) = log(x + 151).
# Caution: data_scl is updated in place, so re-running this cell applies the
# transform a second time.
LOG_SHIFT = 150

for skewed_cols in (skewed_feats1, skewed_feats2):
    data_scl[skewed_cols] = np.log1p(data_scl[skewed_cols] + LOG_SHIFT)

In [16]:
# Preview the scaled features after the log transform of the skewed columns.
data_scl.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15
0,5.015082,0.346549,0.689163,0.913022,0.280919,-0.019072,-0.223796,5.012828,5.017719,5.01538,-0.353946,5.017489,5.014097,1.076998,5.008943
1,5.015082,1.265886,1.284127,-1.10413,0.280919,0.205598,-1.002121,5.030752,5.022523,5.01538,1.171216,5.009877,4.999492,-0.994975,5.01774
2,5.015082,0.609216,0.556949,-0.431746,1.299606,-0.917751,0.55453,5.026301,4.993347,5.01538,1.171216,5.017489,5.014097,-1.66261,5.01774
3,5.047454,-1.426459,-1.49237,-0.431746,1.299606,0.654937,0.813972,5.012828,5.017719,5.01538,-0.353946,5.017489,5.023717,0.340297,5.01774
4,5.015082,-1.163791,-1.095728,0.913022,0.280919,-0.917751,0.55453,5.01734,5.017719,5.01538,-0.353946,5.009877,5.023717,0.478428,5.02488


In [18]:
# Re-attach the INCIDENT_ID column from the raw frame to the transformed
# features (column-wise concat, aligned on the row index) and preview it.
data_test=pd.concat([data_scl,df['INCIDENT_ID']],axis=1)
data_test.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,X_11,X_12,X_13,X_14,X_15,INCIDENT_ID
0,5.015082,0.346549,0.689163,0.913022,0.280919,-0.019072,-0.223796,5.012828,5.017719,5.01538,-0.353946,5.017489,5.014097,1.076998,5.008943,CR_195453
1,5.015082,1.265886,1.284127,-1.10413,0.280919,0.205598,-1.002121,5.030752,5.022523,5.01538,1.171216,5.009877,4.999492,-0.994975,5.01774,CR_103520
2,5.015082,0.609216,0.556949,-0.431746,1.299606,-0.917751,0.55453,5.026301,4.993347,5.01538,1.171216,5.017489,5.014097,-1.66261,5.01774,CR_196089
3,5.047454,-1.426459,-1.49237,-0.431746,1.299606,0.654937,0.813972,5.012828,5.017719,5.01538,-0.353946,5.017489,5.023717,0.340297,5.01774,CR_112195
4,5.015082,-1.163791,-1.095728,0.913022,0.280919,-0.917751,0.55453,5.01734,5.017719,5.01538,-0.353946,5.009877,5.023717,0.478428,5.02488,CR_149832


In [19]:
# Persist the processed features together with INCIDENT_ID to 'test_scld.csv'.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; consider index=False if downstream readers do not expect it —
# confirm with whatever consumes this file.
data_test.to_csv('test_scld.csv')