In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# Folder that contains Train.csv / Test.csv.
# Set the NOVARTIS_DATASET_DIR environment variable to point the notebook at
# another machine's copy of the data; without it the original author's local
# folder is used, so existing behaviour is unchanged.
DEFAULT_DATASET_DIR = 'C:\\Users\\KBINM16696\\Desktop\\Data Science & Python\\Analytics Vidhya Hackathon Practice\\Novartis Hackerearth\\Dataset'
# Joining with '' guarantees exactly one trailing separator, which the later
# cells rely on because they build file names with `path + 'Train.csv'`.
path = os.path.join(os.environ.get('NOVARTIS_DATASET_DIR', DEFAULT_DATASET_DIR), '')

In [3]:
# Read the training and test CSV files from the dataset folder.
train, test = (pd.read_csv(path + csv_name) for csv_name in ('Train.csv', 'Test.csv'))
#samp = pd.read_csv(path + 'sample_submission.csv')

In [4]:
# Report (rows, columns) for the training frame, then the test frame.
for frame in (train, test):
    print(frame.shape)

(23856, 18)
(15903, 17)


In [5]:
# Absolute count of each target class in the training data (class-balance check).
train.MULTIPLE_OFFENSE.value_counts()

1    22788
0     1068
Name: MULTIPLE_OFFENSE, dtype: int64

In [6]:
# Same class distribution expressed as fractions of all training rows.
train.MULTIPLE_OFFENSE.value_counts(normalize=True)

1    0.955231
0    0.044769
Name: MULTIPLE_OFFENSE, dtype: float64

In [7]:
# Convert the DATE column of both frames to pandas datetimes so the .dt
# accessor can be used on it later.
for frame in (train, test):
    frame['DATE'] = pd.to_datetime(frame['DATE'])

In [8]:
# Show the first and last calendar day covered by each data set.
train_days = train.DATE.dt.date
test_days = test.DATE.dt.date
print('training data is present from {} to {}'.format(train_days.min(), train_days.max()))
print('test data is present from {} to {}'.format(test_days.min(), test_days.max()))

training data is present from 1991-01-01 to 2018-12-31
test data is present from 1991-01-02 to 2018-12-28


In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score
from sklearn.metrics import confusion_matrix

### creating time based features

In [10]:
# Stack the training rows on top of the test rows so feature engineering is
# applied to both at once. Columns present in only one frame are filled with
# NaN by concat.
frames_to_stack = [train, test]
data_all = pd.concat(frames_to_stack, axis=0)
data_all.shape

(39759, 18)

In [11]:
# Replace the duplicated per-frame row labels left by concat with a fresh 0..n-1 index.
data_all = data_all.reset_index(drop=True)

In [12]:
def create_date_features(df):
    """Add calendar-derived feature columns computed from ``df['DATE']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime64 ``DATE`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with these columns added:
        ``year``, ``month``, ``day``, ``DayOfWeek`` (Monday=0), ``DayOfYear``,
        ``week`` (ISO week number), ``Quarter``, the six boolean
        ``is_*_start`` / ``is_*_end`` flags, ``Semester`` (1 for Q1-Q2,
        otherwise 2), ``is_weekend`` and ``is_weekday`` (0/1 integers).
    """
    dates = df['DATE'].dt

    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day
    df['DayOfWeek'] = dates.dayofweek
    df['DayOfYear'] = dates.dayofyear

    # `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0.
    # Prefer `isocalendar().week` and only fall back to the old attribute on
    # pandas versions that predate `isocalendar`.
    if hasattr(dates, 'isocalendar'):
        iso_week = dates.isocalendar().week
        # isocalendar() returns a nullable UInt32 column. Cast to a plain numpy
        # dtype: int64 normally, float64 (NaN) when missing dates are present.
        df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    else:
        df['week'] = dates.week

    df['Quarter'] = dates.quarter
    df['is_month_start'] = dates.is_month_start
    df['is_month_end'] = dates.is_month_end
    df['is_quarter_start'] = dates.is_quarter_start
    df['is_quarter_end'] = dates.is_quarter_end
    df['is_year_start'] = dates.is_year_start
    df['is_year_end'] = dates.is_year_end

    # First half of the year (Q1, Q2) -> 1, second half -> 2.
    df['Semester'] = np.where(df['Quarter'].isin([1, 2]), 1, 2)
    # dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are Saturday and Sunday.
    df['is_weekend'] = np.where(df['DayOfWeek'].isin([5, 6]), 1, 0)
    df['is_weekday'] = np.where(df['DayOfWeek'].isin([0, 1, 2, 3, 4]), 1, 0)
    return df

In [13]:
# Add the calendar feature columns derived from DATE to the combined train+test frame.
data_all = create_date_features(data_all)

#### create variables around missing value of X_12

In [16]:
# 1 where X_12 is missing, 0 otherwise. This has to run before X_12 is imputed.
data_all['is_X_12_missing'] = data_all['X_12'].isnull().astype(int)

In [17]:
# Shape check after adding the date features and the X_12 missing-value flag.
print(data_all.shape)

(39759, 35)


In [18]:
### datewise count of multiple offence
# For every row, the sum of MULTIPLE_OFFENSE over all rows sharing its DATE.
# Rows with a missing target are skipped by the sum, so a date whose rows all
# lack a target gets 0.
# transform('sum') writes the per-date total straight back onto the original
# rows, so no merge, no _x/_y suffixed duplicates and no follow-up renames are
# needed, and re-running the cell gives the same frame.
# NOTE(review): this feature is computed from the target itself and includes
# each training row's own label, so it leaks the label into the features
# (on a date with a single training row it equals the label). Training metrics
# will be optimistic; consider an out-of-fold encoding instead.
data_all['cnt_total_offence'] = data_all.groupby('DATE')['MULTIPLE_OFFENSE'].transform('sum')

In [21]:
# Give the per-date offence total its final feature name
# (rename silently does nothing if the source column is absent).
data_all = data_all.rename(columns={'MULTIPLE_OFFENSE_y': 'cnt_total_offence'})

In [22]:
# Restore the plain target column name
# (rename silently does nothing if the source column is absent).
data_all = data_all.rename(columns={'MULTIPLE_OFFENSE_x': 'MULTIPLE_OFFENSE'})

In [23]:
# data_all.groupby('DATE')['MULTIPLE_OFFENSE'].agg(['mean', 'std']).reset_index().sort_values('mean').isnull().sum()

### missing value treatment

In [24]:
# Impute missing X_12 values with the column's most frequent value.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated in pandas 2.x and does not modify the frame under Copy-on-Write.
x12_most_frequent = data_all['X_12'].value_counts().idxmax()
data_all['X_12'] = data_all['X_12'].fillna(x12_most_frequent)

In [25]:
## missing value filled with mode value

In [26]:
# ### transform X_12 columns and group the 4+ values in the same category
# ### transform X_10 columns and group the 4+ values in the same category
# data_all['X_12'] = np.where(data_all['X_12']>4, '4+', data_all['X_12'])
# data_all['X_10'] = np.where(data_all['X_10']>4, '4+', data_all['X_10'])

In [27]:
# List every column of the combined frame after feature engineering.
data_all.columns

Index(['INCIDENT_ID', 'DATE', 'X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7',
       'X_8', 'X_9', 'X_10', 'X_11', 'X_12', 'X_13', 'X_14', 'X_15',
       'MULTIPLE_OFFENSE', 'year', 'month', 'day', 'DayOfWeek', 'DayOfYear',
       'week', 'Quarter', 'is_month_start', 'is_month_end', 'is_quarter_start',
       'is_quarter_end', 'is_year_start', 'is_year_end', 'Semester',
       'is_weekend', 'is_weekday', 'is_X_12_missing', 'cnt_total_offence'],
      dtype='object')

In [30]:
# Split the combined frame back apart: rows with a target value are training
# rows, rows with a missing target are test rows (their target column is dropped).
has_target = data_all['MULTIPLE_OFFENSE'].notnull()
train1 = data_all.loc[has_target].reset_index(drop=True)
test1 = data_all.loc[~has_target].drop('MULTIPLE_OFFENSE', axis=1).reset_index(drop=True)
for frame in (train1, test1):
    print(frame.shape)

(23856, 36)
(15903, 35)


In [32]:
# Remove the identifier and the raw date; neither is used as a model feature.
id_and_date = ['INCIDENT_ID', 'DATE']
train1 = train1.drop(columns=id_and_date)
test1 = test1.drop(columns=id_and_date)

In [33]:
# Preview the first rows of the engineered training frame.
train1.head()

Unnamed: 0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,...,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,Semester,is_weekend,is_weekday,is_X_12_missing,cnt_total_offence
0,0,36,34,2,1,5,6,1,6,1,...,False,False,False,False,False,2,1,0,0,1.0
1,1,37,37,0,0,11,17,1,6,1,...,False,False,False,False,False,2,0,1,0,5.0
2,0,3,2,3,5,1,0,2,3,1,...,False,False,False,False,False,1,0,1,0,2.0
3,0,33,32,2,1,7,1,1,6,1,...,False,False,False,False,False,1,0,1,0,3.0
4,0,33,32,2,1,8,3,0,5,1,...,False,False,False,False,False,1,0,1,0,5.0


In [38]:
# Names of the object- or bool-typed columns, i.e. the ones that need encoding.
train1.select_dtypes(include=['object','bool']).columns.tolist()

['is_month_start',
 'is_month_end',
 'is_quarter_start',
 'is_quarter_end',
 'is_year_start',
 'is_year_end']

In [44]:
# Columns remaining in the training frame after dropping INCIDENT_ID and DATE.
train1.columns

Index(['X_1', 'X_2', 'X_3', 'X_4', 'X_5', 'X_6', 'X_7', 'X_8', 'X_9', 'X_10',
       'X_11', 'X_12', 'X_13', 'X_14', 'X_15', 'MULTIPLE_OFFENSE', 'year',
       'month', 'day', 'DayOfWeek', 'DayOfYear', 'week', 'Quarter',
       'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
       'is_year_start', 'is_year_end', 'Semester', 'is_weekend', 'is_weekday',
       'is_X_12_missing', 'cnt_total_offence'],
      dtype='object')

In [45]:
# Columns to one-hot encode: every object- or bool-typed column in the training frame.
non_numeric = train1.select_dtypes(include=['object', 'bool'])
cols_to_ohe = list(non_numeric.columns)

# Columns to standardise: the raw X_* features plus two engineered ones.
cols_to_stdsclr = [
    'X_1', 'X_2', 'X_3', 'X_4', 'X_5',
    'X_6', 'X_7', 'X_8', 'X_9', 'X_10',
    'X_11', 'X_12', 'X_13', 'X_14', 'X_15',
    'Semester', 'cnt_total_offence',
]

In [46]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# One default-configured instance of each preprocessing transformer:
# ohe -> one-hot encoder, st -> standard scaler, le -> label encoder.
ohe, st, le = OneHotEncoder(), StandardScaler(), LabelEncoder()

In [47]:
# Separate the target from the training features; keep an independent copy of
# the test features so later transforms do not touch test1.
target_col = 'MULTIPLE_OFFENSE'
y = train1[target_col]
X = train1.drop(columns=target_col)
X_test = test1.copy()

In [54]:
# Display the columns selected for one-hot encoding.
cols_to_ohe

['is_month_start',
 'is_month_end',
 'is_quarter_start',
 'is_quarter_end',
 'is_year_start',
 'is_year_end']

In [53]:
# Scratch inspection only: flatten all values of the columns to be encoded
# into a single (n, 1) column. The result is not stored.
X[cols_to_ohe].values.reshape(-1,1)

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

In [56]:
## standard scaler
# One column at a time: fit the scaler on the training values, then apply that
# same fitted scaling to the matching test column.
for feature in cols_to_stdsclr:
    train_values = X[feature].values.reshape(-1, 1)
    test_values = X_test[feature].values.reshape(-1, 1)
    X[feature] = st.fit_transform(train_values)
    X_test[feature] = st.transform(test_values)

In [59]:
## one hot encode
# Fit the encoder on the training columns only and reuse it for the test set.
ohe_train_matrix = ohe.fit_transform(X[cols_to_ohe]).toarray()
ohe_test_matrix = ohe.transform(X_test[cols_to_ohe]).toarray()

# Label each indicator "<source column>_<category>" instead of the default
# integer labels 0..k-1. Integer labels would leave X with mixed int/str column
# names once these frames are concatenated onto it.
ohe_col_names = [
    '{}_{}'.format(source_col, category)
    for source_col, categories in zip(cols_to_ohe, ohe.categories_)
    for category in categories
]

# Reuse the source frames' indexes so the later column-wise concat lines up
# row for row.
ohe_vars = pd.DataFrame(ohe_train_matrix, columns=ohe_col_names, index=X.index)
ohe_vars1 = pd.DataFrame(ohe_test_matrix, columns=ohe_col_names, index=X_test.index)

In [49]:
# Encode the target values as integer class labels 0..n_classes-1.
y = le.fit_transform(y)

In [61]:
## merge the variables created with ohe
# Shapes before attaching the one-hot columns.
for frame in (X, X_test):
    print(frame.shape)

# Append the encoded columns side by side (rows are matched on the index).
X = pd.concat([X, ohe_vars], axis=1)
X_test = pd.concat([X_test, ohe_vars1], axis=1)

# Shapes after attaching the one-hot columns.
for frame in (X, X_test):
    print(frame.shape)

(23856, 33)
(15903, 33)
(23856, 45)
(15903, 45)


In [62]:
# Shape of the encoded test block, then the encoded training block.
for encoded in (ohe_vars1, ohe_vars):
    print(encoded.shape)

(15903, 12)
(23856, 12)


In [66]:
## drop ohe vars
# The original boolean columns are redundant now that their one-hot
# indicators have been appended.
X = X.drop(columns=cols_to_ohe)
X_test = X_test.drop(columns=cols_to_ohe)
for frame in (X, X_test):
    print(frame.shape)

(23856, 39)
(15903, 39)


In [72]:
# Load libraries
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.model_selection import GridSearchCV #for GridSearch

In [77]:
# # Create the parameter grid based on the results of random search 
# param_grid = {
#     'n_estimators': [100,300,500,600,1000],
#     'max_depth': [4,5,6,7,8],
#     'gamma': [0,0.01,0.1,1,10,20],
#     'colsample_bytree': [0.7,0.8,0.9,1],
#     'colsample_bylevel': [0.7,0.8,0.9,1],
# }
# # Create a based model
# xgb = XGBClassifier()
# # Instantiate the grid search model
# grid_search = GridSearchCV(estimator = xgb, param_grid = param_grid, 
#                           cv = 5, n_jobs = -1, verbose = 2)

In [79]:
# # Fit the grid search to the data
# grid_search.fit(X, y)
# grid_search.best_params_

In [95]:
# Gradient-boosted tree classifier with library-default hyperparameters.
# NOTE(review): the value_counts output above shows ~95.5% of training rows in
# class 1; the defaults do not compensate for that imbalance (consider
# scale_pos_weight or a metric other than accuracy). Confirm against the
# competition metric.
xgb1 = XGBClassifier()

In [96]:
# Train on the full training matrix (no hold-out split), then predict class
# labels for the test matrix.
xgb1.fit(X,y)
pred = xgb1.predict(X_test)

In [97]:
# Build the submission table (incident id alongside its predicted label) and
# write it to the dataset folder without the index column.
sub4 = pd.DataFrame({
    'INCIDENT_ID': test.INCIDENT_ID,
    'MULTIPLE_OFFENSE': pred,
})
sub4.to_csv(path + 'xgb1_with_target_encoded_variable.csv', index=False)