In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

#Load train and test set
# Read the activity tables and the people table; each has a `date` column
# that is parsed into datetimes.
act_train = pd.read_csv('act_train.csv', parse_dates=['date'])
act_test = pd.read_csv('act_test.csv', parse_dates=['date'])
test_id = act_test['activity_id']
ppl = pd.read_csv('people.csv', parse_dates=['date'])

# Left-join the people attributes onto every activity row by `people_id`.
# `on=` and `left_index=True` are mutually exclusive ways of naming the join
# key; current pandas raises MergeError when both are given, so only `on=`
# is passed. Columns present in both tables receive the default `_x`
# (activity) / `_y` (people) suffixes, e.g. `date_x` and `date_y`.
train = act_train.merge(ppl, on='people_id', how='left')
test = act_test.merge(ppl, on='people_id', how='left')
del act_train, act_test, ppl

# Order both frames by person before splitting off the label.
train = train.sort_values(['people_id'], ascending=True)
test = test.sort_values(['people_id'], ascending=True)

# `target` is taken after sorting so it shares the row order (and index)
# of the feature frame built from `train`.
target = train['outcome']
train = train.drop(columns=['people_id', 'activity_id', 'outcome'])

#Train data Preprocess
# Group the columns by dtype. These lists are computed once on the training
# frame and reused as-is when the test frame is preprocessed.
# The previous check `dtypes == ('int64' or 'float')` only ever matched
# int64 because `'int64' or 'float'` evaluates to `'int64'`; select_dtypes
# picks up both integer and float columns as intended.
num_feat = list(train.select_dtypes(include=['int64', 'float64']).columns)
num_data = train[num_feat]
cat_feat = list(train.select_dtypes(include=['object']).columns)
cat_data = train[cat_feat]
bool_feat = list(train.select_dtypes(include=['bool']).columns)
bool_data = train[bool_feat]
date_feat = ['date_x', 'date_y']
date_data = train[date_feat]

# The ten categorical columns with the highest null counts are discarded.
missing_cat_feat = list(
    cat_data.isnull().sum().sort_values(ascending=False).head(10).index
)

# Build new frames instead of mutating a slice of `train` in place:
# `drop(..., inplace=True)` on a slice and `frame[col].fillna(..., inplace=True)`
# are chained assignments that pandas warns about and that do nothing under
# copy-on-write.
cat_data = cat_data.drop(columns=missing_cat_feat).fillna('type 0')

# Each remaining value is expected to be '<word> <integer>' (the fill value
# 'type 0' follows that shape); keep only the integer after the space.
cat_data = cat_data.apply(
    lambda col: col.str.split(' ').str[1].astype(np.int64)
)

# Booleans become 0/1 integers.
bool_data_bin = bool_data.astype(np.int64)
non_cat_data = pd.concat([bool_data_bin, num_data], axis=1)

# Whole days between the two date columns. `.dt.days` reads the day
# component directly instead of parsing the timedelta's string form; the
# int64 cast still fails loudly if either date is missing.
train['time_diff'] = (train['date_x'] - train['date_y']).dt.days.astype(np.int64)

X_full = pd.concat([cat_data, non_cat_data, train['time_diff']], axis=1)
print(X_full.shape)



#Test data Preprocess
# Mirrors the training preprocessing and reuses the column lists derived from
# the training frame (num_feat, cat_feat, bool_feat, date_feat,
# missing_cat_feat) so both feature matrices have the same columns.

# Keep the ids in the (sorted) row order of `test` for the submission file.
test_id = test['activity_id']
test = test.drop(columns=['people_id', 'activity_id'])

num_data_t = test[num_feat]
bool_data_t = test[bool_feat]
date_data_t = test[date_feat]

# New frame rather than in-place edits on a slice of `test`: the in-place
# drop / chained `fillna(inplace=True)` pattern triggers pandas
# chained-assignment warnings and does nothing under copy-on-write.
cat_data_t = test[cat_feat].drop(columns=missing_cat_feat).fillna('type 0')

# Values are expected to be '<word> <integer>'; keep the integer part.
cat_data_t = cat_data_t.apply(
    lambda col: col.str.split(' ').str[1].astype(np.int64)
)

# Booleans become 0/1 integers.
bool_data_bin_t = bool_data_t.astype(np.int64)
non_cat_data_t = pd.concat([bool_data_bin_t, num_data_t], axis=1)

# Whole days between the two date columns, read via `.dt.days` instead of
# parsing the timedelta's string form.
test['time_diff'] = (test['date_x'] - test['date_y']).dt.days.astype(np.int64)

X_full_t = pd.concat([cat_data_t, non_cat_data_t, test['time_diff']], axis=1)
print(X_full_t.shape)

In [None]:
#Dimensionality Reduction
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest on the full training matrix; its feature
# importances drive the selection below.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_full, target)

# Wrap the already-fitted forest (prefit=True) in a selector that uses the
# selector's default importance threshold.
model_rf = SelectFromModel(rf, prefit=True)

# Apply the same column selection to the train and test matrices.
X_full_red = model_rf.transform(X_full)
print('reduced train data shape', X_full_red.shape)

X_full_t_red = model_rf.transform(X_full_t)
print('reduced test data shape', X_full_t_red.shape)

In [None]:
#Modeling

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold , cross_val_score

# Candidate classifiers, all with default hyper-parameters. Every estimator
# is seeded (same seed as the feature-selection forest) so the reported
# scores are reproducible from run to run.
models = [('LR', LogisticRegression(random_state=0)),
          ('RF', RandomForestClassifier(random_state=0)),
          ('AdaBoost', AdaBoostClassifier(random_state=0)),
          ('MLP', MLPClassifier(random_state=0)),
          ('Xgboost_classifier', XGBClassifier(random_state=0))]

# 7 stratified folds without shuffling, shared by every candidate so the
# scores are comparable.
skf = StratifiedKFold(n_splits=7)

# Report mean and standard deviation of the per-fold ROC AUC for each model.
for name, estimator in models:
    this_score = cross_val_score(estimator, X_full_red, target,
                                 scoring='roc_auc', cv=skf, n_jobs=-1)
    print('%s average AUC score is %.3f +/- %.3f' % (name, np.mean(this_score), np.std(this_score)))

In [None]:
#Hyper parameter tuning on XGBClassifier
# GridSearchCV is imported from sklearn.model_selection (the same module the
# cross-validation helpers come from); the legacy sklearn.grid_search module
# no longer exists in current scikit-learn releases.
from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier()

# 4 x 3 x 3 = 36 parameter combinations, each scored by 7-fold ROC AUC.
param_grid_xgb = {
    'n_estimators': [20, 35, 50, 100],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 15],
}
grid_xgb = GridSearchCV(xgb, param_grid_xgb, cv=7, scoring='roc_auc', n_jobs=-1)
grid_xgb.fit(X_full_red, target)

print('GS best score %.3f' % grid_xgb.best_score_)
print('GS best params {}'.format(grid_xgb.best_params_))

In [None]:
#Using best params to predict test data
# With GridSearchCV's default refit=True, best_estimator_ has already been
# refit on the full data passed to grid_xgb.fit (X_full_red, target), so it
# is used directly instead of being fitted a second time on the same data.
xgb_gs = grid_xgb.best_estimator_

# Model selection in this notebook is driven by ROC AUC, which scores how
# well examples are ranked; hard 0/1 labels from predict() throw that
# ranking away. Export the probability of the second class in classes_
# instead (the positive class, assuming `outcome` is coded 0/1 -- confirm).
y_pred = xgb_gs.predict_proba(X_full_t_red)[:, 1]

# test_id and X_full_t_red come from the same frame in the same row order,
# so the positional pairing of ids and predictions is consistent.
solution = pd.DataFrame({'activity_id': test_id, 'outcome': y_pred})
solution.to_csv('Redhat_xgb_gs.csv', index=False)