In [None]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

import time

%matplotlib inline

from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn import model_selection, preprocessing, metrics
import lightgbm as lgb

from sklearn.model_selection import KFold
from tqdm import tqdm
import gc
import datetime
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, roc_curve, auc, roc_auc_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score,precision_recall_curve,roc_curve, recall_score,precision_score
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold,GroupKFold,StratifiedKFold
from sklearn.metrics import mean_squared_error

import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

# Turn off pandas' chained-assignment warning and widen the display limits.
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
pd.options.display.max_rows = 500

In [None]:
# Train/test frames are read as HDF from the 'amex-preproc' input directory;
# the historical user log is read as CSV from the 'amexml' input directory.
Path = '../input/amex-preproc/'

train_df = pd.read_hdf(os.path.join(Path, 'train_targetenc.hdf'))
print('train reading complete')

test_df = pd.read_hdf(os.path.join(Path, 'test_targetenc.hdf'))
print('test reading complete')

log_df = pd.read_csv(os.path.join('../input/amexml/', 'historical_user_logs.csv'))
print('log reading complete')

In [None]:
# Raw train/test CSVs from the 'amexml' input directory.
Path = '../input/amexml/'

train_rawdf = pd.read_csv(os.path.join(Path, 'train.csv'))
print('train reading complete')

test_rawdf = pd.read_csv(os.path.join(Path, 'test.csv'))
print('test reading complete')

In [None]:
# Integer-encode the listed columns of the raw train frame, then reuse the same
# mapping for the raw test frame and, when the column exists there, the log frame.
cols_to_convert = ['product','gender']
for col in cols_to_convert:
    print('col=',col)
    # factorize returns the integer codes plus the unique values (kept as `indexer`)
    train_rawdf[col], indexer = pd.factorize(train_rawdf[col])
    # get_indexer looks each value up in the train uniques; values not found map to -1
    test_rawdf[col] = indexer.get_indexer(test_rawdf[col])
    if col in log_df.columns:
        log_df[col] = indexer.get_indexer(log_df[col])

In [None]:
# log_df.to_hdf('log.hdf',key='data')

In [None]:
# Dimensions and first rows of the train frame.
print(train_df.shape)
train_df.head()

In [None]:
# Dimensions of the test frame, then its row count as a percentage of the train row count.
print(test_df.shape)
print(100 *test_df.shape[0] / train_df.shape[0])
test_df.head()
#the second number printed above is test rows as a percentage of train rows


In [None]:
# print(log_df.shape)
# log_df.head()

In [None]:
targetcol = 'is_click'

# Class balance of the binary target: absolute counts first, then shares.
noof0s = int((train_df[targetcol] == 0).sum())
noof1s = int((train_df[targetcol] == 1).sum())
print('no of 0s:', noof0s)
print('no of 1s:', noof1s)

ratio0s = noof0s / (noof0s + noof1s)
ratio1s = noof1s / (noof0s + noof1s)
print('ratio of 0s:', ratio0s)
print('ratio of 1s:', ratio1s)


In [None]:
# Store the target column as 32-bit integers.
train_df[targetcol] = train_df[targetcol].astype('int32')

In [None]:
def get_opt_cutoff_prec(labels,preds):
    """Find the probability cut-off that maximises F1 on a precision-recall curve.

    Parameters
    ----------
    labels : array-like
        True binary labels.
    preds : array-like
        Predicted scores / probabilities for the positive class.

    Returns
    -------
    tuple
        ``(optimal_threshold, best_f1)``: the threshold with the highest F1
        and the F1 value reached at that threshold.

    Raises
    ------
    ValueError
        Raised by ``np.nanargmax`` when F1 is NaN at every threshold.
    """
    precision, recall, thresholds = precision_recall_curve(labels, preds)
    # precision and recall end with one extra point (precision=1, recall=0) that
    # has no matching entry in `thresholds`. Drop it so that whatever index is
    # selected below is always a valid position in `thresholds`.
    precision = precision[:-1]
    recall = recall[:-1]
    # Named f1_scores (not f1_score) so the sklearn `f1_score` import is not shadowed.
    # Points where precision + recall == 0 become NaN and are skipped by nanargmax.
    f1_scores = 2 * ((precision * recall) / (precision + recall))
    optimal_idx = np.nanargmax(f1_scores)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold, f1_scores[optimal_idx]

def convert_probtolabels(preds,cutoff=0.5):
    """Turn predicted probabilities into 0/1 labels.

    Values strictly greater than ``cutoff`` become 1 and values less than or
    equal to it become 0. The input is left unmodified; a copy cast to ``int``
    is returned.
    """
    above = preds > cutoff
    at_or_below = preds <= cutoff

    labels = preds.copy()
    labels[at_or_below] = 0
    labels[above] = 1
    return labels.astype(int)

In [None]:
# Feature groups taken straight from the input columns.
cat_cols = [
    'product', 'gender', 'user_id', 'campaign_id', 'webpage_id',
    'product_category_1', 'product_category_2', 'user_group_id', 'var_1',
]
other_cols = ['age_level', 'user_depth', 'city_development_index']
date_feats = ['dayofweek', 'month', 'hour']

# All raw features, in the order: categorical, other, date parts.
raw_cols = [*cat_cols, *other_cols, *date_feats]

In [None]:
# LightGBM settings passed to lgb.LGBMClassifier(**params).
params = {
    "objective" : "binary",
    "metric" : "auc",
    'colsample_bytree': 0.9779854557917957,
    'learning_rate': 0.09313896377798163,
    'min_child_samples': 40,
    'num_leaves': 127,
    'reg_alpha': 0.8606375999411122,
    'reg_lambda': 0.27709140233989277,
    'subsample': 0.5847676636997171,
    'subsample_for_bin': 220000,
    'bagging_seed': 2018,
    # Row subsampling is only applied when the bagging frequency is non-zero.
    # The key used to be spelled 'bagging_frequency', which LightGBM does not
    # recognise, so `subsample` above had no effect. 'subsample_freq' is the
    # scikit-learn-style name (alias of 'bagging_freq').
    # NOTE(review): this enables bagging for real, so scores and the best
    # iteration found by earlier validation runs will change.
    'subsample_freq': 1,
    'n_estimators': 1000,
    'random_state':1,
    "num_threads": 4
}

# params = {
#     "objective" : "binary",
#     "metric" : "auc", 
#     "num_leaves" : 50, # 100
#     "min_child_samples" : 150, # 50
#     "learning_rate" : 0.02,
#     "bagging_fraction" : 0.75,
#     "feature_fraction" : 0.7, # 0.7,0.65
#     "bagging_frequency" : 1,
#     "bagging_seed" : 2018,
#     "verbosity" : -1,
#      "lambda_l1": 0.1,
#     "lambda_l2": 0.7,
#     'n_estimators': 1000,
#     'random_state':1,
#         "num_threads": 4
# }

In [None]:
# folds = get_folds(df=train_df, n_splits=5)

In [None]:
# Name of the binary target column (repeats the assignment made in the class-balance cell).
targetcol='is_click'

In [None]:
# Target vector for the full train frame: 1 where the target column equals 1, else 0, as int32.
y_reg = (train_df[targetcol]==1).astype('int32')

In [None]:
# The train frame's DateTime column, kept under its own name.
dates = train_df['DateTime']

In [None]:
# #Date Validation split
# import datetime
# #Take 5th July 2017 as valid start and 6th july as valid end
# date_valid_start = datetime.date(2017, 7, 4) 
# date_valid_end = datetime.date(2017, 7, 5) 

# dates_train_filter = (dates <  date_valid_start) 
# dates_valid_filter = (dates >=  date_valid_start) & (dates <=  date_valid_end) 

# dates_train = dates[dates_train_filter]
# dates_valid  = dates[dates_valid_filter]

# train_df_valid = train_df.loc[dates_valid_filter]
# y_reg_valid = (train_df_valid[targetcol]==1).astype('int32')

# train_df_train = train_df.loc[dates_train_filter]
# y_reg_train = (train_df_train[targetcol]==1).astype('int32')

# print(train_df_train.shape)
# print(train_df_valid.shape)

**Lag Feature Generation**

In [None]:
# from tqdm import tqdm
# import timeit

# def createlagcol(val_shift,tolag_cols,join_cols,
#                        data_shift,data_shift_join,
#                        data):
#     dateblocknumcol = 'day'
#     data_shift[dateblocknumcol] = data_shift[dateblocknumcol] + val_shift

#     lagcolumns = lambda x: '{}_lag_{}'.format(x, val_shift) if x in tolag_cols else x
#     data_shift = data_shift.rename(columns=lagcolumns)
    
#     print('val_shift=',val_shift)
#     print(data_shift.columns)
   
#     # Test shift would get the lags from train shift those exactly matches with the test set month.
#     # Also exclude date_block_num column in test shift
# #     colsfortest = list(train_shift.columns.difference([dateblocknumcol])) 
# #    print("colsfortest=",colsfortest)
# #     test_shift =  train_shift.loc[train_shift[dateblocknumcol]==(last_block+1),colsfortest]
#     #Don't use the shifted date block num beyond maximum block no for train set
#     data_shift = data_shift[data_shift[dateblocknumcol]<=last_block]
#     print('shift.columns=',data_shift.columns)
#     data_shift.set_index(join_cols,inplace=True)
   
#     start = timeit.default_timer()
#     data_shift_join= data_shift_join.join(data_shift,on=join_cols,how='left')
#     stop = timeit.default_timer()
    
#     print("Join complete: Execution Time={}".format(stop - start ))
   
#     del data_shift
# #     del test_shift
#     gc.collect()
    
#     start = timeit.default_timer()
#     lagcolumn_names = ['{}_lag_{}'.format(x, val_shift) for x in tolag_cols ]
#     print(lagcolumn_names)
#     for colname in lagcolumn_names:
#         data[colname] = data_shift_join[colname]
# #         test_data[colname]  = test_shift_join[colname].copy()
#         del data_shift_join[colname]
# #         del test_shift_join[colname]
#         gc.collect()
    
#     stop = timeit.default_timer()
#     print("Train and Test column copy execution time: {}".format(stop - start ))


In [None]:
def gen_datefeats(df):
    """Parse ``DateTime`` and add calendar-part columns to ``df`` in place.

    The ``DateTime`` column is replaced by its parsed datetime version, then
    ``day``, ``dayofweek``, ``month`` and ``hour`` are added from it.
    The same (mutated) frame is returned.
    """
    df['DateTime'] = pd.to_datetime(df['DateTime'],infer_datetime_format=True)

    stamps = df['DateTime'].dt
    for part in ('day', 'dayofweek', 'month', 'hour'):
        df[part] = getattr(stamps, part)
    return df

# Parse the log timestamps and add the day / dayofweek / month / hour columns.
log_df=gen_datefeats(log_df)

In [None]:
# Origin of the 'noofdays' day counter: 28 May 2017 is day 0.
date_log_start = datetime.date(2017, 5, 28) 

In [None]:
def gen_noofdayssincelog(df, start_date=None):
    """Add a ``noofdays`` column: calendar days between ``DateTime`` and a start date.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``DateTime`` column. It is modified in place.
    start_date : datetime.date, optional
        Day that maps to 0. Defaults to the notebook-level ``date_log_start``.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the integer ``noofdays`` column added.
    """
    if start_date is None:
        start_date = date_log_start
    # Midnight of the start day. Because the origin sits exactly on a day
    # boundary, the whole-day component of (timestamp - origin) equals the
    # difference between the two calendar dates, for timestamps before the
    # origin as well. This keeps the computation vectorised instead of building
    # one Python date object per row.
    origin = pd.Timestamp(start_date.year, start_date.month, start_date.day)
    df['noofdays'] = (df['DateTime'] - origin).dt.days
    return df
# Add the 'noofdays' day counter to the log, train and test frames.
log_df=gen_noofdayssincelog(log_df)
print('log df date generation complete')
train_df=gen_noofdayssincelog(train_df)
test_df=gen_noofdayssincelog(test_df)

In [None]:
def createlagcol(val_shift,tolag_cols,join_cols,
                 data_shift,
                 data,
                 max_block=None):
    """Left-join a day-shifted copy of ``data_shift`` onto ``data`` as lag column(s).

    The ``noofdays`` column of ``data_shift`` is moved forward by ``val_shift``,
    so that after merging on ``join_cols`` each row of ``data`` receives the
    value recorded ``val_shift`` days earlier for the same key.

    Parameters
    ----------
    val_shift : int
        Number of days to shift by; also the suffix of the new column names.
    tolag_cols : list of str
        Base names for the lag columns. The trailing ``len(tolag_cols)``
        columns of ``data_shift`` are renamed, by position, to
        ``<name>_lag_<val_shift>``.
    join_cols : list of str
        Merge keys (expected to include ``'noofdays'`` for the shift to matter).
    data_shift : pandas.DataFrame
        Source of the values to lag; key columns first, value columns last.
        It is not modified.
    data : pandas.DataFrame
        Frame that receives the lag columns.
    max_block : int, optional
        Largest shifted ``noofdays`` value to keep. Defaults to the
        notebook-level ``last_block``.

    Returns
    -------
    pandas.DataFrame
        ``data`` left-merged with the shifted, renamed lag source.
    """
    dateblocknumcol = 'noofdays'
    if max_block is None:
        max_block = last_block

    data_shift_proc = data_shift.copy()
    data_shift_proc[dateblocknumcol] = data_shift[dateblocknumcol] + val_shift
    print('bef datashift shape:',data_shift_proc.shape)

    # Rows shifted beyond the last day of interest can never match anything.
    data_shift_proc = data_shift_proc[data_shift_proc[dateblocknumcol]<=max_block]

    # Rename the trailing value columns by position. The number renamed follows
    # len(tolag_cols), so more than one lag column can be handled.
    curcols = list(data_shift_proc.columns)
    lag_colnames = [col+'_lag_'+str(val_shift) for col in tolag_cols]
    n_key_cols = len(curcols) - len(lag_colnames)
    data_shift_proc.columns = curcols[0:n_key_cols] + lag_colnames
    print(data_shift_proc.columns)

    print('val_shift=',val_shift)
    print('aft datashift shape:',data_shift_proc.shape)

    start = time.time()
    data = pd.merge(data, data_shift_proc, how='left', on=join_cols)
    stop = time.time()

    print("Join complete: Execution Time={}".format(stop - start ))

    # Free the shifted copy before the next (potentially large) merge.
    del data_shift_proc
    gc.collect()
    return data


In [None]:
# Stack train and test into a single frame. 'istrain' records where each row
# came from, and test rows receive a NaN placeholder in the target column.
train_df['istrain'] = 1
test_df['istrain'] = 0
test_df[targetcol] = np.nan
combined_data = pd.concat([train_df,test_df],axis=0)
print(combined_data.shape)

In [None]:
# LAG TARGET COL
from tqdm import tqdm
import timeit

# Largest 'noofdays' value kept after shifting; createlagcol drops rows shifted past it.
last_block = 43
# Grouping / merge keys: one entry per user, product and day.
index_cols = ['user_id','product','noofdays']
# List of columns that we will use to create lags
cols_to_rename = [targetcol] 


# data_to_shift_columns = combined_data[index_cols + cols_to_rename].copy()

In [None]:
start = time.time()
# Only rows with a known target may feed the lag source. Test rows carry a NaN
# placeholder target, and in current pandas a grouped sum over nothing but NaN
# returns 0 rather than NaN. Filtering nulls after the aggregation therefore
# removed nothing and left fabricated zero-click groups in place. Dropping the
# unlabelled rows before grouping excludes them for real.
data_shift = (
    combined_data.loc[combined_data[cols_to_rename].notnull().any(axis=1),
                      index_cols + cols_to_rename]
    .groupby(index_cols)[cols_to_rename]
    .agg('sum')
)
stop = time.time()
print("Group By complete: Execution Time={}".format(stop - start ))
# Aggregated target columns get a '_sum' suffix; every other name is kept.
targetsumcolumns = lambda x: '{}_sum'.format(x) if x in cols_to_rename else x
data_shift = data_shift.rename(columns=targetsumcolumns)
start = time.time()
# Turn the group keys back into ordinary columns so they can be merged on.
data_shift.reset_index(inplace=True)
stop = time.time()
print("Index complete: Execution Time={}".format(stop - start ))

In [None]:
# Aggregate the log per key. Every log row is tagged with 1 in the target
# column, so the grouped sum is the number of log rows for that key.
log_df[targetcol] = 1

start = time.time()
log_shift = log_df.groupby(index_cols)[cols_to_rename].sum()
stop = time.time()
print("Group By complete: Execution Time={}".format(stop - start ))


def targetsumcolumns(x):
    """Append ``_sum`` to the aggregated target column names; keep other names."""
    if x in cols_to_rename:
        return '{}_sum'.format(x)
    return x


log_shift = log_shift.rename(columns=targetsumcolumns)

start = time.time()
log_shift = log_shift.reset_index()
stop = time.time()
print("Index complete: Execution Time={}".format(stop - start ))

In [None]:
# Lag source: the per-key log aggregates stacked on top of the per-key target sums.
log_plus_data_shift = pd.concat([log_shift,data_shift],axis=0)

In [None]:
# data_shift_join=combined_data[index_cols].copy()

# test_index_cols = index_cols.copy()
# test_index_cols.remove(dateblocknumcol)
# test_shift_join=test_df[test_index_cols].copy()

# Day offsets for which a lag column is attached to combined_data.
shift_range = [2, 3, 4, 5, 7, 14, 21, 30, 31]

for val_shift in tqdm(shift_range):
    combined_data = createlagcol(
        val_shift=val_shift,
        tolag_cols=cols_to_rename,
        join_cols=index_cols,
        data_shift=log_plus_data_shift,
        data=combined_data,
    )
    print('combined_data shape:', combined_data.shape)

gc.collect()
# colstodisplay = index_cols
# for val_shift in tqdm(shift_range):
#     colstodisplay.append('target'+'_lag_'+str(val_shift))

In [None]:
# Split the combined frame back into train and test and drop the helper
# columns: 'istrain' from both, plus the placeholder target from test.
train_df = combined_data[combined_data['istrain'] == 1].drop(['istrain'], axis=1)
test_df = combined_data[combined_data['istrain'] == 0].drop(['istrain', targetcol], axis=1)

del combined_data
gc.collect()

In [None]:
#Set category field types for Model classification
temp=pd.DataFrame()  # not referenced again in the visible cells
for col in cat_cols:
    # train and test are cast independently, so each frame derives its own set of categories
    train_df[col] =train_df[col].astype('category')
    test_df[col] =test_df[col].astype('category')

In [None]:
# Assemble the model's feature list from the column groups found in train_df.

# Columns whose name contains 'targetenc' but not 'action_'.
enc_cols_excl_action = [
    col for col in train_df.columns
    if 'targetenc' in col and 'action_' not in col
]

# Columns whose name contains 'date_mean_total' but not 'targetenc'.
derived_raw_action_cols = [
    col for col in train_df.columns
    if 'date_mean_total' in col and 'targetenc' not in col
]

# Action aggregates, listed explicitly: overall, per month, per day of week, per hour.
key_raw_action_cols = [
    'action_mean', 'action_sum', 'action_count',
    'action_month_mean', 'action_month_sum', 'action_month_count',
    'action_dayofweek_mean', 'action_dayofweek_sum', 'action_dayofweek_count',
    'action_hour_mean', 'action_hour_sum', 'action_hour_count',
]

key_cols = ['user_id', 'product']
target_enc_keyactioncols = ['targetenc_action_sum']

# Every lag column carries '_lag_' in its name.
lagcols = [col for col in train_df.columns if '_lag_' in col]

train_features = (
    raw_cols
    + key_raw_action_cols
    + derived_raw_action_cols
    + enc_cols_excl_action
    + lagcols
)

summarize_enc_cols = ['sum_targetenc', 'product_targetenc', 'mean_targetenc']

# Names placed in excluded_features are removed from the final feature list.
excluded_features = []
train_features = [_f for _f in train_features if _f not in excluded_features]

print(train_features)
print(len(train_features))

# Containers for feature importances and for train / test predictions.
importances = pd.DataFrame()
oof_cls_preds = np.zeros(len(train_df))
sub_cls_preds = np.zeros(len(test_df))

In [None]:
#Date Validation split
import datetime
# Train rows stamped on or after 6 July 2017 form the validation set;
# all earlier rows are used for fitting.
date_valid_start = datetime.date(2017, 7, 6)

# Take the timestamps from train_df as it is now. train_df was rebuilt from the
# merged combined frame, so a Series captured before that rebuild is not
# guaranteed to share its index, and .loc with a boolean Series aligns on index.
dates = train_df['DateTime']
# Compare against a Timestamp (midnight of the start day): comparing a
# datetime64 column with a plain datetime.date is not supported consistently
# across pandas versions.
dates_valid_filter = (dates >= pd.Timestamp(date_valid_start))

dates_train = dates[~dates_valid_filter]
dates_valid  = dates[dates_valid_filter]

train_df_valid = train_df.loc[dates_valid_filter]
y_reg_valid = (train_df_valid[targetcol]==1).astype('int32')

train_df_train = train_df.loc[~dates_valid_filter]
y_reg_train = (train_df_train[targetcol]==1).astype('int32')

print(train_df_train.shape)
print(train_df_valid.shape)

In [None]:
# Fit on the rows before the validation start date and evaluate AUC on the held-out rows.
start = time.time()

# iters = len(folds)
val_aucscores=[]
importances['feature'] = train_features
importances['gain'] = 0

reg = lgb.LGBMClassifier(**params)

# NOTE(review): early_stopping_rounds / verbose are passed straight to fit();
# confirm the installed LightGBM version still accepts them there.
reg.fit(
    train_df_train[train_features], y_reg_train,
    eval_set=[(train_df_valid[train_features], y_reg_valid)],
    early_stopping_rounds=50,
    verbose=50,
    eval_metric='auc' 
    )
    
# Gain-based importance of each feature, in the same order as train_features.
importances['gain'] = reg.booster_.feature_importance(importance_type='gain') 

# Probability of the positive class, predicted with the best iteration found above.
valid_preds = reg.predict_proba(train_df_valid[train_features], num_iteration=reg.best_iteration_)[:,1]
auc_score = roc_auc_score(y_reg_valid,valid_preds)
print('valid auc score=',auc_score)
# _preds = reg.predict_proba(test_df[train_features], num_iteration=reg.best_iteration_)[:,1]
# sub_cls_preds += _preds / iters
# Kept so the full-data training can choose tree counts near this value.
val_iterations = reg.best_iteration_
print('valid iterations =',val_iterations)
end =  time.time()
print('LGB Execution Time:',end-start)

In [None]:
# Full-data training: fit on every train row and write one submission per tree count.
start = time.time()

importances['feature'] = train_features
importances['gain'] = 0

# Author's note: the validation run stopped at 72 iterations, so the full-data
# model is trained for several fixed tree counts and each one is submitted.
est_list =[90,100,110,120,130]

for est in est_list:
    # Override n_estimators on a copy. Writing it into the shared `params` dict
    # would leave it at the last value after the loop and silently cap any
    # later re-run of the validation cell.
    reg = lgb.LGBMClassifier(**dict(params, n_estimators=est))

    print('Light GBM Fit Start..')
    reg.fit(
        train_df[train_features], y_reg,
        eval_metric='auc'
        )

    # Overwritten on every pass, so only the last model's importances remain.
    importances['gain'] = reg.booster_.feature_importance(importance_type='gain')

    print('Light GBM Test Pred Start..')
    # No eval_set / early stopping is used in this fit.
    # NOTE(review): best_iteration_ is presumably unset here, in which case every tree is used.
    sub_cls_preds = reg.predict_proba(test_df[train_features], num_iteration=reg.best_iteration_)[:,1]
    end =  time.time()
    # `start` is taken once before the loop, so this is the cumulative time so far.
    print('LGB Execution Time:',end-start)
    #Save Test Submission File
    sub_df = pd.DataFrame({"session_id":test_df['session_id']})
    sub_df[targetcol] = sub_cls_preds
    sub_df.to_csv("timeseriesbaseline_" + str(est) + ".csv", index=False)
    print('Submission Save Complete ')


In [None]:
# #Train Score    
# print('oof auc=',roc_auc_score(y_reg,oof_cls_preds))

In [None]:
# #Save Initial Train Meta
# np.savetxt('LGBM_TrainMeta.npy',oof_cls_preds)
# np.savetxt('LGBM_TestMeta.npy',sub_cls_preds)

In [None]:
# vis_importances = importances[importances['fold']==2]
importances.sort_values(by='gain', ascending=False,inplace=True)
# vis_importances1= vis_importances1[vis_importances1['gain']==0]
# vis_importances1.sort_values(by='feature', ascending=False,inplace=True)
importances[0:100]