In [None]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn import model_selection, preprocessing, metrics
import lightgbm as lgb

from sklearn.model_selection import KFold
from tqdm import tqdm
import gc
import datetime
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, roc_curve, auc, roc_auc_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score,precision_recall_curve,roc_curve, recall_score,precision_score
from sklearn.model_selection import train_test_split

from sklearn.model_selection import KFold,GroupKFold,StratifiedKFold
from sklearn.metrics import mean_squared_error

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# pandas deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings("ignore")

# Take no action (neither warn nor raise) on pandas chained assignment.
pd.options.mode.chained_assignment = None
# Display limits used when a frame is rendered: up to 999 columns and 500 rows.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 500

In [None]:
# Path = '../input/amexml/'
Path = '../input/'

# Read the three CSV inputs found under `Path`, reporting each one once loaded.
train_df = pd.read_csv(f'{Path}train.csv')
print('train reading complete')

test_df = pd.read_csv(f'{Path}test.csv')
print('test reading complete')

log_df = pd.read_csv(f'{Path}historical_user_logs.csv')
print('log reading complete')

In [None]:
# Persist the raw log frame as HDF5 (stored under key 'data') in the working directory.
log_df.to_hdf(path_or_buf='historical_user_logs.h5', key='data')

In [None]:
# Print the (rows, columns) shape of the training frame, then display its first rows.
print(train_df.shape)
train_df.head()

In [None]:
# Print the (rows, columns) shape of the test frame.
print(test_df.shape)
# Test-set row count expressed as a percentage of the train-set row count.
print(100 *test_df.shape[0] / train_df.shape[0])
# Display the first rows of the test frame.
test_df.head()


In [None]:
# Print the (rows, columns) shape of the historical log frame, then display its first rows.
print(log_df.shape)
log_df.head()

In [None]:
targetcol = 'is_click'

# Class balance of the target: count the rows labelled 0 and 1, then report
# each class as a fraction of the rows carrying either label.
noof0s = int((train_df[targetcol] == 0).sum())
noof1s = int((train_df[targetcol] == 1).sum())
print('no of 0s:',noof0s)
print('no of 1s:',noof1s)

ratio0s = noof0s / (noof0s + noof1s)
ratio1s = noof1s / (noof0s + noof1s)
print('ratio of 0s:',ratio0s)
print('ratio of 1s:',ratio1s)


In [None]:
# Store the target column as 32-bit integers.
train_df[targetcol] = train_df[targetcol].astype(np.int32)

In [None]:
# Display the dtype of every training column (the target was cast to int32 in the previous cell).
train_df.dtypes

In [None]:
#Label Encoding
# Each column is integer-coded from the values seen in train; the same
# vocabulary (`indexer`) is then applied to test and, when the column exists
# there, to the log frame. Values absent from the train vocabulary come back
# from get_indexer as -1.
cols_to_convert = ['product', 'gender']
for col in cols_to_convert:
    print('col=',col)
    train_df[col], indexer = pd.factorize(train_df[col])
    test_df[col] = indexer.get_indexer(test_df[col])
    if col not in log_df.columns:
        continue
    log_df[col] = indexer.get_indexer(log_df[col])

In [None]:
#Log Label Encoding
# NOTE(review): pd.factorize numbers values in order of first appearance, so
# which action label becomes 0 and which becomes 1 depends on the row order of
# log_df. gen_derivedactionfeats assumes views=0 and interest=1 -- inspect
# `indexer` after this cell to confirm the mapping before trusting the
# sum/mean/viewsum features.
# This assignment also rebinds `indexer`, replacing the vocabulary left over
# from the product/gender encoding loop.
log_df['action'], indexer = pd.factorize(log_df['action'])

In [None]:
def gen_datefeats(df):
    """Parse ``df['DateTime']`` and add calendar feature columns in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``DateTime`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``DateTime`` converted to
        datetimes and the columns ``day``, ``dayofweek`` (Monday is 0),
        ``month`` and ``hour`` added or overwritten.
    """
    # `infer_datetime_format=True` is no longer passed: it is deprecated since
    # pandas 2.0, where a strict form of format inference is the default and
    # the argument has no effect.
    df['DateTime'] = pd.to_datetime(df['DateTime'])

    stamp = df['DateTime'].dt
    df['day'] = stamp.day
    df['dayofweek'] = stamp.dayofweek
    df['month'] = stamp.month
    df['hour'] = stamp.hour
    return df

In [None]:
#generate date time features
# Processed in the order train, test, log; each name is rebound to the frame
# returned for it.
train_df, test_df, log_df = (
    gen_datefeats(frame) for frame in (train_df, test_df, log_df)
)

In [None]:
#Generate log_df stats features
def gen_logfeats(logdf,df_dest):
    """Left-join aggregated ``action`` statistics from ``logdf`` onto ``df_dest``.

    For each grouping below, the mean, sum and count of ``logdf['action']`` are
    computed and merged onto ``df_dest`` on the grouping keys:

    * ``user_id`` + ``product``                      -> ``action_<stat>``
    * each of ``month`` / ``dayofweek`` / ``hour``   -> ``action_<key>_<stat>``
    * ``user_id`` + each date key                    -> ``action_user<key>_<stat>``
    * ``product`` + each date key                    -> ``action_product<key>_<stat>``
    * ``user_id`` + ``product`` + each date key      -> ``action_userproduct<key>_<stat>``

    Parameters
    ----------
    logdf : pandas.DataFrame
        Log frame with ``user_id``, ``product``, ``action``, ``month``,
        ``dayofweek`` and ``hour`` columns.
    df_dest : pandas.DataFrame
        Frame to enrich; it needs the same key columns (all but ``action``).

    Returns
    -------
    pandas.DataFrame
        The merged frame. Rows whose keys never occur in ``logdf`` get NaN in
        the corresponding statistic columns.
    """
    stats = ['mean', 'sum', 'count']
    pair_keys = ['user_id', 'product']
    date_keys = ['month', 'dayofweek', 'hour']

    def _attach(frame, keys, prefix):
        # Aggregate the log on `keys`, prefix the stat names, left-join on `keys`.
        agg = logdf.groupby(keys)['action'].agg(stats)
        agg.columns = [prefix + stat for stat in agg.columns]
        return pd.merge(frame, agg, on=keys, how='left')

    # statistics per (user, product) pair
    df_dest = _attach(df_dest, pair_keys, 'action_')
    print('Initial feats completed')

    # statistics per calendar key alone
    for key in date_keys:
        df_dest = _attach(df_dest, key, 'action_' + key + '_')
    print('Daywise feats completed')

    # statistics per user within each calendar key
    for key in date_keys:
        df_dest = _attach(df_dest, ['user_id', key], 'action_user' + key + '_')
    print('Userwise feats completed')

    # statistics per product within each calendar key
    for key in date_keys:
        df_dest = _attach(df_dest, ['product', key], 'action_product' + key + '_')
    print('Productwise feats completed')

    # statistics per (user, product) pair within each calendar key
    for key in date_keys:
        df_dest = _attach(df_dest, pair_keys + [key], 'action_userproduct' + key + '_')
    print('UserProductwise feats completed')

    return df_dest

In [None]:
#Merge log feats with train and test
# Both frames are enriched from the same log frame, train first.
train_df = gen_logfeats(logdf=log_df, df_dest=train_df)
print('train df merge complete')

test_df = gen_logfeats(logdf=log_df, df_dest=test_df)
print('test df merge complete')


In [None]:
#fill na for new merged log feats
# Rows with no matching log history carry NaN in the merged `action_` columns;
# they are replaced by the placeholder -1 that gen_derivedactionfeats expects.
cols = [col for col in train_df.columns if 'action_' in col]
for col in cols:
    # Assign the filled column back instead of calling
    # `df[col].fillna(..., inplace=True)`: the in-place form acts on a
    # temporary column selection and, under pandas Copy-on-Write, leaves the
    # frame untouched without any visible error (warnings are silenced above).
    train_df[col] = train_df[col].fillna(-1)
    test_df[col] = test_df[col].fillna(-1)

In [None]:
# #user and product individual columns to be set to NaN when userproduct fields are available
# def setnans_action_user_product_indl_cols(df):
#     userproductcols = [col for col in train_df.columns if 'userproduct' in col]

#     for col in userproductcols:
#         usercol = col.replace('userproduct','user')
#         prodcol = col.replace('userproduct','product')
#         df.loc[~df[col].isnull(),usercol] = np.NaN
#         df.loc[~df[col].isnull(),prodcol] = np.NaN
#     return df

# train_df = setnans_action_user_product_indl_cols(train_df)
# test_df = setnans_action_user_product_indl_cols(test_df)

In [None]:
# ************* Important *******************
# This function assumes that the actions "view" and "interest" are label encoded
# as 0 and 1 respectively in the log frame.
# It also assumes that missing values are represented as -1.
def gen_derivedactionfeats(df):
    """Add view counts and per-family mean totals derived from ``action_`` columns.

    Two kinds of columns are written to ``df`` in place:

    * For every ``action_*_sum`` column, a matching ``*_viewsum`` column equal
      to ``count - sum``. With interest coded 1 and view coded 0 this is the
      number of views; where both inputs hold the -1 placeholder it is 0.
    * Three totals, each the sum of the ``*_mean`` columns of one family:
      ``action_userproduct_date_mean_total`` (names containing ``userproduct``),
      ``action_product_date_mean_total`` (names containing ``_product``) and
      ``action_user_date_mean_total`` (names containing ``_user`` but not
      ``userproduct``).

    Column lists are taken from ``df`` itself, and the total columns are never
    used as their own inputs, so the result is the same for every frame and
    for repeated calls on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the ``action_`` feature columns, including the
        ``_count`` partner of every ``_sum`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the derived columns added.

    Raises
    ------
    KeyError
        If a ``_sum`` column has no matching ``_count`` column.
    """
    #Views sum = Total count   - Interest sum (ie sum column )
    actionsumcols = [col for col in df.columns if ('action_' in col) and ('_sum' in col)]
    for sumcol in actionsumcols:
        newcol = sumcol.replace('_sum','_viewsum')
        countcol = sumcol.replace('_sum','_count')
        df[newcol] = df[countcol]  - df[sumcol]

    # (output column, predicate selecting the family's source columns)
    total_specs = [
        ('action_userproduct_date_mean_total',
         lambda col: 'userproduct' in col),
        ('action_product_date_mean_total',
         lambda col: '_product' in col),
        ('action_user_date_mean_total',
         lambda col: ('_user' in col) and ('userproduct' not in col)),
    ]
    # The totals themselves match the name patterns above, so they must be
    # excluded from the source lists; otherwise a total gets added to itself.
    total_names = {name for name, _ in total_specs}

    #compute sum of dayofweek, month and hour metrics
    for newcol, in_family in total_specs:
        if newcol in df:
            del df[newcol]
        sourcecols = [col for col in df.columns
                      if ('action_' in col) and ('_mean' in col)
                      and (col not in total_names) and in_family(col)]
        df[newcol] = 0
        for col in sourcecols:
            df[newcol] += df[col]

    return df

# Add the derived action features to both frames, train first and then test.
train_df, test_df = (
    gen_derivedactionfeats(frame) for frame in (train_df, test_df)
)

In [None]:
# Persist the feature-enriched train and test frames as HDF5 (key 'data').
train_df.to_hdf(path_or_buf='train_basic.hdf', key='data')
test_df.to_hdf(path_or_buf='test_basic.hdf', key='data')