In [1]:
!pip3 install --upgrade pandas
!pip3 install lightgbm
!pip3 install bottleneck
!pip3 install numexpr

In [2]:
import gc
import io

import pandas as pd
import numpy as np
import lightgbm as lgb

In [3]:
%%gcs read --object gs://kaggle_talkingdata_belkasanek/train.csv --variable train

In [4]:
# Compact dtypes for the CSV columns; the same mapping is reused later
# when the test file is read.
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

# Parse the raw bytes held in `train`; only the listed columns are loaded.
train_df = pd.read_csv(io.BytesIO(train), parse_dates=['click_time'], dtype=dtypes, 
                       usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])

train_df['day'] = train_df['click_time'].dt.day.astype('uint8')
# 'hour' stays uint32 for now: time_feature_engineering multiplies it by 3600
# before narrowing it to uint8.
train_df['hour'] = train_df['click_time'].dt.hour.astype('uint32')

# Train data contains days 6, 7, 8 and 9; test data contains day 10.
# Day 6 has only a small amount of data, so it is dropped here.
# Test data only includes hours 4, 5, 9, 10, 13 and 14,
# but for feature engineering a wider time frame (hours 2 to 16) is kept.
train_df = train_df.loc[(train_df['day'] >= 7) & (train_df['hour'].between(2, 16))]
# The raw CSV bytes are no longer needed once the frame is built.
del train

In [5]:
def time_feature(train_df, columns, prefix='nc', fill_value=65500):
    '''
    Add a feature with the time to the next or previous click inside a group.

    Rows are sorted by `columns` and then by 'my_time'. For every row the new
    column holds the 'my_time' difference to its neighbour in that order.
    `fill_value` is stored where there is no neighbour in the same group
    (the group key changes, or the row is first/last in the frame). Gaps
    larger than `fill_value` are clipped to it so they cannot wrap around
    in the uint16 cast.

    train_df - pd.DataFrame with a 'my_time' column and every column in `columns`
    columns - list of strings, used to make grouping
    prefix - string, 'pc' for time since the previous click,
             'nc' for time until the next click
    fill_value - sentinel used when no neighbour exists; must fit in uint16

    Returns a new DataFrame (sorted by `columns` + 'my_time') with the extra
    uint16 column '<prefix>_<col1>_<col2>...'. The input frame is not modified.
    Raises ValueError when prefix is neither 'pc' nor 'nc'.
    '''

    if prefix == 'pc':
        n = 1
    elif prefix == 'nc':
        n = -1
    else:
        raise ValueError("prefix must be 'pc' or 'nc', got {!r}".format(prefix))

    columns = list(columns)
    train_df = train_df.sort_values(by=columns + ['my_time'])
    new_feature = prefix + '_' + '_'.join(columns)

    my_time = train_df['my_time']
    # shift() introduces NaN at the frame edge, so the difference is float
    neighbour = my_time.shift(n)
    if n == 1:
        delta = my_time - neighbour
    else:
        delta = neighbour - my_time

    # True where the neighbouring row belongs to another group; kept as a
    # plain array instead of temporary columns in the (large) frame
    group_changed = np.zeros(len(train_df), dtype=bool)
    for col in columns:
        group_changed |= (train_df[col] != train_df[col].shift(n)).values

    delta = delta.mask(group_changed, fill_value)
    # remaining NaN (frame edge when `columns` is empty) and oversized gaps
    # both collapse to the sentinel before the narrowing cast
    delta = delta.fillna(fill_value).clip(upper=fill_value)

    train_df[new_feature] = delta.astype('uint16')
    gc.collect()
    return train_df

def time_feature_engineering(train_df):
    '''
    Create the custom time column 'my_time' and all time-gap features.

    'my_time' is hour * 3600 + minute * 60 + second, taken from the existing
    'hour' column and from 'click_time'; the date part is not needed any more.

    train_df - pd.DataFrame with a datetime 'click_time' column, an integer
               'hour' column and the ip/app/os/device/channel columns.
               It is modified in place: 'my_time' is added, 'hour' is
               narrowed to uint8 and 'click_time' is dropped.

    Returns a new DataFrame (re-sorted by the last grouping) that also holds
    one 'nc_*' next-click column per grouping listed below.
    '''

    click_time = train_df['click_time']
    # widen explicitly so hour * 3600 cannot overflow whatever dtype the
    # caller used for 'hour'
    hour = train_df['hour'].astype('uint32')
    seconds = (hour * 3600
               + click_time.dt.minute.astype('uint32') * 60
               + click_time.dt.second.astype('uint32'))

    train_df['my_time'] = seconds.astype('uint32')
    train_df['hour'] = train_df['hour'].astype('uint8')

    train_df.drop(['click_time'], axis=1, inplace=True)

    # groupings for which the time to the next click is computed, in order
    groupings = (
        ['ip', 'os'],
        ['ip', 'app'],
        ['ip', 'app', 'os'],
        ['app', 'channel'],
        ['ip'],
        ['ip', 'os', 'device'],
        ['ip', 'os', 'device', 'app'],
        ['ip', 'channel'],
    )
    for group in groupings:
        train_df = time_feature(train_df, group, 'nc')
    return train_df
  
def feature_engineering(train_df):
    '''
    Create all non-time features.

    train_df - pd.DataFrame with 'ip', 'app', 'device', 'os', 'channel',
               'hour' and 'my_time' columns (other columns are passed through).

    Returns a new DataFrame sorted by ip and 'my_time', without the 'ip' and
    'my_time' columns, and with these columns appended:
      unique_app_ip, unique_channel_ip, cumcount_ip_device,
      cumcount_ip_os_device, unique_os_app, count_ip_app, mean_ip_app,
      count_ip_os_device, count_ip_app_os_device, unique_channel_ip_app,
      max_hour_ip, max_channel_ip, count_ip, count_channel.
    The input frame is not modified.

    Click counts are stored as uint32: a uint16 cast silently wraps once a
    group has more than 65535 rows.

    The categorical columns (app, device, os, channel, hour, max_hour_ip,
    max_channel_ip) are left as integers. This function runs separately on
    every day and on the test frame, so a per-call astype('category') would
    give each frame its own category set; the training cell already names
    these columns through `categorical_feature`.
    '''

    def most_frequent(values):
        # most frequent non-negative integer value (smallest one on ties)
        return np.bincount(values).argmax()

    # sort_values / drop return a fresh frame, so the columns below are
    # added to a private copy
    frame = train_df.sort_values(by=['ip', 'my_time'])
    frame = frame.drop(['my_time'], axis=1)

    # count of unique apps visited by user
    frame['unique_app_ip'] = frame.groupby('ip')['app'].transform('nunique')

    # count of unique channels visited by user
    frame['unique_channel_ip'] = frame.groupby('ip')['channel'].transform('nunique')

    # cumcount of user with same device
    frame['cumcount_ip_device'] = (frame.groupby(['ip', 'device'])['hour']
                                   .cumcount().astype('uint32'))

    # cumcount of user with same device and os
    frame['cumcount_ip_os_device'] = (frame.groupby(['ip', 'os', 'device'])['hour']
                                      .cumcount().astype('uint32'))

    # count of unique os for app
    frame['unique_os_app'] = frame.groupby('app')['os'].transform('nunique')

    # count of clicks on app for user
    frame['count_ip_app'] = (frame.groupby(['ip', 'app'])['hour']
                             .transform('count').astype('uint32'))

    # mean of count_ip_app over all clicks of the user (truncated to integer)
    frame['mean_ip_app'] = (frame.groupby('ip')['count_ip_app']
                            .transform('mean').astype('uint32'))

    # count of clicks for user with defined os and device
    frame['count_ip_os_device'] = (frame.groupby(['ip', 'os', 'device'])['hour']
                                   .transform('count').astype('uint32'))

    # count of clicks for user with defined app, os and device
    frame['count_ip_app_os_device'] = (frame.groupby(['ip', 'app', 'os', 'device'])['hour']
                                       .transform('count').astype('uint32'))

    # count of unique channels for ip and app
    frame['unique_channel_ip_app'] = (frame.groupby(['ip', 'app'])['channel']
                                      .transform('nunique'))

    # hour with maximum user's activity
    frame['max_hour_ip'] = (frame.groupby('ip')['hour']
                            .transform(most_frequent).astype('uint8'))

    # channel that is most used by user
    frame['max_channel_ip'] = (frame.groupby('ip')['channel']
                               .transform(most_frequent).astype('uint16'))

    # count of total clicks by user
    frame['count_ip'] = (frame.groupby('ip')['hour']
                         .transform('count').astype('uint32'))

    # count of total clicks through channel
    frame['count_channel'] = (frame.groupby('channel')['ip']
                              .transform('count').astype('uint32'))

    frame = frame.drop(['ip'], axis=1)
    gc.collect()
    return frame

In [6]:
# Feature engineering is done on every day independently.
# Each day is copied out and then removed from train_df, so the remaining
# frame keeps shrinking before the next day is extracted.
day7 = train_df.loc[(train_df['day'] == 7)].copy()
train_df = train_df.loc[train_df['day'] != 7]
day8 = train_df.loc[(train_df['day'] == 8)].copy()
train_df = train_df.loc[train_df['day'] != 8]
day9 = train_df.loc[(train_df['day'] == 9)].copy()

# 'day' is constant inside each per-day frame, so it is dropped.
day7.drop(['day'], axis=1, inplace=True)
day8.drop(['day'], axis=1, inplace=True)
day9.drop(['day'], axis=1, inplace=True)
# Release the combined frame before the per-day features are built.
del train_df
gc.collect()

# Time features first: they create 'my_time', which feature_engineering
# sorts by and then removes.
day7 = time_feature_engineering(day7)
day8 = time_feature_engineering(day8)
day9 = time_feature_engineering(day9)

day7 = feature_engineering(day7)
day8 = feature_engineering(day8)
day9 = feature_engineering(day9)

# Stack the three days back into a single training frame.
train_df = pd.concat([day7, day8, day9])
del day7, day8, day9
gc.collect()

In [7]:
# make validation set
# every column except the target is used as a predictor
predictors = train_df.columns.tolist()
predictors.remove('is_attributed')

ratio_valid_set = 0.1
# most used hours in test data
most_hour = [4, 5, 9, 10, 13, 14]
# fixed seed so the shuffles (and therefore the split) are reproducible
split_seed = 0

# the validation rows are drawn only from the hours that appear in the test data
tr = train_df.loc[~train_df['hour'].isin(most_hour)]
temp = train_df.loc[train_df['hour'].isin(most_hour)]

temp = temp.sample(frac=1, random_state=split_seed)
n = temp.shape[0]
n_train_part = n - int(n * ratio_valid_set)

# the last ratio_valid_set share of the shuffled rows is held out
val_df = temp.iloc[n_train_part:]
# pd.concat instead of DataFrame.append, which newer pandas no longer provides
train_df = pd.concat([tr, temp.iloc[:n_train_part]])
train_df = train_df.sample(frac=1, random_state=split_seed)

del tr, temp
gc.collect()

In [8]:
%%time
# columns LightGBM must treat as categorical rather than ordinal
categorical_features = ['app', 'device', 'os', 'channel', 'hour', 
                        'max_hour_ip', 'max_channel_ip']
lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric':'auc',
        'subsample_for_bin': 20000,
        'reg_alpha': 0.2,
        'reg_lambda': 0.1,
        'nthread': 16,
        'verbose': 0
    }
params = {
    'learning_rate': 0.1,
    'num_leaves': 25,
    'max_depth': 4,
    'min_child_samples': 75,
    'subsample': 0.7,
    'subsample_freq': 1,
    'colsample_bytree': 0.6,
    'scale_pos_weight': 200
}
num_boost_round = 1000
early_stopping_rounds = 30

lgb_params.update(params)

xgtrain = lgb.Dataset(train_df[predictors], 
                      label=train_df['is_attributed'].values,
                      feature_name=predictors,
                      categorical_feature=categorical_features
                      )
xgvalid = lgb.Dataset(val_df[predictors], 
                      label=val_df['is_attributed'].values,
                      feature_name=predictors,
                      categorical_feature=categorical_features
                      )

evals_results = {}

# Early stopping, metric recording and periodic logging are passed as
# callbacks: the matching keyword arguments of lgb.train are not available
# in current LightGBM releases. log_evaluation is the newer name of
# print_evaluation, so fall back when it is missing.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# feature names / categorical features are already set on both Datasets,
# so they are not repeated here
bst1 = lgb.train(lgb_params, 
                 xgtrain, 
                 num_boost_round=num_boost_round,
                 valid_sets=[xgvalid], 
                 valid_names=['valid'], 
                 callbacks=[
                     lgb.early_stopping(early_stopping_rounds),
                     lgb.record_evaluation(evals_results),
                     log_every(10),
                 ]
                )

print("\nModel Report")
print("bst1.best_iteration: ", bst1.best_iteration)
# evals_results is a 0-based list, best_iteration is a 1-based tree count
print("auc: {:.4f}".format(evals_results['valid']['auc'][bst1.best_iteration-1]))
# later passed to predict(num_iteration=...), which expects the number of
# trees to use, so it must not be reduced by one
best_iteration = bst1.best_iteration

In [9]:
# The training and validation frames are no longer referenced after training;
# drop them before the test data is loaded.
del train_df, val_df
gc.collect()

In [10]:
%%gcs read --object gs://kaggle_talkingdata_belkasanek/mapping.csv --variable mapping

In [11]:
%%gcs read --object gs://kaggle_talkingdata_belkasanek/test_supplement.csv --variable test

In [12]:
# feature engineering on test data
# (the frame keeps the name train_df so the same functions and predictor
# list can be reused)
train_df = pd.read_csv(io.BytesIO(test), parse_dates=['click_time'], dtype=dtypes, 
                      usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id'])

del test
gc.collect()

mapping = pd.read_csv(io.BytesIO(mapping), dtype={'click_id': 'int32', 'old_click_id': 'int32'})

# the click_id read from this file is matched against mapping's
# 'old_click_id'; mapping's 'click_id' replaces it
train_df.rename(columns={'click_id': 'old_click_id'}, inplace=True)
train_df = pd.merge(train_df, mapping, on=['old_click_id'], how='left')
train_df.drop(['old_click_id'], axis=1, inplace=True)
# Rows without a mapped id get -1. The result is assigned back explicitly:
# fillna(inplace=True) on a selected column is not guaranteed to modify the
# frame, and the int32 cast fails if NaN is left behind.
train_df['click_id'] = train_df['click_id'].fillna(-1).astype(np.int32)
train_df['hour'] = train_df['click_time'].dt.hour.astype('uint32')

# features are computed on all rows, including the ones without a mapped id
train_df = time_feature_engineering(train_df)
train_df = feature_engineering(train_df)

# keep only the rows that have a mapped click_id
train_df = train_df.loc[train_df['click_id'] != -1]

In [13]:
# Assemble the submission: one row per remaining test click, with its id and
# the predicted probability, then write it as a gzip-compressed CSV.
sub = pd.DataFrame({
    'click_id': train_df['click_id'].astype('int'),
    'is_attributed': bst1.predict(train_df[predictors], num_iteration=best_iteration),
})
sub.to_csv('final.csv.gz', index=False, compression='gzip')

In [14]:
!gsutil cp final.csv.gz gs://kaggle_talkingdata_belkasanek/final.csv.gz

In [15]:
# The submission has been written and uploaded; release the test frame and
# the submission frame.
del train_df, sub
gc.collect()