### 导入基本工具包

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold,train_test_split
from bayes_opt import BayesianOptimization
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm
%matplotlib inline

### 统计变量工具函数定义
为提取到不同类型用户的特征，构建了以下函数，在不同层次、不同水平上对用户的点击行为习惯和当次点击在用户点击历史中的次序进行了特征提取。

#### do_count
将df按照group_cols分组，计算分组的count数，将聚合结果重命名为agg_name，数据类型为agg_type，通过pd.merge，按照group_cols列与df聚合，返回聚合结果。

In [None]:
def do_count(df,group_cols,agg_name,agg_type='uint32',show_max=False,show_agg=True):
    """Attach the size of each ``group_cols`` group to every row of ``df``.

    Args:
        df: input DataFrame; it is not modified, a merged copy is returned.
        group_cols: list of column names that define a group.
        agg_name: name of the new count column.
        agg_type: dtype the count column is cast to.
        show_max: when True, print the largest count before the cast.
        show_agg: when True, print a progress message.

    Returns:
        The left-merge of ``df`` with the per-group row counts. The merge
        produces a fresh default index.
    """
    if show_agg:
        print("Aggregating by ", group_cols , '...' )
    # Select the key columns once; indexing df[group_cols] a second time only
    # produced another copy of the same columns.
    gp = df[group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    if show_max:
        # Printed before the cast so an overflow of agg_type can be spotted.
        print( agg_name + " max value = ", df[agg_name].max() )
    df[agg_name] = df[agg_name].astype(agg_type)
    return df

#### do_countuniq
将df按照group_cols分组，提取counted列，计算counted特征的唯一数据的count数，将聚合结果重命名为agg_name，数据类型为agg_type，通过pd.merge，按照group_cols列与df聚合，返回聚合结果。

In [None]:
def do_countuniq(df, group_cols, counted, agg_name, agg_type='uint32',show_max=False,show_agg=True):
    """Add, per row, the number of distinct ``counted`` values in its group.

    Rows are grouped by ``group_cols``; the distinct-value count of the
    ``counted`` column is stored under ``agg_name`` (cast to ``agg_type``) and
    left-merged back onto ``df``. The merged frame is returned.
    """
    if show_agg:
        print("Counting unique {} by {} ...".format(counted,group_cols))
    relevant = df[group_cols+[counted]]
    distinct_per_group = relevant.groupby(group_cols)[counted].nunique()
    lookup = distinct_per_group.reset_index().rename(columns={counted:agg_name})
    merged = df.merge(lookup,on=group_cols,how='left',copy=False)
    if show_max:
        # Reported before the cast to agg_type.
        print("{} max value = {}".format(agg_name,merged[agg_name].max()))
    merged[agg_name] = merged[agg_name].astype(agg_type,copy=False)
    return merged

#### do_cumcount
将df按照group_cols分组，提取counted列，计算counted特征的累计count数，数据类型为agg_type，将返回值直接赋值给df的新特征agg_name。例如对ip进行聚合，计算每个ip使用过的app，每次使用时候是这ip第几次使用的。

In [None]:
def do_cumcount(df,group_cols,counted,agg_name,agg_type='uint32',show_max=False,show_agg=True):
    """Number each row within its ``group_cols`` group, in current row order.

    The zero-based running position is written to ``df[agg_name]`` in place
    (cast to ``agg_type``) and the same ``df`` object is returned.
    """
    if show_agg:
        print("Cumulative count {} by {} ...".format(counted ,group_cols))
    per_group = df[group_cols+[counted]].groupby(group_cols)[counted]
    running_position = per_group.cumcount()
    # Assign the raw values so the result is placed positionally, not by index.
    df[agg_name] = running_position.values
    if show_max:
        print("{} max value = {}".format(agg_name,df[agg_name].max()))
    df[agg_name] = df[agg_name].astype(agg_type,copy=False)
    return df

#### do_mean
将df按照group_cols分组，提取counted列，计算该分组counted特征的均值，数据类型为agg_type，重命名为agg_name，通过pd.merge与df进行聚合，聚合索引为group_cols

In [None]:
def do_mean(df,group_cols,counted,agg_name,agg_type='float32',show_max=False,show_agg=True):
    """Add, per row, the mean of ``counted`` over the row's ``group_cols`` group.

    The per-group mean is stored under ``agg_name`` (cast to ``agg_type``) and
    left-merged onto ``df``; the merged frame is returned.
    """
    if show_agg:
        print("Calculating mean of {} by {} ...".format(counted,group_cols))
    group_means = df[group_cols+[counted]].groupby(group_cols)[counted].mean()
    lookup = group_means.reset_index().rename(columns={counted:agg_name})
    result = df.merge(lookup,on=group_cols,how='left',copy=False)
    if show_max:
        print("{} max value = {}".format(agg_name,result[agg_name].max()))
    result[agg_name] = result[agg_name].astype(agg_type,copy=False)
    return result

#### do_var
将df按照group_cols分组，提取counted列，计算该分组counted特征的方差variance，数据类型为agg_type，重命名为agg_name，通过pd.merge与df进行聚合，聚合索引为group_cols

In [None]:
def do_var(df,group_cols,counted,agg_name,agg_type='float32',show_max=False,show_agg=True):
    """Add, per row, the variance of ``counted`` over the row's ``group_cols`` group.

    Uses pandas' groupby ``var()``; the result is stored under ``agg_name``
    (cast to ``agg_type``) and left-merged onto ``df``. The merged frame is
    returned.
    """
    if show_agg:
        print("Calculating variance of {} by {} ...".format(counted,group_cols))
    selected = df[group_cols+[counted]]
    group_variance = selected.groupby(group_cols)[counted].var()
    lookup = group_variance.reset_index().rename(columns={counted:agg_name})
    result = df.merge(lookup,on=group_cols,how='left',copy=False)
    if show_max:
        print("{} max value = {}".format(agg_name,result[agg_name].max()))
    result[agg_name] = result[agg_name].astype(agg_type,copy=False)
    return result

### Load data

官方提供训练数据集train.csv包含2亿条点击数据，其中正例样本占比不到0.2%，大部分都是负例样本，用户查看了app广告后并没有进行下载任务。考虑到数据集过大，且类别严重不平衡，这里，我们使用分块读取数据，减少读取的内存压力，同时通过下采样，将正例：负例的样本比例调节到1:2。

In [None]:
# Seed NumPy's global random generator; presumably this is what makes the
# un-seeded DataFrame.sample calls further down reproducible.
np.random.seed(2018)

In [None]:
# train.csv is opened as an iterator of 2,000,000-row chunks; test.csv is read
# in one go. click_time is parsed to datetime in both.
chunkers = pd.read_csv("../input/train.csv",chunksize=2000000,parse_dates=['click_time'])
test = pd.read_csv("../input/test.csv",parse_dates=['click_time'])

In [None]:
# Down-sample chunk by chunk: keep every positive click and up to twice as
# many randomly drawn negative clicks.
sampled_parts = []
for chunker in tqdm(chunkers):
    positive_sample = chunker[chunker['is_attributed']==1]
    negative_pool = chunker[chunker['is_attributed']==0]
    # Clamp the sample size: DataFrame.sample raises ValueError when asked
    # for more rows than the pool holds (sampling without replacement).
    n_negative = min(int(2*len(positive_sample)),len(negative_pool))
    sampled_parts.append(positive_sample)
    sampled_parts.append(negative_pool.sample(n_negative))

    del positive_sample,negative_pool
    gc.collect()

# One concat at the end instead of DataFrame.append per chunk: append was
# removed in pandas 2.0 and re-copied the growing frame on every iteration.
train = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()
del sampled_parts

In [None]:
# Preview the first rows of the down-sampled training frame and of the test frame.
display(train.head())
display(test.head())

选取有价值信息，存储test的click_id信息，拼接train和test，方便后续一起进行特征工程的操作。

In [None]:
# Keep click_id for the submission, reduce both frames to the shared click
# columns (plus the label for train) and stack them so features are computed
# on train and test together.
test_id = test['click_id']
click_columns = ['ip','app','device','os','channel','click_time']
train = train[click_columns+['is_attributed']]
test = test[click_columns]
all_data = pd.concat([train,test],axis=0)

In [None]:
# train and test now live inside all_data; drop the separate names and collect.
del train,test
gc.enable()
gc.collect()

#### Feature Engineering特征工程

提取时间粒度特征day和hour

In [None]:
# Day-of-month and hour of each click. The vectorised .dt accessors give the
# same values as calling a Python lambda on every row, without the per-row cost.
all_data['day'] = all_data['click_time'].dt.day.astype('uint16')
all_data['hour'] = all_data['click_time'].dt.hour.astype('uint16')

可视化点击次数和平均下载概率随时间的变化，以及在时间粒度上的差异。

In [None]:
# 2x2 overview of the labelled (training) rows: click counts on the top row,
# mean of is_attributed on the bottom row, by day (left) and by hour (right).
plt.figure(figsize=(20,10))
labelled_clicks = all_data[all_data['is_attributed'].notnull()]
panel_specs = [
    ((0,0),'day','day','count','count of click'),
    ((0,1),'hour','day','count','count of click'),
    ((1,0),'day','is_attributed','mean','mean probability of attributed'),
    ((1,1),'hour','is_attributed','mean','mean probability of attributed'),
]
for grid_position,group_key,value_column,statistic,y_label in panel_specs:
    plt.subplot2grid((2,2),grid_position,colspan=1)
    labelled_clicks.groupby(group_key)[value_column].agg(statistic).plot(kind='bar',edgecolor='black')
    plt.ylabel(y_label)
plt.show()

在官方提供的训练集train.csv中，总共包括day为6/7/8/9日的点击信息，第6日的样本信息较少；点击量随每日各小时的变化趋势如上图所示，0-14小时的点击量较高，而其他时间的点击量则较低。经过下采样平衡正负例之后，每日的点击下载率大致为33%，各日均较为稳定。从每日24小时划分来看，每个时间区间的下载率大致相同，仅在20-21小时稍微有些高。

In [None]:
# Click counts by day and by hour for the rows without a label (the test rows).
plt.figure(figsize=(20,10))
unlabelled_clicks = all_data[all_data['is_attributed'].isnull()]
for panel_column,time_unit in enumerate(('day','hour')):
    plt.subplot2grid((1,2),(0,panel_column),colspan=1)
    unlabelled_clicks.groupby(time_unit)['day'].count().plot(kind='bar',edgecolor='black')
    plt.ylabel('count of click')
plt.show()

官方提供的测试集的情况与训练集有些差异，首先测试集仅有第10日的数据，且仅包括了4/5/9/10/13/14时的数据信息。在后续建模中，要考虑如何使用这一情况，提高数据在这些时间点的预测准确率。一种方法是增加这些时间点的采样量，另一种是给这些数据一个大一些的代价权重。

In [None]:
# Down-cast the identifier columns to compact unsigned integer dtypes.
compact_dtypes = {'ip':'uint32','app':'uint16','device':'uint16','os':'uint16','channel':'uint16'}
for column_name,dtype_name in compact_dtypes.items():
    all_data[column_name] = all_data[column_name].astype(dtype_name)

计算每一个ip总共有多少个唯一的channel

In [None]:
all_data = do_countuniq(all_data,['ip'],'channel','X0','uint8',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['X0'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['X0'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

计算每个ip每天都在多少个hour节点上点击，越少说明越集中，越多说明点击时间越分散

In [None]:
all_data = do_countuniq(all_data,['ip','day'],'hour','X2','uint8',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['X2'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['X2'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

从上图可以看出，会进行下载操作的用户，一天中很可能仅有1个小时会使用手机（产生点击）。

计算每个ip总共使用过多少种app

In [None]:
all_data = do_countuniq(all_data,['ip'],'app','X3','uint16',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['X3'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['X3'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

用户会在几种os系统上登录特定的app

In [None]:
all_data = do_countuniq(all_data,['ip','app'],'os','X4','uint8',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['X4'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['X4'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

In [None]:
all_data = do_countuniq(all_data,['ip'],'device','X5','uint16',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['X5'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['X5'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

In [None]:
all_data = do_countuniq(all_data,['app'],'channel','X6','uint8',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['X6'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['X6'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

In [None]:
all_data = do_countuniq(all_data,['ip','device','os'],'app','X8','uint8',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['X8'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['X8'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

In [None]:
all_data = do_countuniq(all_data,['ip'],'os','X7','uint16',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['X7'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['X7'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

In [None]:
all_data = do_countuniq(all_data,['ip','device','os'],'app','X1','uint8',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['X1'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['X1'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

In [None]:
all_data = do_countuniq(all_data,['ip','device','os'],'channel','A0','uint8',show_max=False)
gc.collect()

In [None]:
plt.figure(figsize=(10,7))
plt.hist(all_data[all_data['is_attributed']==0]['A0'],color='r',alpha=0.5,label='is_attributed == 0',density=True,bins=20)
plt.hist(all_data[all_data['is_attributed']==1]['A0'],color='g',alpha=0.5,label='is_attributed == 1',density=True,bins=20)
plt.legend(loc='upper right')
plt.show()

In [None]:
# A1: number of rows sharing the same (ip, app, channel); default dtype uint32.
all_data = do_count(all_data,['ip','app','channel'],'A1',show_max=False)
gc.collect()

In [None]:
# A2: number of rows sharing the same (ip, device, os, app); default dtype uint32.
all_data = do_count(all_data,['ip','device','os','app'],'A2',show_max=False)
gc.collect()

In [None]:
# ip_tcount: number of rows per (ip, day, hour), stored as uint16.
# NOTE(review): do_count does not range-check the cast; confirm the counts
# stay below 65536 (show_max=True prints the maximum before the cast).
all_data = do_count(all_data,['ip','day','hour'],'ip_tcount','uint16',show_max=False )
gc.collect()

In [None]:
# ip_app_count: number of rows per (ip, app), stored as uint32.
all_data = do_count(all_data,['ip', 'app'],'ip_app_count','uint32',show_max=False)
gc.collect()

In [None]:
# ip_app_os_count: number of rows per (ip, app, os), stored as uint16
# (same unchecked-cast caveat as ip_tcount).
all_data = do_count(all_data, ['ip','app','os'],'ip_app_os_count','uint16',show_max=False)
gc.collect()

按ip、day和channel分组，计算点击发生的时间方差

In [None]:
# Variance of the click hour within each (ip, day, channel) group.
# NOTE(review): the column is called 'ip_tchan_count' although do_var stores a
# variance in it, not a count.
all_data = do_var(all_data,['ip','day','channel'],'hour','ip_tchan_count',show_max=False)
gc.collect()

按ip、app和os分组，计算点击发生的时间方差

In [None]:
# Variance of the click hour within each (ip, app, os) group.
all_data = do_var(all_data,['ip','app','os'],'hour','ip_app_os_var',show_max=False)
gc.collect()

按ip、app和channel分组，计算点击发生日期（day）的方差

In [None]:
# Variance of the click day within each (ip, app, channel) group.
all_data = do_var(all_data,['ip','app','channel'],'day','ip_app_channel_var_day',show_max=False)
gc.collect()

按ip、app和channel分组，计算点击发生小时（hour）的均值

In [None]:
# Mean click hour within each (ip, app, channel) group.
all_data = do_mean(all_data,['ip','app','channel'],'hour','ip_app_channel_mean_hour',show_max=False)
gc.collect()

#### nextclicktime
当次点击距离下一次、下第二次、下第三次点击的时间间隔
当次点击距离上一次、上第二次、上第三次点击的时间间隔

In [None]:
all_data['nextClick1'] = (all_data.groupby(['ip','app','device','os'])['click_time'].shift(-1)-all_data['click_time']).dt.seconds.astype(np.float32)
all_data['nextClick2'] = (all_data.groupby(['ip','app','device','os'])['click_time'].shift(-2)-all_data['click_time']).dt.seconds.astype(np.float32)
all_data['nextClick3'] = (all_data.groupby(['ip','app','device','os'])['click_time'].shift(-3)-all_data['click_time']).dt.seconds.astype(np.float32)

In [None]:
all_data['prevClick1'] = (all_data['click_time'] - all_data.groupby(['ip','app','device','os'])['click_time'].shift(+1)).dt.seconds.astype(np.float32)
all_data['prevClick2'] = (all_data['click_time'] - all_data.groupby(['ip','app','device','os'])['click_time'].shift(+2)).dt.seconds.astype(np.float32)
all_data['prevClick3'] = (all_data['click_time'] - all_data.groupby(['ip','app','device','os'])['click_time'].shift(+3)).dt.seconds.astype(np.float32)

In [None]:
# Drop the raw timestamp and the day column; only the features derived from
# them are kept.
all_data.drop(columns=['click_time','day'],inplace=True)

In [None]:
# Drop the raw ip as well; the ip-based aggregate columns stay in the frame.
all_data.drop(columns=['ip'],inplace=True)

In [None]:
gc.collect()

#### 拆分训练集和测试集，定义训练特征的category类

In [None]:
# List the columns that remain after feature engineering.
print(all_data.columns)

In [None]:
# Label column name and the columns passed to LightGBM as categorical.
# NOTE(review): `target` is not referenced by the visible cells, which use the
# literal 'is_attributed' instead.
target = 'is_attributed'
categorical_features = ['app', 'device', 'os', 'channel', 'hour']

In [None]:
test_data = all_data[all_data['is_attributed'].isna()].drop(columns=['is_attributed'])

In [None]:
train_data = all_data[all_data['is_attributed'].notnull()].drop(columns=['is_attributed'])
train_target = all_data[all_data['is_attributed'].notnull()]['is_attributed']

In [None]:
print(train_data.shape)
print(train_target.shape)
print(test_data.shape)

In [None]:
predictors = train_data.columns.tolist()

In [None]:
del all_data

定义lightgbm训练所需的训练集、验证集

In [None]:
Dtrain = lgb.Dataset(data=train_data,label=train_target,free_raw_data=False)
train_X,valid_X,train_y,valid_y = train_test_split(train_data,train_target,stratify=train_target,random_state=2018)
dtrain = lgb.Dataset(data=train_X,label=train_y,feature_name=predictors,categorical_feature=categorical_features,free_raw_data=False)
dvalid = lgb.Dataset(data=valid_X,label=valid_y,feature_name=predictors,categorical_feature=categorical_features,free_raw_data=False)

In [None]:
del train_data,train_target
gc.enable()
gc.collect()

定义lightgbm进行3折交叉验证的评价函数

In [None]:
def lgb_evaluate_cv(colsample_bytree=0.7,
                    learning_rate=0.1,num_leaves=32,
                    subsample=0.9,reg_alpha=0.0,
                    reg_lambda=0.0,min_child_weight=0.0):
    """Score one hyper-parameter candidate with LightGBM cross-validation.

    The arguments are sanitised first: fractions are clamped to [0, 1],
    regularisation terms are floored at 0, and num_leaves / min_child_weight
    are truncated to int. lgb.cv is then run on the module-level ``Dtrain``
    with 3 folds, AUC as the metric, at most 2000 rounds and early stopping
    after 50 rounds. ``categorical_features`` and ``predictors`` are read
    from module scope.

    Returns:
        The last entry of the 'auc-mean' series reported by lgb.cv.
    """
    def _clamp_unit(value):
        # Restrict a fraction-like value to the closed interval [0, 1].
        return max(min(value,1),0)

    params = {
        'colsample_bytree': _clamp_unit(colsample_bytree),
        'learning_rate': _clamp_unit(learning_rate),
        'num_leaves': int(num_leaves),
        'subsample': _clamp_unit(subsample),
        'reg_alpha': max(0,reg_alpha),
        'reg_lambda': max(0,reg_lambda),
        'min_child_weight': int(min_child_weight),
        'is_unbalance': True,
        'categorical_features': categorical_features,
        'predictor': predictors,
        'objective': 'binary',
    }
    cv_result = lgb.cv(params=params,train_set=Dtrain,
                       num_boost_round=2000,nfold=3,
                       metrics='auc',early_stopping_rounds=50,
                       seed=1,show_stdv=True,verbose_eval=False)
    auc_means = cv_result['auc-mean']
    return auc_means[-1]

定义贝叶斯优化器，用于搜索lightgbm的超参数，并指定各超参数的搜索空间

In [None]:
# Optimiser that maximises lgb_evaluate_cv over the bounds below. Bounds whose
# two ends are equal (reg_alpha, reg_lambda, min_child_weight) pin that
# parameter to a single value.
lgbBO = BayesianOptimization(f=lgb_evaluate_cv,
                             pbounds={'colsample_bytree':(0.5,0.7),
                                      'learning_rate':(0.01,0.2),
                                      'num_leaves':(7,31),
                                      'subsample':(0.5,0.95),
                                      'reg_alpha':(0.1,0.1),
                                      'reg_lambda':(0.1,0.1),
                                      'min_child_weight':(1,1)})

In [None]:
# Run the search with init_points=5 and n_iter=20.
lgbBO.maximize(init_points=5,n_iter=20)

In [None]:
# Best score and the parameters that produced it; a copy is kept for training.
# NOTE(review): res['max'][...] looks like the result layout of older bayes_opt
# releases — confirm against the installed version.
print(lgbBO.res['max']['max_val'])
display(lgbBO.res['max']['max_params'])
params_opt = lgbBO.res['max']['max_params'].copy()

定义最终lightgbm的优化超参数

In [None]:
# Complete the best point into a LightGBM parameter dict: num_leaves is cast
# to int and the fixed settings used inside lgb_evaluate_cv are added back.
# NOTE(review): min_child_weight is kept as returned by the optimiser here,
# whereas lgb_evaluate_cv casts it to int.
params_opt['num_leaves'] = int(params_opt['num_leaves'])
params_opt['is_unbalance'] = True
params_opt['categorical_features'] = categorical_features
params_opt['predictor'] = predictors
params_opt['objective'] = 'binary'
params_opt['metrics'] = 'auc'
print(params_opt)

In [None]:
display(params_opt)

In [None]:
# Fit on dtrain while evaluating on both dtrain ('train') and dvalid ('valid'),
# with early_stopping_rounds=50 and at most 3000 boosting rounds.
opt_lgb = lgb.train(params=params_opt,train_set=dtrain,valid_sets=[dtrain,dvalid],valid_names=['train','valid'],early_stopping_rounds=50,num_boost_round=3000,verbose_eval=True)

In [None]:
# Iteration count reported as best by the run above; reused for the final fit.
bst_iterations = opt_lgb.best_iteration

以最优boosting次数，训练完整的训练数据集

In [None]:
# Refit on the full training Dataset for exactly bst_iterations rounds (no
# early stopping is requested here).
final_lgb = lgb.train(params=params_opt,train_set=Dtrain,valid_sets=Dtrain,valid_names='train',num_boost_round=bst_iterations,verbose_eval=True)

In [None]:
# Predict on the unlabelled rows.
pred = final_lgb.predict(test_data)

输出预测结果

In [None]:
sub = pd.DataFrame()
sub['click_id'] = test_id
sub['is_attributed'] = pred

In [None]:
print(sub.shape)
display(sub.head())
import datetime

In [None]:
sub.to_csv("submissions_lgb_{}.csv".format(datetime.datetime.now()),index=False)