In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import os
import time
import warnings
warnings.filterwarnings("ignore")
os.chdir('/Users/magictavern/Downloads/tap4fun游戏玩家付费金额预测大赛/tap4fun竞赛数据')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [21]:
def read_data():
    train = pd.read_csv('tap_fun_train.csv')
    test = pd.read_csv('tap_fun_test.csv')
    print(train.shape)
    print(test.shape)
    res0 = pd.DataFrame()#前7天没付费的预测不会付费
    res0['user_id'] = test[test['pay_price']==0]['user_id']
    res0['prediction_pay_price'] = 0
    train = train[train['pay_price']>0]
    test =test[test['pay_price']>0]
    data = pd.concat([train,test],axis=0,ignore_index=True)
    data['prediction_pay_price'].fillna(value=-1,inplace=True)
    return data,res0

In [22]:
def split_time(data):
    data['register_hour']=data['register_time'].map(lambda x: int(x.split(' ')[1].split(':')[0]))
    data['register_date']=data['register_time'].map(lambda x: x.split(' ')[0])
    data['register_day']=data['register_time'].map(lambda x: int(x.split(' ')[0].split('-')[2]))
    #凌晨（0-6），上午（7-12），下午（13-18），晚上（19-23）
    data['register_period'] = data['register_hour']//6
    return data

In [23]:
def process_data(data):
    cols_value = []
    for co in data.columns:
        if 'value' in co:#特征名含‘value’的特征
            cols_value.append(co)
    n = len(cols_value)
    for i in range(n//2):
        col_name = cols_value[2*i].replace('add','reserve')
        data[col_name] = data[cols_value[2*i]]-data[cols_value[2*i+1]]
        col_name = cols_value[2*i].replace('add_value','reduce_add_ratio')
        data[col_name] = data[cols_value[2*i+1]]/data[cols_value[2*i]]
    data['sr_outpost_tier_level'] = data['sr_outpost_tier_2_level']+data['sr_outpost_tier_3_level']+data['sr_outpost_tier_4_level']
    data['sr_outpost'] = data['sr_outpost_durability_level']*data['sr_outpost_tier_level']
    data['sr_healing'] = data['sr_healing_space_level']*data['sr_healing_speed_level']
    data['label'] = data['prediction_pay_price']/data['pay_price']
    for co in data.columns:#新增聚合特征
        if 'level' in co or co in ['register_day','register_date','register_hour','register_period']:
            se = data[co].value_counts()
            data[co+'_cnt'] =  data[co].map(se)
            se = data.groupby(co)['pay_price'].mean()
            data[co+'_avg_pay'] = data[co].map(se)
            se = data.groupby(co)['avg_online_minutes'].mean()
            data[co+'_avg_online_minutes'] = data[co].map(se)
            se = data.groupby(co)['label'].mean()
            data[co+'_avg_label'] = data[co].map(se)
    register_date = data['register_date'].values.tolist()
    se = pd.Series(pd.Series(register_date).drop_duplicates().sort_values().values)
    se = pd.Series(se.index+5,index = se.values)%7#%相当于mod，求余数 0代表星期天
    ##week_day特征
    data['week_day'] = data['register_date'].map(se)
    for co in ['register_time','register_date']:
        del data[co]
    ##pvp特征
    data['pay_avg'] = data['pay_price']/data['pay_count']

    data['pvp_win_battle_ratio'] = data['pvp_win_count']/data['pvp_battle_count']
    data['pvp_lanch_battle_ratio'] = data['pvp_lanch_count']/data['pvp_battle_count']

    data['pve_win_battle_ratio'] = data['pve_win_count']/data['pve_battle_count']
    data['pve_lanch_battle_ratio'] = data['pve_lanch_count']/data['pve_battle_count']

    data['pve_pvp_lanch_ratio'] = data['pve_lanch_count']/(data['pve_lanch_count']+data['pvp_lanch_count'])
    data['pve_pvp_battle_ratio'] = data['pve_battle_count']/(data['pve_battle_count']+data['pvp_battle_count'])
    data['pve_pvp_win_ratio'] = data['pve_win_count']/(data['pve_win_count']+data['pvp_win_count'])
    for num in [10,20,30,50,70,100,200,500,1000,2000,5000,10000]:
        data['(pay_price)>'+str(num)] = (data['pay_price']>=num).astype(int)
    return data

In [24]:
def split_train_test(data):
    train_x = data[data['prediction_pay_price']!=-1]
    test_x = data[data['prediction_pay_price']==-1]
    print(train_x.shape)
    print (test_x.shape)
    label = ['label','prediction_pay_price']
    k = 0
    res = pd.DataFrame()
    res['user_id'] = test_x['user_id'].values
    del train_x['user_id']
    del test_x['user_id']
    del train_x[label[1-k]]
    del test_x[label[1-k]]
    train_y = train_x.pop(label[k])
    del test_x[label[k]]
    return train_x,train_y,test_x,res

In [25]:
def cross_train_index(train_y):
    lst = train_y.sort_values().index.tolist()
    train_index = []
    test_index = []
    se = pd.Series(lst)
    se = pd.Series(se.index,index=se.values)
    for i in range(5):
        test_index.append(se[se%5==i].index.tolist())
        train_index.append(se[se%5!=i].index.tolist())
    return train_index,test_index

In [32]:
def feature_importance(train_x,train_index,test_index,weight=1):
    params = {
        'learning_rate':0.01,
    'boosting':'gbdt',
    'metric':'rmse',
    'objective':'regression'} 
    train_weight = train_x['pay_price']
    feature_score = pd.Series(0,index=train_x.columns)
    for i in range(5):
        if weight==1:
            train_part = lgb.Dataset(train_x.loc[train_index[i]],weight=train_weight.loc[train_index[i]],label=train_y.loc[train_index[i]])
            evals = lgb.Dataset(train_x.loc[test_index[i]],weight=train_weight.loc[test_index[i]],label=train_y.loc[test_index[i]])
        elif weight==0:
            train_part = lgb.Dataset(train_x.loc[train_index[i]],label=train_y.loc[train_index[i]])
            evals = lgb.Dataset(train_x.loc[test_index[i]],label=train_y.loc[test_index[i]])
        bst = lgb.train(params,train_part, 
              num_boost_round=10000, valid_sets=[train_part,evals], 
              valid_names=['train','evals'], fobj=None,feval=None,
              categorical_feature=['week_day','register_period','register_hour'],
                        early_stopping_rounds=50,
              evals_result=None, verbose_eval=50, learning_rates=None, 
              keep_training_booster=False, callbacks=None)
        feature_score = feature_score+pd.Series(bst.feature_importance(),index=train_x.columns)
    feature_score = feature_score.sort_values(ascending=False)
    return feature_score

In [34]:
def select_lgb_data(train_x,feature_importance,n=50):
    col_all = feature_importance.index.tolist()
    col_select=col_all[:n]
    X_select=train_x[col_select]
    y=train_y
    X_train, X_eval, y_train, y_eval = train_test_split(X_select,y,test_size = 0.2, random_state = 0)
    lgb_train = lgb.Dataset(X_train, y_train,weight=X_train['pay_price'],free_raw_data=False)
    lgb_eval = lgb.Dataset(X_eval, y_eval,weight=X_eval['pay_price'],reference=lgb_train,free_raw_data=False)
    return lgb_train,lgb_eval,col_select

In [35]:
def lgb_cv(lgb_train):
    params = {    'boosting_type': 'gbdt', 
    'objective': 'regression', 
              'learning_rate': 0.1 }
    params['metrics']='rmse'
    min_merror = float('Inf')
    best_params = {}
    cat_feature= ['week_day','register_period','register_hour']
    select_cat_featrure=[]
    for i in cat_feature:
        if i in col_select:
            select_cat_featrure.append(i)
    print("调参1：num_leaves/max_bin")
    for num_leaves in range(31,131,9):
        for max_bin in range(55,256,50):
        
            params['num_leaves'] = num_leaves
            params['max_bin'] = max_bin
            
            cv_results = lgb.cv(params, lgb_train,num_boost_round=1000, nfold=5,stratified=False,
            early_stopping_rounds=50, verbose_eval=50,categorical_feature=select_cat_featrure )
            
            mean_merror = pd.Series(cv_results['rmse-mean']).min()
            
            if mean_merror <= min_merror:
                min_merror = mean_merror
                best_params['num_leaves'] = num_leaves
                best_params['max_bin'] = max_bin
                
    params['num_leaves'] = best_params['num_leaves']
    params['max_bin'] = best_params['max_bin']

    print('best params:', best_params)
    print('best cv score:', min_merror)

    print("调参2：feature_fraction/min_data_in_leaf")
    for feature_fraction in [0.6,0.8,1.0]:
        for min_data_in_leaf in range(20,101,10):
        
                params['feature_fraction'] =feature_fraction
                params['min_data_in_leaf'] = min_data_in_leaf
            
                cv_results = lgb.cv(params, lgb_train,num_boost_round=1000, nfold=5, stratified=False,
                early_stopping_rounds=50, verbose_eval=50,categorical_feature=select_cat_featrure)
                    
                mean_merror = pd.Series(cv_results['rmse-mean']).min()

                if mean_merror <= min_merror:
                    min_merror = mean_merror
                    best_params['feature_fraction']=feature_fraction
                    best_params['min_data_in_leaf'] = min_data_in_leaf

    params['min_data_in_leaf'] = best_params['min_data_in_leaf']
    params['feature_fraction'] = best_params['feature_fraction']

    print('best params:', best_params)
    print('best cv score:', min_merror)

    print("调参3：bagging")
    for bagging_fraction in [0.6,0.8,1.0]:
        for bagging_freq in range(0,11,1):
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            
            cv_results = lgb.cv(params, lgb_train,num_boost_round=1000, nfold=5,stratified=False,
            early_stopping_rounds=50, verbose_eval=50,categorical_feature=select_cat_featrure)
                    
            mean_merror = pd.Series(cv_results['rmse-mean']).min()

            if mean_merror <= min_merror:
                min_merror = mean_merror
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq

    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']

    print('best params:', best_params)
    print('best cv score:', min_merror)

    print("调参4：l1/l2")
    for lambda_l1 in [0.0,0.01,0.1,1.0,10,100,1000]:
        for lambda_l2 in [0.0,0.01,0.1,1.0,10,100,1000]:
        
            params['lambda_l1'] = lambda_l1
            params['lambda_l2'] = lambda_l2
            
            cv_results = lgb.cv(params, lgb_train,num_boost_round=1000, nfold=5, stratified=False,
            early_stopping_rounds=50, verbose_eval=50,categorical_feature=select_cat_featrure)
                    
            mean_merror = pd.Series(cv_results['rmse-mean']).min()

            if mean_merror <= min_merror:
                min_merror = mean_merror
                best_params['lambda_l1'] = lambda_l1
                best_params['lambda_l2'] = lambda_l2

    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']

    print('best params:', best_params)
    print('best cv score:', min_merror)
    return params,select_cat_featrure

In [38]:
def lgbm_train(params,select_cat_feature,learning_rate=0.01):
    params['learning_rate']=learning_rate
    lgbm=lgb.train(
          params,                     # 参数字典
          lgb_train,                  # 训练集
          valid_sets=[lgb_train,lgb_eval],
    valid_names=['train','evals'],
          num_boost_round=2000,       # 迭代次数
          early_stopping_rounds=50,# 早停次数
         verbose_eval=50,categorical_feature=select_cat_feature)
    return lgbm

In [39]:
if __name__ == '__main__':
    data,res0= read_data()
    data=split_time(data)
    data = process_data(data)
    train_x,train_y,test_x,res=split_train_test(data)
    train_index,test_index=cross_train_index(train_y)
    feature_score=feature_importance(train_x,train_index,test_index,weight=1)
    lgb_train,lgb_eval,col_select=select_lgb_data(train_x,feature_score,50)
    params,select_cat_featrure=lgb_cv(lgb_train)
    lgbm=lgbm_train(params,select_cat_featrure)
    y_pred=lgbm.predict(test_x[col_select]) 
    label=pd.Series(y_pred,name='label')
    pay_price=pd.Series(test_x['pay_price'].values,name='pay_price')
    result=pd.concat([res,label,pay_price],axis=1)
    result['prediction_pay_price'] = result['label']*result['pay_price']
    sub=result[['user_id','prediction_pay_price']] 
    sub1=pd.concat([res0,sub],ignore_index=True)
    sub1.sort_values(by='user_id',inplace=True)
    filename = 'result/mysub.csv'
    sub1.to_csv(filename,index=False)
    print(filename)
    print('Done')

(2288007, 109)
(828934, 108)
(41439, 448)
(19549, 448)
Training until validation scores don't improve for 50 rounds.
[50]	train's rmse: 7.42694	evals's rmse: 8.38487
[100]	train's rmse: 7.20483	evals's rmse: 8.38028
Early stopping, best iteration is:
[81]	train's rmse: 7.28299	evals's rmse: 8.37881
Training until validation scores don't improve for 50 rounds.
[50]	train's rmse: 7.38994	evals's rmse: 8.45796
[100]	train's rmse: 7.13785	evals's rmse: 8.44376
[150]	train's rmse: 6.94295	evals's rmse: 8.44028
Early stopping, best iteration is:
[147]	train's rmse: 6.95165	evals's rmse: 8.44014
Training until validation scores don't improve for 50 rounds.
[50]	train's rmse: 7.53453	evals's rmse: 7.64518
[100]	train's rmse: 7.25511	evals's rmse: 7.64603
Early stopping, best iteration is:
[76]	train's rmse: 7.38526	evals's rmse: 7.63952
Training until validation scores don't improve for 50 rounds.
[50]	train's rmse: 7.53421	evals's rmse: 7.80033
[100]	train's rmse: 7.23763	evals's rmse: 7.7933

[50]	train's rmse: 8.20878	evals's rmse: 5.4283
[100]	train's rmse: 8.09479	evals's rmse: 5.41516
[150]	train's rmse: 7.99839	evals's rmse: 5.40926
[200]	train's rmse: 7.91351	evals's rmse: 5.41094
Early stopping, best iteration is:
[170]	train's rmse: 7.96319	evals's rmse: 5.40861
result/mysub.csv
Done
