In [1]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import f1_score
import numpy as np
# from notify import send_msg

In [2]:
def evalerror(preds, dtrain):
    """Custom XGBoost evaluation function: negated best F1 over a threshold grid.

    Parameters
    ----------
    preds : sequence of float
        Model outputs for the rows of ``dtrain``.
    dtrain : object exposing ``get_label()``
        Evaluation matrix (an ``xgb.DMatrix`` when called by XGBoost).

    Returns
    -------
    tuple
        ``('f1', -best_f1)`` where ``best_f1`` is the highest F1 obtained over
        the thresholds ``np.arange(0.4, 0.44, 0.01)``.  The score is negated;
        presumably so that a minimising early-stopping criterion ends up
        maximising F1 -- confirm against how the caller passes ``feval``.
    """
    labels = dtrain.get_label()
    # Cast once to float64 so every `score > limit` comparison gives the same
    # answer as comparing the individual scalars against the float64 threshold.
    scores = np.asarray(preds, dtype=np.float64)
    # Experiments showed that a classification threshold around 0.4 works best.
    # NOTE(review): these thresholds only make sense if `preds` are
    # probabilities; an inherited comment here claimed they are raw margins
    # (cut-off at 0) -- confirm for the installed XGBoost version.
    best_f1 = max(
        f1_score(labels, (scores > limit).astype(int))
        for limit in np.arange(0.4, 0.44, 0.01)
    )
    # Return a (metric_name, value) pair.
    return 'f1', -1 * best_f1

In [3]:
def model_train(train, val, cols, r, es):
    """Fit an XGBoost binary classifier on ``cols`` and score it on ``val`` by F1.

    Parameters
    ----------
    train, val : table-like, indexable by a list of column names
        Training and validation data (pandas DataFrames in this notebook).
        Both must provide a ``'label'`` column.
    cols : list of str
        Feature columns used to build the DMatrix objects.
    r : int
        Forwarded to ``xgb.train`` as ``num_boost_round``.
    es : int
        Forwarded to ``xgb.train`` as ``early_stopping_rounds``.

    Returns
    -------
    tuple
        ``(best_f1, best_iteration)``: the highest validation F1 over the
        thresholds ``np.arange(0.4, 0.44, 0.01)`` and ``model.best_iteration``.
    """
    # NOTE(review): 'verbose_eval' and 'missing' look like arguments of
    # xgb.train / xgb.DMatrix rather than booster parameters, and 'gpu_device'
    # is not a name found in the documented parameter list ('gpu_id' is).
    # Confirm these entries have the intended effect (in particular that -1 is
    # really treated as missing); they are kept unchanged here so that scores
    # stay comparable with earlier runs.
    params = {
        'objective': 'binary:logistic',
        'eta': 0.05,
        'min_child_weight': 17,
        'max_depth': 5,
        'verbose_eval': False,
        'seed': 2018,
        'missing': -1,
        'n_jobs': 4,
        'eval_metric': 'auc',
        'tree_method': 'gpu_hist',
        'max_bin': 64,
        'gpu_device': 0,
    }
    dtrain = xgb.DMatrix(train[cols], label=train[['label']])
    dval = xgb.DMatrix(val[cols], label=val[['label']])
    # feval is deliberately not used here: with it, training ends up running on
    # a single core (open question from the author: can that be improved?).
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=r,
        early_stopping_rounds=es,
        evals=[(dval, 'val')],
        verbose_eval=False,
    )
    # NOTE(review): `ntree_limit` / `best_ntree_limit` are reported as deprecated
    # in favour of `iteration_range` in newer XGBoost releases -- confirm
    # against the installed version before upgrading.
    preds = model.predict(dval, ntree_limit=model.best_ntree_limit)

    # The threshold scan below stands in for feval; when finally fixing the
    # number of boosting rounds, feval should still be used.
    val_labels = dval.get_label()
    # float64 cast keeps each comparison identical to a scalar-by-scalar check.
    val_scores = np.asarray(preds, dtype=np.float64)
    best_f1 = max(
        f1_score(val_labels, (val_scores > limit).astype(int))
        for limit in np.arange(0.4, 0.44, 0.01)
    )
    return best_f1, model.best_iteration

In [4]:
def feature_select(train, val, basic_cols, add_cols, reverse=False, add_step=10,
                   num_boost_rounds=78, early_stop_rounds=200):
    """Greedy wrapper-style feature selection scored by ``model_train``.

    The procedure has four stages:

    1. Score ``basic_cols`` alone to obtain the baseline.
    2. Score ``basic_cols + [candidate]`` for every candidate in ``add_cols``
       and keep the candidates that beat the baseline, ranked best first.
    3. Forward search: add the top ``i`` ranked candidates for
       ``i = 1, 1 + add_step, ...`` (coarse pass), then try every ``i`` in a
       window of ``add_step`` around the best coarse value (fine pass).
    4. Backward elimination: repeatedly try removing each column (most recently
       added first) and keep a removal whenever the score improves, until a
       full pass removes nothing.

    Parameters
    ----------
    train, val : forwarded unchanged to ``model_train``.
    basic_cols : list of str
        Columns that form the starting feature set.
    add_cols : iterable of str
        Candidate columns; entries already in ``basic_cols`` are ignored.
    reverse : bool, default False
        ``True`` when a larger ``abs(score)`` is better, ``False`` when a
        smaller ``abs(score)`` is better.
    add_step : int, default 10
        Stride of the coarse forward pass and half-width of the fine pass.
    num_boost_rounds, early_stop_rounds : int, defaults 78 and 200
        Forwarded to ``model_train`` as its ``r`` and ``es`` arguments.

    Returns
    -------
    list of str
        The selected columns (``basic_cols`` order first, then added ones).
    """
    from tqdm import tqdm_notebook as tqdm

    def evaluate(cols):
        # Single place where a model is trained; returns (score, best_iteration).
        return model_train(train, val, cols, num_boost_rounds, early_stop_rounds)

    def improves(new_score, old_score):
        # Strict comparison on magnitudes, direction chosen by `reverse`.
        if reverse:
            return abs(new_score) > abs(old_score)
        return abs(new_score) < abs(old_score)

    # --- stage 1: baseline -------------------------------------------------
    base_score = evaluate(basic_cols)[0]
    print('base -> ', base_score)

    # --- stage 2: score each candidate on its own --------------------------
    print('start calcuate %d features score'%len(add_cols))
    fea_importance = {}
    for col in tqdm(add_cols):
        if col in basic_cols:
            continue
        fea_importance[col] = evaluate(basic_cols + [col])[0]

    # Keep only candidates that beat the baseline, best candidate first.
    select1 = [f for f in fea_importance if improves(fea_importance[f], base_score)]
    select1.sort(key=lambda x: abs(fea_importance[x]), reverse=reverse)

    # --- stage 3: forward search -------------------------------------------
    print('good features: ', len(select1), 'trying add...')
    # best_score starts as the baseline, i.e. zero added features, so best_i
    # starts at 0 as well (it used to start at 1, which silently added the top
    # candidate even when no prefix improved on the baseline).
    best_i = 0
    best_score = base_score
    for i in tqdm(range(1, len(select1) + add_step - 1, add_step)):
        tmp_score, tmp_iter = evaluate(basic_cols + select1[:i])
        if improves(tmp_score, best_score):
            print(i,'->',tmp_score, tmp_iter)
            best_score = tmp_score
            best_i = i

    # Fine pass around the best coarse value.  Prefix lengths beyond
    # len(select1) all describe the same column set, so the window is clamped
    # to avoid retraining identical models.
    window_start = max(0, best_i - add_step)
    window_stop = min(best_i + add_step, len(select1) + 1)
    for i in range(window_start, window_stop):
        tmp_score, tmp_iter = evaluate(basic_cols + select1[:i])
        if improves(tmp_score, best_score):
            print(i,'->',tmp_score, tmp_iter)
            best_score = tmp_score
            best_i = i

    usecols = basic_cols + select1[:best_i]
    print('add finished, selected top',best_i, 'total :', len(usecols))

    # --- stage 4: backward elimination -------------------------------------
    dropped = []
    while True:
        removed_any = False
        for f in tqdm(usecols[::-1]):
            if f in dropped:
                continue
            tmp_cols = [each for each in usecols if each not in dropped and each != f]
            if not tmp_cols:
                # Never train on an empty feature set.
                continue
            tmp_score, tmp_iter = evaluate(tmp_cols)
            if improves(tmp_score, best_score):
                print(f,'->',tmp_score, tmp_iter)
                best_score = tmp_score
                dropped.append(f)
                removed_any = True
        if not removed_any:
            break
    print('dropped %d features'%len(dropped))
    return [each for each in usecols if each not in dropped]

In [5]:
def feature_select2(train, val, basic_cols, add_cols, reverse=False,
                    num_boost_rounds=78, early_stop_rounds=200):
    """Step-wise forward feature selection followed by backward elimination.

    Each round scores ``basic_cols + selected + [candidate]`` for every
    remaining candidate and appends the best one to ``selected``.  A round
    whose best candidate does not beat the best score so far increments
    ``stop_flag``; an improving round resets it.  The forward search ends after
    two consecutive non-improving rounds, when every candidate has been
    selected, or when no candidate is left to evaluate.  The trailing
    non-improving additions are rolled back, then columns are removed one at a
    time for as long as a removal improves the score.

    Parameters
    ----------
    train, val : forwarded unchanged to ``model_train``.
    basic_cols : list of str
        Columns that form the starting feature set.
    add_cols : iterable of str
        Candidate columns; entries already in ``basic_cols`` are ignored.
    reverse : bool, default False
        ``True`` when a larger ``abs(score)`` is better, ``False`` when a
        smaller ``abs(score)`` is better.
    num_boost_rounds, early_stop_rounds : int, defaults 78 and 200
        Forwarded to ``model_train`` as its ``r`` and ``es`` arguments.

    Returns
    -------
    list of str
        The selected columns.
    """
    from tqdm import tqdm_notebook as tqdm

    def evaluate(cols):
        # Single place where a model is trained; returns (score, best_iteration).
        return model_train(train, val, cols, num_boost_rounds, early_stop_rounds)

    def improves(new_score, old_score):
        # Strict comparison on magnitudes, direction chosen by `reverse`.
        if reverse:
            return abs(new_score) > abs(old_score)
        return abs(new_score) < abs(old_score)

    base_score = evaluate(basic_cols)[0]
    print('base -> ',base_score)
    global_best = base_score
    to_select = add_cols
    selected = []
    stop_flag = 0

    # --- forward search ------------------------------------------------------
    while True:
        fea_imp = {}
        part_best = None
        part_best_fea = ''

        for col in tqdm(to_select):
            if col in selected or col in basic_cols:
                continue
            best_s, best_iter = evaluate(basic_cols + selected + [col])
            fea_imp[col] = best_s
            if part_best is None or improves(best_s, part_best):
                part_best = best_s
                part_best_fea = col

        if part_best is None:
            # Nothing was evaluated (empty add_cols or exhausted candidate
            # pool); previously this fell through to abs(None) and raised.
            break

        if improves(part_best, global_best):
            stop_flag = 0
            global_best = part_best
            selected.append(part_best_fea)
            print(part_best_fea, '->', part_best)
        else:
            # Keep going for one more round: the feature is added tentatively
            # and rolled back after the loop if no later round improves.
            stop_flag += 1
            selected.append(part_best_fea)
            print('[stop flag + 1]', part_best_fea, part_best)

        # Next round only considers candidates that beat the baseline in this
        # round; the direction now follows `reverse` (it used to be '>' always).
        to_select = [each for each in fea_imp if improves(fea_imp[each], base_score)]

        if stop_flag >= 2:
            break
        if len(selected) == len(add_cols) or len(add_cols) == 0:
            break

    print('finished.',len(selected),'selected.','stop_flag =',stop_flag)
    # Roll back the trailing non-improving additions.  An explicit end index is
    # required: selected[:-0] is selected[:0], which would discard everything
    # when the search ended on an improving round.
    usecols = basic_cols + selected[:len(selected) - stop_flag]

    # --- backward elimination ------------------------------------------------
    dropped = []
    while True:
        removed_any = False
        for f in tqdm(usecols[::-1]):
            if f in dropped:
                continue
            tmp_cols = [each for each in usecols if each not in dropped and each != f]
            if not tmp_cols:
                # Never train on an empty feature set.
                continue
            tmp_score, tmp_iter = evaluate(tmp_cols)
            if improves(tmp_score, global_best):
                print(f,'->',tmp_score, tmp_iter)
                global_best = tmp_score
                dropped.append(f)
                removed_any = True
        if not removed_any:
            break
    print('dropped %d features'%len(dropped))
    return [each for each in usecols if each not in dropped]

In [6]:
# Load the pre-computed feature table; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer='../features/baseline_features9.csv')

In [7]:
# Split the rows by the `data_weeknum` column:
#   weeknum <= 1 -> train, weeknum == 2 -> val, weeknum == 3 -> test,
#   weeknum <= 2 -> trainval (train and val together).
train = df.loc[df['data_weeknum'] <= 1]  # & (df.data_weeknum > 0)
val = df.loc[df['data_weeknum'] == 2]
test = df.loc[df['data_weeknum'] == 3]
trainval = df.loc[df['data_weeknum'] <= 2]

In [8]:
# Feature list, version 1: a hand-maintained base list extended with two later
# batches (add1, add2).
# NOTE(review): the next cell reassigns `usecols` (version 2), so the value built
# here is overwritten when the notebook is run top to bottom; `basic_cols` is not
# referenced by any later cell visible in this notebook.
basic_cols = ['register_type', 'device_type', 'user_reg_days']
usecols = ['register_type', 'device_type', 'user_reg_days', 'user_lastweek_launchday', 'user_launch_days_lastweek', 'user_launch_times_lastweek', 'user_last_launch_dist', 'user_hist_launch_freq', 'user_hist_launchday', 'user_launch_days_hist', 'user_launch_times_hist', 'user_mean_continue_launch_times_lastweek', 'user_max_continue_launch_times_lastweek', 'user_activity_days_hist', 'user_activity_days_lastweek', 'user_min_continue_launch_times_lastweek', 'user_mean_continue_launch_days_lastweek', 'user_max_continue_launch_days_lastweek', 'user_mean_continue_activity_days_lastweek', 'user_max_continue_activity_days_lastweek', 'user_lastweek_act_0_freq', 'user_lastweek_actcount', 'user_activity_times_lastweek', 'user_lastweek_act_video_uniquecount', 'user_max_continue_launch_times_hist', 'user_min_continue_launch_days_lastweek', 'user_mean_continue_launch_times_hist', 'user_min_continue_activity_days_lastweek', 'user_hist_act_0_count', 'user_hist_actcount', 'user_activity_times_hist', 'user_mean_continue_activity_days_hist', 'user_max_continue_launch_days_hist', 'user_hist_act_freq', 'user_mean_continue_launch_days_hist', 'user_hist_act_author_count', 'user_mean_no_launch_days_hist', 'user_min_activity_daytimes_lastweek', 'user_lastweek_act', 'user_lastweek_act_0', 'user_mean_continue_activity_times_hist', 'user_max_launch_daytimes_lastweek', 'user_mean_launch_daytimes_lastweek', 'user_min_launch_daytimes_lastweek', 'user_lastweek_launch', 'user_lastweek_act_page_3_count', 'user_lastweek_act_page_1_count', 'user_max_no_launch_days_hist', 'user_last_act_date', 'user_lastweek_act_2_freq', 'user_lastweek_video_freq', 'user_lastweek_act_2_count']
add1 = ['user_lastweek_launch_freq', 'user_max_no_activity_days_lastweek_hist_dist','user_lastweek_act_page_2_count','user_var_continue_activity_times_lastweek','user_kurt_continue_activity_days_hist']
add2 = ['user_launch_range_percent','user_activity_div_launch_days_hist','user_hist_act_video_meancount','user_hist_video_activity_types']
# In-place extension: usecols becomes base list + add1 + add2.
usecols += add1
usecols += add2
#usecols version1.

In [8]:
#usecols version2.
# Replaces the version-1 list. Compared with version 1 it omits
# 'user_launch_days_lastweek', 'user_launch_times_lastweek', 'user_launch_days_hist',
# 'user_launch_times_hist', 'user_activity_times_lastweek', 'user_activity_times_hist',
# 'user_mean_launch_daytimes_lastweek' and 'user_min_launch_daytimes_lastweek'
# (each appears in the perfectly-correlated pairs printed by the correlation check
# further down -- presumably why they were removed) and appends further features.
usecols = ['register_type', 'device_type', 'user_reg_days', 'user_lastweek_launchday', 'user_last_launch_dist', 'user_hist_launch_freq', 'user_hist_launchday', 'user_mean_continue_launch_times_lastweek', 'user_max_continue_launch_times_lastweek', 'user_activity_days_hist', 'user_activity_days_lastweek', 'user_min_continue_launch_times_lastweek', 'user_mean_continue_launch_days_lastweek', 'user_max_continue_launch_days_lastweek', 'user_mean_continue_activity_days_lastweek', 'user_max_continue_activity_days_lastweek', 'user_lastweek_act_0_freq', 'user_lastweek_actcount', 'user_lastweek_act_video_uniquecount', 'user_max_continue_launch_times_hist', 'user_min_continue_launch_days_lastweek', 'user_mean_continue_launch_times_hist', 'user_min_continue_activity_days_lastweek', 'user_hist_act_0_count', 'user_hist_actcount', 'user_mean_continue_activity_days_hist', 'user_max_continue_launch_days_hist', 'user_hist_act_freq', 'user_mean_continue_launch_days_hist', 'user_hist_act_author_count', 'user_mean_no_launch_days_hist', 'user_min_activity_daytimes_lastweek', 'user_lastweek_act', 'user_lastweek_act_0', 'user_mean_continue_activity_times_hist', 'user_max_launch_daytimes_lastweek', 'user_lastweek_launch', 'user_lastweek_act_page_3_count', 'user_lastweek_act_page_1_count', 'user_max_no_launch_days_hist', 'user_last_act_date', 'user_lastweek_act_2_freq', 'user_lastweek_video_freq', 'user_lastweek_act_2_count', 'user_lastweek_launch_freq', 'user_max_no_activity_days_lastweek_hist_dist', 'user_lastweek_act_page_2_count', 'user_var_continue_activity_times_lastweek', 'user_kurt_continue_activity_days_hist', 'user_launch_range_percent', 'user_activity_div_launch_days_hist', 'user_hist_act_video_meancount', 'user_hist_video_activity_types', 'user_activity_range_percent', 'user_5daybefore_act_page_1_count', 'user_min_continue_activity_times_5daywin', 'user_lastweek_hist_act_3_count_dist', 'user_lastweek_hist_act_page_3_count_dist', 'user_mean_createvideo_date_lastweek_hist_dist', 
'user_4daybefore_act_page_4_count', 'user_kurt_no_launch_days_6daywin', 'user_max_continue_createvideo_days_4daywin', 'user_2daybefore_act_1_count', 'user_max_continue_createvideo_days_hist']

In [None]:
# Candidate features: every column from position 9 onwards that is not already in
# `usecols`.  NOTE(review): the first 9 columns are presumably id / label / meta
# columns -- verify against the CSV header.
to_select = [
    name
    for name in list(df.columns)[9:]
    if name not in usecols
]

In [28]:
# from scipy.stats import pearsonr
# for i in range(len(usecols)):
#     for j in range(i+1, len(usecols)):
#         if pearsonr(df[usecols[i]], df[usecols[j]])[0] == 1:
#             print(usecols[i], usecols[j])

user_lastweek_launchday user_launch_days_lastweek
user_lastweek_launchday user_launch_times_lastweek
user_launch_days_lastweek user_launch_times_lastweek
user_hist_launchday user_launch_days_hist
user_hist_launchday user_launch_times_hist
user_launch_days_hist user_launch_times_hist
user_lastweek_actcount user_activity_times_lastweek
user_hist_actcount user_activity_times_hist
user_max_launch_daytimes_lastweek user_mean_launch_daytimes_lastweek
user_max_launch_daytimes_lastweek user_min_launch_daytimes_lastweek
user_mean_launch_daytimes_lastweek user_min_launch_daytimes_lastweek


In [15]:
# Run the rank-then-add selection starting from `usecols`; reverse=True because the
# score returned by model_train is an F1 value (larger is better).
feas = feature_select(
    train,
    val,
    basic_cols=usecols,
    add_cols=to_select,
    reverse=True,
)

base ->  0.806753637471795
start calcuate 1267 features score



good features:  170 trying add...


1 -> 0.8071999379315694 77

add finished, selected top 1 total : 66


user_lastweek_hist_act_page_3_dist -> 0.8072924747866563 77




dropped 1 features


In [None]:
# Alternative: step-wise forward selection starting from `usecols`; this overwrites
# `feas` from the previous selection cell.  reverse=True because larger F1 is better.
feas = feature_select2(
    train,
    val,
    basic_cols=usecols,
    add_cols=to_select,
    reverse=True,
)

base ->  0.8084760857038086


In [None]:
# Final long run on the selected features (up to 20000 rounds, early stopping after 500).
final_result = model_train(train, val, feas, 20000, 500)
# `send_msg` comes from the optional `notify` helper whose import is commented out in
# the first cell; fall back to printing so this cell does not fail with NameError.
notifier = globals().get('send_msg')
if notifier is not None:
    notifier('%s'%(final_result,))
else:
    print('%s'%(final_result,))

In [None]:
# Display the final selected feature list (result of the most recent selection cell).
feas