In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [2]:
import pickle
from tqdm import tqdm, tqdm_notebook, _tqdm_notebook, tqdm_pandas

In [3]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))

def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

In [6]:
def parse_str(d):
    return np.array(list(map(float, d.split())))
def parse_list_1(d):
    if d == '-1':
        return [0]
    return list(map(lambda x: int(x[1:]), str(d).split(',')))
def parse_map(d):
    if d == '-1':
        return {}
    return dict([int(z.split(':')[0][1:]), float(z.split(':')[1])] for z in d.split(','))

In [7]:
base_path = '../data'

In [8]:
# 加载邀请回答数据

train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

# test = pd.read_csv(f'{base_path}/invite_info_evaluate_1_0926.txt', sep='\t', header=None)
test = pd.read_csv(f'{base_path}/invite_info_evaluate_2_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

sub = test.copy()

sub_size = len(sub)

train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])
del train['dt'], test['dt']


[2019-12-17 16:48:49,765] INFO in <ipython-input-8-15fcefa9ae03>: invite (9489162, 4)
[2019-12-17 16:48:56,142] INFO in <ipython-input-8-15fcefa9ae03>: test (1141718, 3)


In [9]:
# 加载问题
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'q_topic']
# del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
del ques['title_t1'],ques['desc_t1']
logging.info("ques %s", ques.shape)

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
del ques['q_dt']

[2019-12-17 16:54:14,794] INFO in <ipython-input-9-8ea131990986>: ques (1829900, 5)


In [10]:
# ques['关注话题'] = train['关注话题'].apply(parse_list_1)
ques['title_t2'] = ques['title_t2'].apply(parse_list_1)
ques['desc_t2'] = ques['desc_t2'].apply(parse_list_1)
ques['q_topic'] = ques['q_topic'].apply(parse_list_1)
# train['感兴趣话题'] = train['感兴趣话题'].apply(parse_map)
ques.head()

Unnamed: 0,qid,title_t2,desc_t2,q_topic,q_day,q_hour
0,Q2234111670,"[22414, 963, 10458]",[0],"[321, 730, 5784, 4389]",1018,5
1,Q760329790,"[12677, 16829, 15201, 6419, 101839]","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]",1745,20
2,Q741313548,"[700, 2781, 3280, 81215]","[732, 24400, 48321, 39608, 20788, 219486, 1183...",[226],2032,21
3,Q3481466230,"[3312, 1823, 1505, 638, 166, 461]","[6642, 4214, 3312, 1505, 2205, 232, 294, 7177,...","[51, 4468]",2185,15
4,Q3966197028,"[700, 895, 2253]",[0],"[54700, 81, 57, 17670, 43574]",2269,17


In [11]:
topicmap = pd.read_csv('../data/topic_vectors_64d.txt', 
                          names=['id', 'embed'], sep='\t')
topicmap['embed'] = topicmap['embed'].apply(parse_str)
topicmap['id'] = topicmap['id'].apply(lambda x: int(x[1:]))
topicmap.head()

Unnamed: 0,id,embed
0,1,"[0.16508673, -0.0037432343, -0.058245048, -0.0..."
1,2,"[1.608256, -1.0515573, -1.1897708, 1.1820835, ..."
2,3,"[3.3307428, -0.43252096, -2.1518784, -1.439003..."
3,4,"[2.4698818, -0.12998039, -0.4648351, 0.8796743..."
4,5,"[1.562477, -1.3560516, -0.3271215, -0.06341907..."


In [12]:
topicmap.shape

(100000, 2)

In [13]:
topic_vector_dict = dict(zip(np.array(topicmap['id']), np.array(topicmap['embed'])))

In [14]:
def topic2v(x):
    try:
        tmp = topic_vector_dict[x[0]]
    except:
        tmp = np.zeros(64)
    for i in x[1:]:
        tmp = tmp + topic_vector_dict[i]
    if len(tmp) == 0:
        return np.zeros(64)
    return (tmp / len(x))

In [15]:
tqdm.pandas(desc="topic2v...")
ques['q_topic']=ques['q_topic'].progress_apply(lambda x:topic2v(x))
print('finished!')

topic2v...: 100%|██████████| 1829900/1829900 [01:28<00:00, 20631.88it/s]


finished!


In [16]:
wordmap = pd.read_csv('../data/word_vectors_64d.txt', 
                          names=['id', 'embed'], sep='\t')
wordmap['embed'] = wordmap['embed'].apply(parse_str)
wordmap['id'] = wordmap['id'].apply(lambda x: int(x[1:]))
wordmap.head()

Unnamed: 0,id,embed
0,1,"[0.12561196, -0.57268924, -0.14478925, -0.0524..."
1,2,"[3.224765, 2.2482696, -0.511986, -0.5329892, -..."
2,3,"[-0.985937, 0.11307016, 0.012898494, -0.682206..."
3,4,"[-0.3367663, 0.039051324, 0.8155926, 0.8351733..."
4,5,"[0.3074205, -1.0977745, 0.7528213, 0.6299011, ..."


In [17]:
wordmap.shape
word_vector_dict = dict(zip(np.array(wordmap['id']), np.array(wordmap['embed'])))

In [18]:
def word2v(x):
    try:
        tmp = word_vector_dict[x[0]]
    except:
        tmp = np.zeros(64)
    for i in x[1:]:
        tmp = tmp + word_vector_dict[i]
    if len(tmp) == 0:
        return np.zeros(64)
    return (tmp / len(x))

In [19]:
tqdm.pandas(desc="word2v...")
ques['title_t2']=ques['title_t2'].progress_apply(lambda x:word2v(x))
print('finished!')

word2v...: 100%|██████████| 1829900/1829900 [03:20<00:00, 9113.39it/s] 


finished!


In [20]:
tqdm.pandas(desc="word2v...")
ques['desc_t2']=ques['desc_t2'].progress_apply(lambda x:word2v(x))
print('finished!')

word2v...: 100%|██████████| 1829900/1829900 [07:43<00:00, 3950.78it/s]


finished!


In [21]:
ques.head()

Unnamed: 0,qid,title_t2,desc_t2,q_topic,q_day,q_hour
0,Q2234111670,"[3.4942667999999997, -0.8552949333333334, -4.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.1211431250000001, 2.9202411, -0.0973278000...",1018,5
1,Q760329790,"[1.1900169984, -4.05578632, -3.945152239999999...","[0.040301072461538384, -1.3610320753846152, -3...","[-2.680529233333333, -5.466671466666667, -3.97...",1745,20
2,Q741313548,"[1.39550311, -0.394145575, -1.2251911, -0.3720...","[4.4959643713999995, 1.582830815, 0.0903138088...","[1.1679975, -0.9902606, 2.8614578, 2.517082, -...",2032,21
3,Q3481466230,"[0.5512061833333333, 0.0740648883333332, 1.733...","[1.5809253704166668, -1.2889901612499999, -0.0...","[-1.4455721, -1.401717435, 1.141536675, 3.5025...",2185,15
4,Q3966197028,"[-0.6215633333333334, 0.6605363, -0.6174622666...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.281498462, 1.8468875400000002, -2.340192636...",2269,17


In [22]:
def wordadd(x):
    try:
        tmp = 0.0
    except:
        tmp = 0.0
    tmp=x.sum()
    return tmp

In [23]:
tqdm.pandas(desc="wordadd...")
ques['title_t2']=ques['title_t2'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1829900/1829900 [01:26<00:00, 21154.92it/s]


finished!


In [24]:
tqdm.pandas(desc="wordadd...")
ques['desc_t2']=ques['desc_t2'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1829900/1829900 [01:24<00:00, 21558.97it/s]


finished!


In [25]:
tqdm.pandas(desc="wordadd...")
ques['q_topic']=ques['q_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1829900/1829900 [01:42<00:00, 17836.63it/s]


finished!


In [26]:
ques.head()

Unnamed: 0,qid,title_t2,desc_t2,q_topic,q_day,q_hour
0,Q2234111670,1.29121,0.0,-15.928334,1018,5
1,Q760329790,-40.540854,-27.272866,-17.548214,1745,20
2,Q741313548,-20.942559,-4.872102,1.576837,2032,21
3,Q3481466230,-20.935588,-18.919992,-1.075748,2185,15
4,Q3966197028,-43.186283,0.0,-15.350362,2269,17


In [27]:
# 加载回答
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
del ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

ans = pd.merge(ans, ques, on='qid')
del ques

[2019-12-17 17:31:13,599] INFO in <ipython-input-27-0f04b531e5be>: ans (4513735, 18)


In [28]:
print(ans['a_day'].min())
print(ans['a_day'].max())

3807
3867


In [29]:
# 回答距提问的天数
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']

[2019-12-17 17:32:43,318] INFO in utils: Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2019-12-17 17:32:43,320] INFO in utils: NumExpr defaulting to 8 threads.


In [30]:
# 时间窗口划分
# train
# val
train_start = 3838
train_end = 3867

val_start = 3868
val_end = 3874

label_end = 3867
label_start = label_end - 6

train_label_feature_end = label_end - 7
train_label_feature_start = train_label_feature_end - 22

train_ans_feature_end = label_end - 7
train_ans_feature_start = train_ans_feature_end - 50

val_label_feature_end = val_start - 1
val_label_feature_start = val_label_feature_end - 22

val_ans_feature_end = val_start - 1
val_ans_feature_start = val_ans_feature_end - 50

train_label_feature = train[(train['day'] >= train_label_feature_start) & (train['day'] <= train_label_feature_end)]
logging.info("train_label_feature %s", train_label_feature.shape)

val_label_feature = train[(train['day'] >= val_label_feature_start) & (train['day'] <= val_label_feature_end)]
logging.info("val_label_feature %s", val_label_feature.shape)

train_label = train[(train['day'] > train_label_feature_end)]

logging.info("train feature start %s end %s, label start %s end %s", train_label_feature['day'].min(),
             train_label_feature['day'].max(), train_label['day'].min(), train_label['day'].max())

logging.info("test feature start %s end %s, label start %s end %s", val_label_feature['day'].min(),
             val_label_feature['day'].max(), test['day'].min(), test['day'].max())

[2019-12-17 17:32:46,596] INFO in <ipython-input-30-413cbf6f4a7d>: train_label_feature (6895493, 5)
[2019-12-17 17:32:49,575] INFO in <ipython-input-30-413cbf6f4a7d>: val_label_feature (7583553, 5)
[2019-12-17 17:32:50,872] INFO in <ipython-input-30-413cbf6f4a7d>: train feature start 3838 end 3860, label start 3861 end 3867
[2019-12-17 17:32:50,986] INFO in <ipython-input-30-413cbf6f4a7d>: test feature start 3845 end 3867, label start 3868 end 3874


In [31]:
# 确定ans的时间范围
# 3807~3874
train_ans_feature = ans[(ans['a_day'] >= train_ans_feature_start) & (ans['a_day'] <= train_ans_feature_end)]

val_ans_feature = ans[(ans['a_day'] >= val_ans_feature_start) & (ans['a_day'] <= val_ans_feature_end)]

logging.info("train ans feature %s, start %s end %s", train_ans_feature.shape, train_ans_feature['a_day'].min(),
             train_ans_feature['a_day'].max())

logging.info("val ans feature %s, start %s end %s", val_ans_feature.shape, val_ans_feature['a_day'].min(),
             val_ans_feature['a_day'].max())

fea_cols = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
            'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
            'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']


def extract_feature1(target, label_feature, ans_feature):
    # 问题特征
    t1 = label_feature.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['qid', 'q_inv_mean', 'q_inv_sum', 'q_inv_std', 'q_inv_count']
    target = pd.merge(target, t1, on='qid', how='left')

    # 用户特征
    t1 = label_feature.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['uid', 'u_inv_mean', 'u_inv_sum', 'u_inv_std', 'u_inv_count']
    target = pd.merge(target, t1, on='uid', how='left')
    #
    # train_size = len(train)
    # data = pd.concat((train, test), sort=True)

    # 回答部分特征

    t1 = ans_feature.groupby('qid')['aid'].count().reset_index()
    t1.columns = ['qid', 'q_ans_count']
    target = pd.merge(target, t1, on='qid', how='left')

    t1 = ans_feature.groupby('uid')['aid'].count().reset_index()
    t1.columns = ['uid', 'u_ans_count']
    target = pd.merge(target, t1, on='uid', how='left')

    for col in fea_cols:
        t1 = ans_feature.groupby('uid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['uid', f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        target = pd.merge(target, t1, on='uid', how='left')

        t1 = ans_feature.groupby('qid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['qid', f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
        target = pd.merge(target, t1, on='qid', how='left')
        logging.info("extract %s", col)
    return target


train_label = extract_feature1(train_label, train_label_feature, train_ans_feature)
test = extract_feature1(test, val_label_feature, val_ans_feature)

[2019-12-17 17:33:00,620] INFO in <ipython-input-31-0511893e776f>: train ans feature (3700178, 25), start 3810 end 3860
[2019-12-17 17:33:00,655] INFO in <ipython-input-31-0511893e776f>: val ans feature (3992334, 25), start 3817 end 3867
[2019-12-17 17:37:34,139] INFO in <ipython-input-31-0511893e776f>: extract is_good
[2019-12-17 17:38:36,290] INFO in <ipython-input-31-0511893e776f>: extract is_rec
[2019-12-17 17:40:50,126] INFO in <ipython-input-31-0511893e776f>: extract is_dest
[2019-12-17 17:42:47,091] INFO in <ipython-input-31-0511893e776f>: extract has_img
[2019-12-17 17:44:42,062] INFO in <ipython-input-31-0511893e776f>: extract has_video
[2019-12-17 17:46:25,391] INFO in <ipython-input-31-0511893e776f>: extract word_count
[2019-12-17 17:48:14,765] INFO in <ipython-input-31-0511893e776f>: extract reci_cheer
[2019-12-17 17:50:10,737] INFO in <ipython-input-31-0511893e776f>: extract reci_uncheer
[2019-12-17 17:51:47,319] INFO in <ipython-input-31-0511893e776f>: extract reci_commen

In [32]:
# 特征提取结束
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)
assert len(test) == sub_size

# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic']

[2019-12-17 18:27:51,974] INFO in <ipython-input-32-d70daf315b0b>: train shape (2593669, 105), test shape (1141718, 104)


In [33]:
dit = {'daily': 4, 'weekly': 3, 'monthly': 2, 'new': 1,'unknow':0}
user['freq'] = user['freq'].map(dit)

In [34]:
user = user.drop(['creat_keyword','level','hot','reg_type','reg_plat'],axis=1)

In [35]:
user.head()

Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,2.0,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,T540,"T21107:1.7915097,T405:1.6123838,T4436:1.518003..."
1,M595924114,male,4.0,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"T44126,T15940,T839,T8978,T2934,T1113,T3914,T12...","T18016:2.0650618,T2384:1.2503042,T1142:1.13569..."
2,M1473482940,female,3.0,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,"T30874,T2113,T8656,T21,T523,T8,T116,T5727,T68,...","T46:1.330939,T2159:1.1296458,T379:1.1241927,T1..."
3,M578477092,male,4.0,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,"T946,T7323,T297,T2660,T36067,T53107,T2654,T507...","T15918:1.9479566,T8106:1.8578106,T4787:1.58486..."
4,M1088794709,male,3.0,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,"T582,T558,T28776,T5186,T9081,T2419,T2693,T2299...","T65:1.5992582,T867:1.3179373,T93:1.2095009,T31..."


In [36]:
user['follow_topic'] = user['follow_topic'].apply(parse_list_1)
user['inter_topic'] = user['inter_topic'].apply(parse_map)

In [37]:
def topic_interest2v(x):
    if len(x)==0:
        return np.zeros(64)
    else:
        tmp=np.zeros(64)
        for i in x:
            tmp = tmp + topic_vector_dict[i]*x[i]
        return (tmp / len(x))

In [38]:
tqdm.pandas(desc="topic2v...")
user['follow_topic']=user['follow_topic'].progress_apply(lambda x:topic2v(x))
print('finished!')

topic2v...: 100%|██████████| 1931654/1931654 [06:09<00:00, 5228.78it/s] 


finished!


In [39]:
tqdm.pandas(desc="topic_interest2v...")
user['inter_topic']=user['inter_topic'].progress_apply(lambda x:topic_interest2v(x))
user.head()

topic_interest2v...: 100%|██████████| 1931654/1931654 [04:48<00:00, 6698.51it/s] 


Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,2.0,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,"[6.125305, -1.4180568, -1.3143845, -2.9268239,...","[0.5255145018431924, 2.8507211193389996, 0.571..."
1,M595924114,male,4.0,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"[0.30575216599999994, -0.9980522930666669, 0.0...","[-1.9947685124736019, 1.3387408371544118, -0.3..."
2,M1473482940,female,3.0,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,"[0.1508028030833333, 0.7434654864583335, -0.25...","[-0.25824397617484, 0.6167275103940157, -0.075..."
3,M578477092,male,4.0,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,"[1.4605925141999998, -0.33966116570100013, 0.3...","[1.4908855224214492, 1.4438666038597823, 1.179..."
4,M1088794709,male,3.0,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,"[0.9643650885294115, 0.31516375361764704, 0.82...","[1.5879302100003165, -1.1453530874701519, -0.7..."


In [40]:
tqdm.pandas(desc="wordadd...")
user['follow_topic']=user['follow_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1931654/1931654 [01:00<00:00, 31883.72it/s]


finished!


In [41]:
tqdm.pandas(desc="wordadd...")
user['inter_topic']=user['inter_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1931654/1931654 [00:55<00:00, 35069.40it/s] 


finished!


In [42]:
user.head()

Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,2.0,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,-9.543027,11.425364
1,M595924114,male,4.0,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,2.840966,-9.270188
2,M1473482940,female,3.0,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,1.337653,3.0734
3,M578477092,male,4.0,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,-0.832412,-4.796925
4,M1088794709,male,3.0,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,-6.422263,-14.987697


In [43]:

logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid','freq']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

q_lb = LabelEncoder()
q_lb.fit(list(train_label['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train_label['qid_enc'] = q_lb.transform(train_label['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train_label['uid_enc'] = u_lb.transform(train_label['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# merge user
train_label = pd.merge(train_label, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

data = pd.concat((train_label, test), axis=0, sort=True)
# del train_label, test

[2019-12-17 18:45:53,185] INFO in <ipython-input-43-9e402841ed3b>: user (1931654, 16)
[2019-12-17 18:46:28,490] INFO in <ipython-input-43-9e402841ed3b>: user unq uid             1931654
gender                3
freq                  4
uf_b1                 2
uf_b2                 2
uf_b3                 2
uf_b4                 2
uf_b5                 2
uf_c1              2561
uf_c2               291
uf_c3               428
uf_c4              1556
uf_c5                 2
score               732
follow_topic    1288996
inter_topic     1399663
dtype: int64
[2019-12-17 18:46:28,512] INFO in <ipython-input-43-9e402841ed3b>: user cat ['gender', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-17 18:46:31,053] INFO in <ipython-input-43-9e402841ed3b>: encode gender
[2019-12-17 18:46:33,230] INFO in <ipython-input-43-9e402841ed3b>: encode uf_c1
[2019-12-17 18:46:35,207] INFO in <ipython-input-43-9e402841ed3b>: encode uf_c2
[2019-12-17 18:46:37,086] INFO in <ipython-input-43-9e402841ed3b>: e

In [44]:
train_label.head()

Unnamed: 0,qid,uid,label,day,hour,q_inv_mean,q_inv_sum,q_inv_std,q_inv_count,u_inv_mean,...,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,Q2166419046,M401693808,0,3865,22,,,,,0.0,...,0,0,2113,190,261,927,1,297,-7.818888,1.389115
1,Q604029601,M2317670257,0,3862,15,,,,,0.090909,...,0,0,1519,229,0,506,1,415,-0.998302,-11.683995
2,Q2443223942,M3544409350,0,3867,4,0.375,57.0,0.485723,152.0,0.0,...,0,0,551,226,188,815,1,296,0.0,4.540293
3,Q795459266,M2818659842,0,3861,20,0.166667,1.0,0.408248,6.0,0.285714,...,0,0,1519,229,0,506,1,380,-19.358392,-12.257125
4,Q110462128,M848334644,1,3862,8,,,,,0.634146,...,0,0,2161,31,396,1438,1,719,-6.080521,-11.315631


In [45]:
test.head()

Unnamed: 0,qid,uid,day,hour,q_inv_mean,q_inv_sum,q_inv_std,q_inv_count,u_inv_mean,u_inv_sum,...,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,Q3273481096,M1267743167,3871,6,,,,,0.347826,16.0,...,1,0,1688,190,203,443,1,696,-5.481045,5.671983
1,Q4224184733,M2715893043,3871,23,,,,,0.095238,2.0,...,1,0,1190,130,275,1330,1,573,-5.67406,-27.750894
2,Q1832714071,M2244950365,3874,15,,,,,0.142857,1.0,...,0,0,1190,130,203,758,1,323,0.0,-7.115444
3,Q3594972263,M2321407666,3872,10,0.62963,17.0,0.492103,27.0,,,...,0,0,1190,130,288,758,1,294,-2.06347,-35.260996
4,Q403456350,M1091084170,3870,9,,,,,0.2,1.0,...,0,0,2270,271,306,758,1,449,17.825838,6.284933


In [46]:
data.head()

Unnamed: 0,day,follow_topic,freq,gender,hour,inter_topic,label,q_ans_count,q_diff_qa_days_max,q_diff_qa_days_mean,...,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,uid,uid_enc
0,3865,-7.818888,3.0,2,22,1.389115,0.0,,,,...,0,0,0,2113,190,261,927,1,M401693808,1508098
1,3862,-0.998302,3.0,2,15,-11.683995,0.0,,,,...,0,0,0,1519,229,0,506,1,M2317670257,657985
2,3867,0.0,2.0,2,4,4.540293,0.0,32.0,13.0,9.53125,...,0,0,0,551,226,188,815,1,M3544409350,1272353
3,3861,-19.358392,4.0,1,20,-12.257125,0.0,3.0,234.0,222.333333,...,0,0,0,1519,229,0,506,1,M2818659842,909154
4,3862,-6.080521,3.0,0,8,-11.315631,1.0,,,,...,0,0,0,2161,31,396,1438,1,M848334644,1856019


In [47]:
# count编码
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())
#     # 

# # 问题被回答的次数

In [48]:
# 压缩数据
t = data.dtypes
for x in t[t == 'int64'].index:
    data[x] = data[x].astype('int32')

for x in t[t == 'float64'].index:
    data[x] = data[x].astype('float32')

data['wk'] = data['day'] % 7

feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

In [None]:
# target编码
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len(train_label)][feature_cols]
y_train_all = data.iloc[:len(train_label)]['label']
test = data.iloc[len(train_label):]
# del data
assert len(test) == sub_size

logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

model_lgb = LGBMClassifier(n_estimators=3000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)

sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]


sub.to_csv('../example/result_1217_invite2.txt', index=None, header=None, sep='\t')

[2019-12-17 18:57:32,575] INFO in <ipython-input-49-860e4fdfec07>: feature size 128
[2019-12-17 18:57:34,340] INFO in <ipython-input-49-860e4fdfec07>: train shape (2593669, 122), test shape (1141718, 132)


[1]	valid_0's auc: 0.75976	valid_0's binary_logloss: 0.42656
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.775308	valid_0's binary_logloss: 0.416497
[3]	valid_0's auc: 0.779558	valid_0's binary_logloss: 0.408378
[4]	valid_0's auc: 0.785098	valid_0's binary_logloss: 0.401546
[5]	valid_0's auc: 0.786935	valid_0's binary_logloss: 0.395927
[6]	valid_0's auc: 0.790076	valid_0's binary_logloss: 0.390779
[7]	valid_0's auc: 0.7921	valid_0's binary_logloss: 0.386575
[8]	valid_0's auc: 0.793813	valid_0's binary_logloss: 0.382857
[9]	valid_0's auc: 0.795286	valid_0's binary_logloss: 0.379494
[10]	valid_0's auc: 0.796636	valid_0's binary_logloss: 0.376595
[11]	valid_0's auc: 0.79737	valid_0's binary_logloss: 0.374088
[12]	valid_0's auc: 0.798314	valid_0's binary_logloss: 0.371749
[13]	valid_0's auc: 0.799481	valid_0's binary_logloss: 0.369759
[14]	valid_0's auc: 0.800445	valid_0's binary_logloss: 0.367978
[15]	valid_0's auc: 0.801639	valid_0's binary_logloss: 0.

[129]	valid_0's auc: 0.829632	valid_0's binary_logloss: 0.335678
[130]	valid_0's auc: 0.829655	valid_0's binary_logloss: 0.33566
[131]	valid_0's auc: 0.829721	valid_0's binary_logloss: 0.335605
[132]	valid_0's auc: 0.829799	valid_0's binary_logloss: 0.335536
[133]	valid_0's auc: 0.829858	valid_0's binary_logloss: 0.335487
[134]	valid_0's auc: 0.829907	valid_0's binary_logloss: 0.335448
[135]	valid_0's auc: 0.829935	valid_0's binary_logloss: 0.335427
[136]	valid_0's auc: 0.829994	valid_0's binary_logloss: 0.335382
[137]	valid_0's auc: 0.830032	valid_0's binary_logloss: 0.335354
[138]	valid_0's auc: 0.830076	valid_0's binary_logloss: 0.335317
[139]	valid_0's auc: 0.830133	valid_0's binary_logloss: 0.335276
[140]	valid_0's auc: 0.83015	valid_0's binary_logloss: 0.335258
[141]	valid_0's auc: 0.830172	valid_0's binary_logloss: 0.33524
[142]	valid_0's auc: 0.830251	valid_0's binary_logloss: 0.335184
[143]	valid_0's auc: 0.830321	valid_0's binary_logloss: 0.335124
[144]	valid_0's auc: 0.83036

[256]	valid_0's auc: 0.833078	valid_0's binary_logloss: 0.332745
[257]	valid_0's auc: 0.833126	valid_0's binary_logloss: 0.332708
[258]	valid_0's auc: 0.833179	valid_0's binary_logloss: 0.332663
[259]	valid_0's auc: 0.833222	valid_0's binary_logloss: 0.332622
[260]	valid_0's auc: 0.833227	valid_0's binary_logloss: 0.332618
[261]	valid_0's auc: 0.833231	valid_0's binary_logloss: 0.332615
[262]	valid_0's auc: 0.833243	valid_0's binary_logloss: 0.332604
[263]	valid_0's auc: 0.833252	valid_0's binary_logloss: 0.332598
[264]	valid_0's auc: 0.833253	valid_0's binary_logloss: 0.332599
[265]	valid_0's auc: 0.833256	valid_0's binary_logloss: 0.332597
[266]	valid_0's auc: 0.833258	valid_0's binary_logloss: 0.332596
[267]	valid_0's auc: 0.833273	valid_0's binary_logloss: 0.332586
[268]	valid_0's auc: 0.833301	valid_0's binary_logloss: 0.332564
[269]	valid_0's auc: 0.833299	valid_0's binary_logloss: 0.332565
[270]	valid_0's auc: 0.833301	valid_0's binary_logloss: 0.332565
[271]	valid_0's auc: 0.83

[383]	valid_0's auc: 0.834688	valid_0's binary_logloss: 0.331368
[384]	valid_0's auc: 0.834693	valid_0's binary_logloss: 0.331364
[385]	valid_0's auc: 0.834709	valid_0's binary_logloss: 0.33135
[386]	valid_0's auc: 0.834733	valid_0's binary_logloss: 0.331325
[387]	valid_0's auc: 0.834737	valid_0's binary_logloss: 0.331321
[388]	valid_0's auc: 0.834735	valid_0's binary_logloss: 0.331323
[389]	valid_0's auc: 0.834734	valid_0's binary_logloss: 0.331324
[390]	valid_0's auc: 0.834744	valid_0's binary_logloss: 0.331315
[391]	valid_0's auc: 0.834741	valid_0's binary_logloss: 0.331317
[392]	valid_0's auc: 0.834752	valid_0's binary_logloss: 0.331307
[393]	valid_0's auc: 0.834752	valid_0's binary_logloss: 0.331307
[394]	valid_0's auc: 0.83477	valid_0's binary_logloss: 0.331293
[395]	valid_0's auc: 0.834781	valid_0's binary_logloss: 0.331284
[396]	valid_0's auc: 0.834782	valid_0's binary_logloss: 0.331284
[397]	valid_0's auc: 0.83479	valid_0's binary_logloss: 0.331275
[398]	valid_0's auc: 0.83480

[510]	valid_0's auc: 0.836009	valid_0's binary_logloss: 0.330176
[511]	valid_0's auc: 0.836013	valid_0's binary_logloss: 0.330173
[512]	valid_0's auc: 0.836032	valid_0's binary_logloss: 0.330159
[513]	valid_0's auc: 0.836038	valid_0's binary_logloss: 0.330154
[514]	valid_0's auc: 0.83606	valid_0's binary_logloss: 0.330127
[515]	valid_0's auc: 0.836071	valid_0's binary_logloss: 0.330118
[516]	valid_0's auc: 0.836088	valid_0's binary_logloss: 0.330101
[517]	valid_0's auc: 0.836097	valid_0's binary_logloss: 0.330094
[518]	valid_0's auc: 0.836099	valid_0's binary_logloss: 0.330093
[519]	valid_0's auc: 0.8361	valid_0's binary_logloss: 0.330091
[520]	valid_0's auc: 0.836106	valid_0's binary_logloss: 0.330086
[521]	valid_0's auc: 0.836103	valid_0's binary_logloss: 0.330086
[522]	valid_0's auc: 0.8361	valid_0's binary_logloss: 0.330088
[523]	valid_0's auc: 0.836102	valid_0's binary_logloss: 0.330087
[524]	valid_0's auc: 0.836125	valid_0's binary_logloss: 0.330067
[525]	valid_0's auc: 0.836131	

[637]	valid_0's auc: 0.836793	valid_0's binary_logloss: 0.329491
[638]	valid_0's auc: 0.836801	valid_0's binary_logloss: 0.329484
[639]	valid_0's auc: 0.836807	valid_0's binary_logloss: 0.329479
[640]	valid_0's auc: 0.836812	valid_0's binary_logloss: 0.329475
[641]	valid_0's auc: 0.836812	valid_0's binary_logloss: 0.329475
[642]	valid_0's auc: 0.836811	valid_0's binary_logloss: 0.329476
[643]	valid_0's auc: 0.836814	valid_0's binary_logloss: 0.329474
[644]	valid_0's auc: 0.836824	valid_0's binary_logloss: 0.329467
[645]	valid_0's auc: 0.836821	valid_0's binary_logloss: 0.329469
[646]	valid_0's auc: 0.836827	valid_0's binary_logloss: 0.329466
[647]	valid_0's auc: 0.83683	valid_0's binary_logloss: 0.329463
[648]	valid_0's auc: 0.836832	valid_0's binary_logloss: 0.329461
[649]	valid_0's auc: 0.836834	valid_0's binary_logloss: 0.329458
[650]	valid_0's auc: 0.836839	valid_0's binary_logloss: 0.329454
[651]	valid_0's auc: 0.836837	valid_0's binary_logloss: 0.329455
[652]	valid_0's auc: 0.836

[764]	valid_0's auc: 0.837321	valid_0's binary_logloss: 0.329035
[765]	valid_0's auc: 0.837326	valid_0's binary_logloss: 0.329031
[766]	valid_0's auc: 0.837339	valid_0's binary_logloss: 0.32902
[767]	valid_0's auc: 0.837341	valid_0's binary_logloss: 0.329019
[768]	valid_0's auc: 0.837342	valid_0's binary_logloss: 0.329017
[769]	valid_0's auc: 0.837344	valid_0's binary_logloss: 0.329015
[770]	valid_0's auc: 0.837355	valid_0's binary_logloss: 0.329007
[771]	valid_0's auc: 0.837365	valid_0's binary_logloss: 0.328999
[772]	valid_0's auc: 0.837364	valid_0's binary_logloss: 0.329001
[773]	valid_0's auc: 0.837369	valid_0's binary_logloss: 0.328998
[774]	valid_0's auc: 0.837365	valid_0's binary_logloss: 0.329
[775]	valid_0's auc: 0.83737	valid_0's binary_logloss: 0.328996
[776]	valid_0's auc: 0.837374	valid_0's binary_logloss: 0.328994
[777]	valid_0's auc: 0.837377	valid_0's binary_logloss: 0.328991
[778]	valid_0's auc: 0.837378	valid_0's binary_logloss: 0.328991
[779]	valid_0's auc: 0.837383	

[891]	valid_0's auc: 0.837861	valid_0's binary_logloss: 0.328577
[892]	valid_0's auc: 0.837863	valid_0's binary_logloss: 0.328574
[893]	valid_0's auc: 0.837862	valid_0's binary_logloss: 0.328575
[894]	valid_0's auc: 0.837864	valid_0's binary_logloss: 0.328573
[895]	valid_0's auc: 0.837871	valid_0's binary_logloss: 0.328569
[896]	valid_0's auc: 0.83787	valid_0's binary_logloss: 0.328569
[897]	valid_0's auc: 0.837877	valid_0's binary_logloss: 0.328564
[898]	valid_0's auc: 0.837881	valid_0's binary_logloss: 0.328562
[899]	valid_0's auc: 0.837878	valid_0's binary_logloss: 0.328564
[900]	valid_0's auc: 0.837888	valid_0's binary_logloss: 0.328557
[901]	valid_0's auc: 0.837885	valid_0's binary_logloss: 0.328559
[902]	valid_0's auc: 0.837894	valid_0's binary_logloss: 0.32855
[903]	valid_0's auc: 0.8379	valid_0's binary_logloss: 0.328545
[904]	valid_0's auc: 0.837899	valid_0's binary_logloss: 0.328545
[905]	valid_0's auc: 0.837902	valid_0's binary_logloss: 0.328544
[906]	valid_0's auc: 0.837904

[1018]	valid_0's auc: 0.838292	valid_0's binary_logloss: 0.328213
[1019]	valid_0's auc: 0.838296	valid_0's binary_logloss: 0.328209
[1020]	valid_0's auc: 0.838293	valid_0's binary_logloss: 0.32821
[1021]	valid_0's auc: 0.838303	valid_0's binary_logloss: 0.328204
[1022]	valid_0's auc: 0.838305	valid_0's binary_logloss: 0.328204
[1023]	valid_0's auc: 0.838312	valid_0's binary_logloss: 0.328198
[1024]	valid_0's auc: 0.83832	valid_0's binary_logloss: 0.32819
[1025]	valid_0's auc: 0.838328	valid_0's binary_logloss: 0.328183
[1026]	valid_0's auc: 0.838331	valid_0's binary_logloss: 0.328181
[1027]	valid_0's auc: 0.838332	valid_0's binary_logloss: 0.32818
[1028]	valid_0's auc: 0.838335	valid_0's binary_logloss: 0.328178
[1029]	valid_0's auc: 0.838339	valid_0's binary_logloss: 0.328174
[1030]	valid_0's auc: 0.838337	valid_0's binary_logloss: 0.328175
[1031]	valid_0's auc: 0.838335	valid_0's binary_logloss: 0.328176
[1032]	valid_0's auc: 0.838344	valid_0's binary_logloss: 0.328171
[1033]	valid_0

[1143]	valid_0's auc: 0.838605	valid_0's binary_logloss: 0.327957
[1144]	valid_0's auc: 0.838602	valid_0's binary_logloss: 0.327959
[1145]	valid_0's auc: 0.8386	valid_0's binary_logloss: 0.32796
[1146]	valid_0's auc: 0.838598	valid_0's binary_logloss: 0.327962
[1147]	valid_0's auc: 0.838597	valid_0's binary_logloss: 0.327961
[1148]	valid_0's auc: 0.838595	valid_0's binary_logloss: 0.327962
[1149]	valid_0's auc: 0.838595	valid_0's binary_logloss: 0.327962
[1150]	valid_0's auc: 0.838598	valid_0's binary_logloss: 0.327959
[1151]	valid_0's auc: 0.838597	valid_0's binary_logloss: 0.327959
[1152]	valid_0's auc: 0.838599	valid_0's binary_logloss: 0.327958
[1153]	valid_0's auc: 0.838605	valid_0's binary_logloss: 0.327955
[1154]	valid_0's auc: 0.838614	valid_0's binary_logloss: 0.327949
[1155]	valid_0's auc: 0.838614	valid_0's binary_logloss: 0.327948
[1156]	valid_0's auc: 0.838617	valid_0's binary_logloss: 0.327945
[1157]	valid_0's auc: 0.838618	valid_0's binary_logloss: 0.327944
[1158]	valid_

[1268]	valid_0's auc: 0.838932	valid_0's binary_logloss: 0.327689
[1269]	valid_0's auc: 0.838933	valid_0's binary_logloss: 0.327687
[1270]	valid_0's auc: 0.838943	valid_0's binary_logloss: 0.32768
[1271]	valid_0's auc: 0.838941	valid_0's binary_logloss: 0.327681
[1272]	valid_0's auc: 0.838938	valid_0's binary_logloss: 0.327683
[1273]	valid_0's auc: 0.838935	valid_0's binary_logloss: 0.327684
[1274]	valid_0's auc: 0.838935	valid_0's binary_logloss: 0.327685
[1275]	valid_0's auc: 0.838937	valid_0's binary_logloss: 0.327684
[1276]	valid_0's auc: 0.838943	valid_0's binary_logloss: 0.327678
[1277]	valid_0's auc: 0.838952	valid_0's binary_logloss: 0.327671
[1278]	valid_0's auc: 0.838957	valid_0's binary_logloss: 0.327668
[1279]	valid_0's auc: 0.838965	valid_0's binary_logloss: 0.327663
[1280]	valid_0's auc: 0.838967	valid_0's binary_logloss: 0.327661
[1281]	valid_0's auc: 0.838968	valid_0's binary_logloss: 0.327659
[1282]	valid_0's auc: 0.838973	valid_0's binary_logloss: 0.327655
[1283]	vali

[1393]	valid_0's auc: 0.839325	valid_0's binary_logloss: 0.327349
[1394]	valid_0's auc: 0.839328	valid_0's binary_logloss: 0.327346
[1395]	valid_0's auc: 0.839331	valid_0's binary_logloss: 0.327342
[1396]	valid_0's auc: 0.839335	valid_0's binary_logloss: 0.32734
[1397]	valid_0's auc: 0.839342	valid_0's binary_logloss: 0.327335
[1398]	valid_0's auc: 0.839341	valid_0's binary_logloss: 0.327334
[1399]	valid_0's auc: 0.839348	valid_0's binary_logloss: 0.327328
[1400]	valid_0's auc: 0.839353	valid_0's binary_logloss: 0.327323
[1401]	valid_0's auc: 0.839353	valid_0's binary_logloss: 0.327324
[1402]	valid_0's auc: 0.839352	valid_0's binary_logloss: 0.327325
[1403]	valid_0's auc: 0.839348	valid_0's binary_logloss: 0.327329
[1404]	valid_0's auc: 0.839346	valid_0's binary_logloss: 0.32733
[1405]	valid_0's auc: 0.83935	valid_0's binary_logloss: 0.327327
[1406]	valid_0's auc: 0.839362	valid_0's binary_logloss: 0.32732
[1407]	valid_0's auc: 0.839372	valid_0's binary_logloss: 0.32731
[1408]	valid_0'

[1518]	valid_0's auc: 0.839683	valid_0's binary_logloss: 0.327041
[1519]	valid_0's auc: 0.839683	valid_0's binary_logloss: 0.327041
[1520]	valid_0's auc: 0.839681	valid_0's binary_logloss: 0.327043
[1521]	valid_0's auc: 0.839681	valid_0's binary_logloss: 0.327041
[1522]	valid_0's auc: 0.83968	valid_0's binary_logloss: 0.327042
[1523]	valid_0's auc: 0.839679	valid_0's binary_logloss: 0.327042
[1524]	valid_0's auc: 0.839691	valid_0's binary_logloss: 0.327035
[1525]	valid_0's auc: 0.839689	valid_0's binary_logloss: 0.327036
[1526]	valid_0's auc: 0.83969	valid_0's binary_logloss: 0.327035
[1527]	valid_0's auc: 0.839692	valid_0's binary_logloss: 0.327033
[1528]	valid_0's auc: 0.839692	valid_0's binary_logloss: 0.327032
[1529]	valid_0's auc: 0.839689	valid_0's binary_logloss: 0.327034
[1530]	valid_0's auc: 0.839701	valid_0's binary_logloss: 0.327024
[1531]	valid_0's auc: 0.839705	valid_0's binary_logloss: 0.327021
[1532]	valid_0's auc: 0.839704	valid_0's binary_logloss: 0.327021
[1533]	valid

[1643]	valid_0's auc: 0.839928	valid_0's binary_logloss: 0.326839
[1644]	valid_0's auc: 0.839932	valid_0's binary_logloss: 0.326835
[1645]	valid_0's auc: 0.839931	valid_0's binary_logloss: 0.326836
[1646]	valid_0's auc: 0.839933	valid_0's binary_logloss: 0.326835
[1647]	valid_0's auc: 0.839939	valid_0's binary_logloss: 0.326829
[1648]	valid_0's auc: 0.839942	valid_0's binary_logloss: 0.326827
[1649]	valid_0's auc: 0.839946	valid_0's binary_logloss: 0.326824
[1650]	valid_0's auc: 0.839945	valid_0's binary_logloss: 0.326823
[1651]	valid_0's auc: 0.839943	valid_0's binary_logloss: 0.326825
[1652]	valid_0's auc: 0.839943	valid_0's binary_logloss: 0.326824
[1653]	valid_0's auc: 0.839946	valid_0's binary_logloss: 0.326822
[1654]	valid_0's auc: 0.839948	valid_0's binary_logloss: 0.32682
[1655]	valid_0's auc: 0.839951	valid_0's binary_logloss: 0.326818
[1656]	valid_0's auc: 0.839948	valid_0's binary_logloss: 0.32682
[1657]	valid_0's auc: 0.839955	valid_0's binary_logloss: 0.326816
[1658]	valid

[1768]	valid_0's auc: 0.840169	valid_0's binary_logloss: 0.326631
[1769]	valid_0's auc: 0.840167	valid_0's binary_logloss: 0.326631
[1770]	valid_0's auc: 0.840167	valid_0's binary_logloss: 0.326632
[1771]	valid_0's auc: 0.840166	valid_0's binary_logloss: 0.326633
[1772]	valid_0's auc: 0.840164	valid_0's binary_logloss: 0.326634
[1773]	valid_0's auc: 0.840161	valid_0's binary_logloss: 0.326636
[1774]	valid_0's auc: 0.840162	valid_0's binary_logloss: 0.326636
[1775]	valid_0's auc: 0.840162	valid_0's binary_logloss: 0.326635
[1776]	valid_0's auc: 0.840164	valid_0's binary_logloss: 0.326634
[1777]	valid_0's auc: 0.840171	valid_0's binary_logloss: 0.326628
[1778]	valid_0's auc: 0.840175	valid_0's binary_logloss: 0.326625
[1779]	valid_0's auc: 0.840178	valid_0's binary_logloss: 0.326623
[1780]	valid_0's auc: 0.840176	valid_0's binary_logloss: 0.326624
[1781]	valid_0's auc: 0.840175	valid_0's binary_logloss: 0.326625
[1782]	valid_0's auc: 0.840174	valid_0's binary_logloss: 0.326626
[1783]	val

[1893]	valid_0's auc: 0.84035	valid_0's binary_logloss: 0.326472
[1894]	valid_0's auc: 0.840352	valid_0's binary_logloss: 0.326471
[1895]	valid_0's auc: 0.840351	valid_0's binary_logloss: 0.326472
[1896]	valid_0's auc: 0.840358	valid_0's binary_logloss: 0.326466
[1897]	valid_0's auc: 0.840358	valid_0's binary_logloss: 0.326466
[1898]	valid_0's auc: 0.840356	valid_0's binary_logloss: 0.326466
[1899]	valid_0's auc: 0.840354	valid_0's binary_logloss: 0.326467
[1900]	valid_0's auc: 0.840355	valid_0's binary_logloss: 0.326466
[1901]	valid_0's auc: 0.840352	valid_0's binary_logloss: 0.326468
[1902]	valid_0's auc: 0.840358	valid_0's binary_logloss: 0.326463
[1903]	valid_0's auc: 0.840369	valid_0's binary_logloss: 0.326453
[1904]	valid_0's auc: 0.840375	valid_0's binary_logloss: 0.326447
[1905]	valid_0's auc: 0.840372	valid_0's binary_logloss: 0.326449
[1906]	valid_0's auc: 0.840375	valid_0's binary_logloss: 0.326446
[1907]	valid_0's auc: 0.840379	valid_0's binary_logloss: 0.326442
[1908]	vali

[2018]	valid_0's auc: 0.840612	valid_0's binary_logloss: 0.326246
[2019]	valid_0's auc: 0.840617	valid_0's binary_logloss: 0.326241
[2020]	valid_0's auc: 0.840622	valid_0's binary_logloss: 0.326238
[2021]	valid_0's auc: 0.840627	valid_0's binary_logloss: 0.326234
[2022]	valid_0's auc: 0.840629	valid_0's binary_logloss: 0.326233
[2023]	valid_0's auc: 0.840627	valid_0's binary_logloss: 0.326234
[2024]	valid_0's auc: 0.840628	valid_0's binary_logloss: 0.326234
[2025]	valid_0's auc: 0.840632	valid_0's binary_logloss: 0.326229
[2026]	valid_0's auc: 0.840638	valid_0's binary_logloss: 0.326226
[2027]	valid_0's auc: 0.840643	valid_0's binary_logloss: 0.326221
[2028]	valid_0's auc: 0.840647	valid_0's binary_logloss: 0.326219
[2029]	valid_0's auc: 0.840647	valid_0's binary_logloss: 0.32622
[2030]	valid_0's auc: 0.840645	valid_0's binary_logloss: 0.326221
[2031]	valid_0's auc: 0.840647	valid_0's binary_logloss: 0.32622
[2032]	valid_0's auc: 0.840646	valid_0's binary_logloss: 0.326221
[2033]	valid

[2143]	valid_0's auc: 0.840771	valid_0's binary_logloss: 0.326106
[2144]	valid_0's auc: 0.840772	valid_0's binary_logloss: 0.326105
[2145]	valid_0's auc: 0.840774	valid_0's binary_logloss: 0.326104
[2146]	valid_0's auc: 0.840774	valid_0's binary_logloss: 0.326105
[2147]	valid_0's auc: 0.840778	valid_0's binary_logloss: 0.326101
[2148]	valid_0's auc: 0.840778	valid_0's binary_logloss: 0.3261
[2149]	valid_0's auc: 0.840779	valid_0's binary_logloss: 0.3261
[2150]	valid_0's auc: 0.840782	valid_0's binary_logloss: 0.326098
[2151]	valid_0's auc: 0.840786	valid_0's binary_logloss: 0.326093
[2152]	valid_0's auc: 0.840787	valid_0's binary_logloss: 0.326091
[2153]	valid_0's auc: 0.840785	valid_0's binary_logloss: 0.326093
[2154]	valid_0's auc: 0.840788	valid_0's binary_logloss: 0.326091
[2155]	valid_0's auc: 0.840793	valid_0's binary_logloss: 0.326087
[2156]	valid_0's auc: 0.840793	valid_0's binary_logloss: 0.326087
[2157]	valid_0's auc: 0.840797	valid_0's binary_logloss: 0.326084
[2158]	valid_0

[2268]	valid_0's auc: 0.840938	valid_0's binary_logloss: 0.325958
[2269]	valid_0's auc: 0.840937	valid_0's binary_logloss: 0.32596
[2270]	valid_0's auc: 0.840937	valid_0's binary_logloss: 0.325959
[2271]	valid_0's auc: 0.840938	valid_0's binary_logloss: 0.325958
[2272]	valid_0's auc: 0.840938	valid_0's binary_logloss: 0.325958
[2273]	valid_0's auc: 0.84094	valid_0's binary_logloss: 0.325957
[2274]	valid_0's auc: 0.84094	valid_0's binary_logloss: 0.325957
[2275]	valid_0's auc: 0.84094	valid_0's binary_logloss: 0.325957
[2276]	valid_0's auc: 0.840939	valid_0's binary_logloss: 0.325958
[2277]	valid_0's auc: 0.840939	valid_0's binary_logloss: 0.325957
[2278]	valid_0's auc: 0.840943	valid_0's binary_logloss: 0.325955
[2279]	valid_0's auc: 0.840942	valid_0's binary_logloss: 0.325955
[2280]	valid_0's auc: 0.84094	valid_0's binary_logloss: 0.325957
[2281]	valid_0's auc: 0.840943	valid_0's binary_logloss: 0.325954
[2282]	valid_0's auc: 0.84094	valid_0's binary_logloss: 0.325957
[2283]	valid_0's

[2393]	valid_0's auc: 0.841122	valid_0's binary_logloss: 0.325774
[2394]	valid_0's auc: 0.841119	valid_0's binary_logloss: 0.325776
[2395]	valid_0's auc: 0.841119	valid_0's binary_logloss: 0.325776
[2396]	valid_0's auc: 0.841118	valid_0's binary_logloss: 0.325776
[2397]	valid_0's auc: 0.841117	valid_0's binary_logloss: 0.325777
[2398]	valid_0's auc: 0.841118	valid_0's binary_logloss: 0.325776
[2399]	valid_0's auc: 0.841127	valid_0's binary_logloss: 0.325768
[2400]	valid_0's auc: 0.841128	valid_0's binary_logloss: 0.325766
[2401]	valid_0's auc: 0.841129	valid_0's binary_logloss: 0.325766
[2402]	valid_0's auc: 0.841152	valid_0's binary_logloss: 0.325749
[2403]	valid_0's auc: 0.841155	valid_0's binary_logloss: 0.325747
[2404]	valid_0's auc: 0.841155	valid_0's binary_logloss: 0.325746
[2405]	valid_0's auc: 0.841163	valid_0's binary_logloss: 0.325739
[2406]	valid_0's auc: 0.841166	valid_0's binary_logloss: 0.325737
[2407]	valid_0's auc: 0.841166	valid_0's binary_logloss: 0.325737
[2408]	val

[2518]	valid_0's auc: 0.841298	valid_0's binary_logloss: 0.325628
[2519]	valid_0's auc: 0.841298	valid_0's binary_logloss: 0.325629
[2520]	valid_0's auc: 0.841299	valid_0's binary_logloss: 0.325628
[2521]	valid_0's auc: 0.841298	valid_0's binary_logloss: 0.325628
[2522]	valid_0's auc: 0.841298	valid_0's binary_logloss: 0.325628
[2523]	valid_0's auc: 0.841298	valid_0's binary_logloss: 0.32563
[2524]	valid_0's auc: 0.841296	valid_0's binary_logloss: 0.325631
[2525]	valid_0's auc: 0.841295	valid_0's binary_logloss: 0.325631
[2526]	valid_0's auc: 0.841295	valid_0's binary_logloss: 0.325631
[2527]	valid_0's auc: 0.841298	valid_0's binary_logloss: 0.325628
[2528]	valid_0's auc: 0.841298	valid_0's binary_logloss: 0.325628
[2529]	valid_0's auc: 0.841297	valid_0's binary_logloss: 0.325628
[2530]	valid_0's auc: 0.841297	valid_0's binary_logloss: 0.325628
[2531]	valid_0's auc: 0.841298	valid_0's binary_logloss: 0.325626
[2532]	valid_0's auc: 0.8413	valid_0's binary_logloss: 0.325624
[2533]	valid_

[2643]	valid_0's auc: 0.841394	valid_0's binary_logloss: 0.325558
[2644]	valid_0's auc: 0.841401	valid_0's binary_logloss: 0.325551
[2645]	valid_0's auc: 0.841405	valid_0's binary_logloss: 0.325549
[2646]	valid_0's auc: 0.841404	valid_0's binary_logloss: 0.325549
[2647]	valid_0's auc: 0.84141	valid_0's binary_logloss: 0.325546
[2648]	valid_0's auc: 0.841409	valid_0's binary_logloss: 0.325546
[2649]	valid_0's auc: 0.841418	valid_0's binary_logloss: 0.325538
[2650]	valid_0's auc: 0.841421	valid_0's binary_logloss: 0.325536
[2651]	valid_0's auc: 0.841427	valid_0's binary_logloss: 0.32553
[2652]	valid_0's auc: 0.841425	valid_0's binary_logloss: 0.32553
[2653]	valid_0's auc: 0.841428	valid_0's binary_logloss: 0.325528
[2654]	valid_0's auc: 0.841428	valid_0's binary_logloss: 0.325529
[2655]	valid_0's auc: 0.841429	valid_0's binary_logloss: 0.325527
[2656]	valid_0's auc: 0.841438	valid_0's binary_logloss: 0.325518
[2657]	valid_0's auc: 0.84144	valid_0's binary_logloss: 0.325516
[2658]	valid_0

[2768]	valid_0's auc: 0.84154	valid_0's binary_logloss: 0.325444
[2769]	valid_0's auc: 0.841542	valid_0's binary_logloss: 0.325443
[2770]	valid_0's auc: 0.841544	valid_0's binary_logloss: 0.32544
[2771]	valid_0's auc: 0.84155	valid_0's binary_logloss: 0.325436
[2772]	valid_0's auc: 0.841556	valid_0's binary_logloss: 0.325432
[2773]	valid_0's auc: 0.841558	valid_0's binary_logloss: 0.325432
[2774]	valid_0's auc: 0.841565	valid_0's binary_logloss: 0.325426
[2775]	valid_0's auc: 0.841567	valid_0's binary_logloss: 0.325424
[2776]	valid_0's auc: 0.841573	valid_0's binary_logloss: 0.325421
[2777]	valid_0's auc: 0.841576	valid_0's binary_logloss: 0.325419
[2778]	valid_0's auc: 0.841574	valid_0's binary_logloss: 0.325419
[2779]	valid_0's auc: 0.841577	valid_0's binary_logloss: 0.325418
[2780]	valid_0's auc: 0.841575	valid_0's binary_logloss: 0.325419
[2781]	valid_0's auc: 0.841571	valid_0's binary_logloss: 0.325421
[2782]	valid_0's auc: 0.841573	valid_0's binary_logloss: 0.32542
[2783]	valid_0