In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [2]:
import pickle
from tqdm import tqdm, tqdm_notebook, _tqdm_notebook, tqdm_pandas

In [3]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))

def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

In [6]:
def parse_str(d):
    return np.array(list(map(float, d.split())))
def parse_list_1(d):
    if d == '-1':
        return [0]
    return list(map(lambda x: int(x[1:]), str(d).split(',')))
def parse_map(d):
    if d == '-1':
        return {}
    return dict([int(z.split(':')[0][1:]), float(z.split(':')[1])] for z in d.split(','))

In [7]:
base_path = '../data'

In [8]:
# 加载邀请回答数据

train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_1_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

sub = test.copy()

sub_size = len(sub)

train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])
del train['dt'], test['dt']


[2019-12-13 08:49:39,605] INFO in <ipython-input-8-12241a91563a>: invite (9489162, 4)
[2019-12-13 08:49:46,292] INFO in <ipython-input-8-12241a91563a>: test (1141683, 3)


In [9]:
# 加载问题
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'q_topic']
# del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
del ques['title_t1'],ques['desc_t1']
logging.info("ques %s", ques.shape)

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
del ques['q_dt']

[2019-12-13 08:52:30,866] INFO in <ipython-input-9-8ea131990986>: ques (1829900, 5)


In [10]:
# ques['关注话题'] = train['关注话题'].apply(parse_list_1)
ques['title_t2'] = ques['title_t2'].apply(parse_list_1)
ques['desc_t2'] = ques['desc_t2'].apply(parse_list_1)
ques['q_topic'] = ques['q_topic'].apply(parse_list_1)
# train['感兴趣话题'] = train['感兴趣话题'].apply(parse_map)
ques.head()

Unnamed: 0,qid,title_t2,desc_t2,q_topic,q_day,q_hour
0,Q2234111670,"[22414, 963, 10458]",[0],"[321, 730, 5784, 4389]",1018,5
1,Q760329790,"[12677, 16829, 15201, 6419, 101839]","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]",1745,20
2,Q741313548,"[700, 2781, 3280, 81215]","[732, 24400, 48321, 39608, 20788, 219486, 1183...",[226],2032,21
3,Q3481466230,"[3312, 1823, 1505, 638, 166, 461]","[6642, 4214, 3312, 1505, 2205, 232, 294, 7177,...","[51, 4468]",2185,15
4,Q3966197028,"[700, 895, 2253]",[0],"[54700, 81, 57, 17670, 43574]",2269,17


In [11]:
topicmap = pd.read_csv('../data/topic_vectors_64d.txt', 
                          names=['id', 'embed'], sep='\t')
topicmap['embed'] = topicmap['embed'].apply(parse_str)
topicmap['id'] = topicmap['id'].apply(lambda x: int(x[1:]))
topicmap.head()

Unnamed: 0,id,embed
0,1,"[0.16508673, -0.0037432343, -0.058245048, -0.0..."
1,2,"[1.608256, -1.0515573, -1.1897708, 1.1820835, ..."
2,3,"[3.3307428, -0.43252096, -2.1518784, -1.439003..."
3,4,"[2.4698818, -0.12998039, -0.4648351, 0.8796743..."
4,5,"[1.562477, -1.3560516, -0.3271215, -0.06341907..."


In [12]:
topicmap.shape

(100000, 2)

In [13]:
topic_vector_dict = dict(zip(np.array(topicmap['id']), np.array(topicmap['embed'])))

In [14]:
def topic2v(x):
    try:
        tmp = topic_vector_dict[x[0]]
    except:
        tmp = np.zeros(64)
    for i in x[1:]:
        tmp = tmp + topic_vector_dict[i]
    if len(tmp) == 0:
        return np.zeros(64)
    return (tmp / len(x))

In [15]:
tqdm.pandas(desc="topic2v...")
ques['q_topic']=ques['q_topic'].progress_apply(lambda x:topic2v(x))
print('finished!')

topic2v...: 100%|██████████| 1829900/1829900 [01:39<00:00, 18476.91it/s]


finished!


In [16]:
wordmap = pd.read_csv('../data/word_vectors_64d.txt', 
                          names=['id', 'embed'], sep='\t')
wordmap['embed'] = wordmap['embed'].apply(parse_str)
wordmap['id'] = wordmap['id'].apply(lambda x: int(x[1:]))
wordmap.head()

Unnamed: 0,id,embed
0,1,"[0.12561196, -0.57268924, -0.14478925, -0.0524..."
1,2,"[3.224765, 2.2482696, -0.511986, -0.5329892, -..."
2,3,"[-0.985937, 0.11307016, 0.012898494, -0.682206..."
3,4,"[-0.3367663, 0.039051324, 0.8155926, 0.8351733..."
4,5,"[0.3074205, -1.0977745, 0.7528213, 0.6299011, ..."


In [17]:
wordmap.shape
word_vector_dict = dict(zip(np.array(wordmap['id']), np.array(wordmap['embed'])))

In [18]:
def word2v(x):
    try:
        tmp = word_vector_dict[x[0]]
    except:
        tmp = np.zeros(64)
    for i in x[1:]:
        tmp = tmp + word_vector_dict[i]
    if len(tmp) == 0:
        return np.zeros(64)
    return (tmp / len(x))

In [19]:
tqdm.pandas(desc="word2v...")
ques['title_t2']=ques['title_t2'].progress_apply(lambda x:word2v(x))
print('finished!')

word2v...: 100%|██████████| 1829900/1829900 [02:54<00:00, 10503.80it/s]


finished!


In [20]:
tqdm.pandas(desc="word2v...")
ques['desc_t2']=ques['desc_t2'].progress_apply(lambda x:word2v(x))
print('finished!')

word2v...: 100%|██████████| 1829900/1829900 [05:26<00:00, 5597.64it/s] 


finished!


In [21]:
ques.head()

Unnamed: 0,qid,title_t2,desc_t2,q_topic,q_day,q_hour
0,Q2234111670,"[3.4942667999999997, -0.8552949333333334, -4.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.1211431250000001, 2.9202411, -0.0973278000...",1018,5
1,Q760329790,"[1.1900169984, -4.05578632, -3.945152239999999...","[0.040301072461538384, -1.3610320753846152, -3...","[-2.680529233333333, -5.466671466666667, -3.97...",1745,20
2,Q741313548,"[1.39550311, -0.394145575, -1.2251911, -0.3720...","[4.4959643713999995, 1.582830815, 0.0903138088...","[1.1679975, -0.9902606, 2.8614578, 2.517082, -...",2032,21
3,Q3481466230,"[0.5512061833333333, 0.0740648883333332, 1.733...","[1.5809253704166668, -1.2889901612499999, -0.0...","[-1.4455721, -1.401717435, 1.141536675, 3.5025...",2185,15
4,Q3966197028,"[-0.6215633333333334, 0.6605363, -0.6174622666...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.281498462, 1.8468875400000002, -2.340192636...",2269,17


In [22]:
def wordadd(x):
    try:
        tmp = 0.0
    except:
        tmp = 0.0
    tmp=x.sum()
    return (tmp /64)

In [23]:
tqdm.pandas(desc="wordadd...")
ques['title_t2']=ques['title_t2'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1829900/1829900 [01:09<00:00, 26369.16it/s]


finished!


In [24]:
tqdm.pandas(desc="wordadd...")
ques['desc_t2']=ques['desc_t2'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1829900/1829900 [01:13<00:00, 24908.01it/s]


finished!


In [25]:
tqdm.pandas(desc="wordadd...")
ques['q_topic']=ques['q_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1829900/1829900 [01:09<00:00, 26508.59it/s]


finished!


In [26]:
ques.head()

Unnamed: 0,qid,title_t2,desc_t2,q_topic,q_day,q_hour
0,Q2234111670,0.020175,0.0,-0.24888,1018,5
1,Q760329790,-0.633451,-0.426139,-0.274191,1745,20
2,Q741313548,-0.327227,-0.076127,0.024638,2032,21
3,Q3481466230,-0.327119,-0.295625,-0.016809,2185,15
4,Q3966197028,-0.674786,0.0,-0.239849,2269,17


In [27]:
# 加载回答
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
del ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

ans = pd.merge(ans, ques, on='qid')
del ques

[2019-12-13 09:17:37,526] INFO in <ipython-input-27-0f04b531e5be>: ans (4513735, 18)


In [28]:
print(ans['a_day'].min())
print(ans['a_day'].max())

3807
3867


In [29]:
# 回答距提问的天数
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']

[2019-12-13 09:18:52,230] INFO in utils: Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2019-12-13 09:18:52,232] INFO in utils: NumExpr defaulting to 8 threads.


In [30]:
# 时间窗口划分
# train
# val
train_start = 3838
train_end = 3867

val_start = 3868
val_end = 3874

label_end = 3867
label_start = label_end - 6

train_label_feature_end = label_end - 7
train_label_feature_start = train_label_feature_end - 22

train_ans_feature_end = label_end - 7
train_ans_feature_start = train_ans_feature_end - 50

val_label_feature_end = val_start - 1
val_label_feature_start = val_label_feature_end - 22

val_ans_feature_end = val_start - 1
val_ans_feature_start = val_ans_feature_end - 50

train_label_feature = train[(train['day'] >= train_label_feature_start) & (train['day'] <= train_label_feature_end)]
logging.info("train_label_feature %s", train_label_feature.shape)

val_label_feature = train[(train['day'] >= val_label_feature_start) & (train['day'] <= val_label_feature_end)]
logging.info("val_label_feature %s", val_label_feature.shape)

train_label = train[(train['day'] > train_label_feature_end)]

logging.info("train feature start %s end %s, label start %s end %s", train_label_feature['day'].min(),
             train_label_feature['day'].max(), train_label['day'].min(), train_label['day'].max())

logging.info("test feature start %s end %s, label start %s end %s", val_label_feature['day'].min(),
             val_label_feature['day'].max(), test['day'].min(), test['day'].max())

[2019-12-13 09:18:55,305] INFO in <ipython-input-30-413cbf6f4a7d>: train_label_feature (6895493, 5)
[2019-12-13 09:18:57,649] INFO in <ipython-input-30-413cbf6f4a7d>: val_label_feature (7583553, 5)
[2019-12-13 09:18:58,818] INFO in <ipython-input-30-413cbf6f4a7d>: train feature start 3838 end 3860, label start 3861 end 3867
[2019-12-13 09:18:58,880] INFO in <ipython-input-30-413cbf6f4a7d>: test feature start 3845 end 3867, label start 3868 end 3874


In [31]:
# 确定ans的时间范围
# 3807~3874
train_ans_feature = ans[(ans['a_day'] >= train_ans_feature_start) & (ans['a_day'] <= train_ans_feature_end)]

val_ans_feature = ans[(ans['a_day'] >= val_ans_feature_start) & (ans['a_day'] <= val_ans_feature_end)]

logging.info("train ans feature %s, start %s end %s", train_ans_feature.shape, train_ans_feature['a_day'].min(),
             train_ans_feature['a_day'].max())

logging.info("val ans feature %s, start %s end %s", val_ans_feature.shape, val_ans_feature['a_day'].min(),
             val_ans_feature['a_day'].max())

fea_cols = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
            'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
            'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']


def extract_feature1(target, label_feature, ans_feature):
    # 问题特征
    t1 = label_feature.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['qid', 'q_inv_mean', 'q_inv_sum', 'q_inv_std', 'q_inv_count']
    target = pd.merge(target, t1, on='qid', how='left')

    # 用户特征
    t1 = label_feature.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['uid', 'u_inv_mean', 'u_inv_sum', 'u_inv_std', 'u_inv_count']
    target = pd.merge(target, t1, on='uid', how='left')
    #
    # train_size = len(train)
    # data = pd.concat((train, test), sort=True)

    # 回答部分特征

    t1 = ans_feature.groupby('qid')['aid'].count().reset_index()
    t1.columns = ['qid', 'q_ans_count']
    target = pd.merge(target, t1, on='qid', how='left')

    t1 = ans_feature.groupby('uid')['aid'].count().reset_index()
    t1.columns = ['uid', 'u_ans_count']
    target = pd.merge(target, t1, on='uid', how='left')

    for col in fea_cols:
        t1 = ans_feature.groupby('uid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['uid', f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        target = pd.merge(target, t1, on='uid', how='left')

        t1 = ans_feature.groupby('qid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['qid', f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
        target = pd.merge(target, t1, on='qid', how='left')
        logging.info("extract %s", col)
    return target


train_label = extract_feature1(train_label, train_label_feature, train_ans_feature)
test = extract_feature1(test, val_label_feature, val_ans_feature)

[2019-12-13 09:19:06,215] INFO in <ipython-input-31-0511893e776f>: train ans feature (3700178, 25), start 3810 end 3860
[2019-12-13 09:19:06,256] INFO in <ipython-input-31-0511893e776f>: val ans feature (3992334, 25), start 3817 end 3867
[2019-12-13 09:23:23,981] INFO in <ipython-input-31-0511893e776f>: extract is_good
[2019-12-13 09:24:37,221] INFO in <ipython-input-31-0511893e776f>: extract is_rec
[2019-12-13 09:25:49,421] INFO in <ipython-input-31-0511893e776f>: extract is_dest
[2019-12-13 09:26:54,823] INFO in <ipython-input-31-0511893e776f>: extract has_img
[2019-12-13 09:28:05,205] INFO in <ipython-input-31-0511893e776f>: extract has_video
[2019-12-13 09:29:17,277] INFO in <ipython-input-31-0511893e776f>: extract word_count
[2019-12-13 09:30:29,053] INFO in <ipython-input-31-0511893e776f>: extract reci_cheer
[2019-12-13 09:31:49,202] INFO in <ipython-input-31-0511893e776f>: extract reci_uncheer
[2019-12-13 09:33:14,829] INFO in <ipython-input-31-0511893e776f>: extract reci_commen

In [32]:
# 特征提取结束
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)
assert len(test) == sub_size

# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic']

[2019-12-13 09:59:37,028] INFO in <ipython-input-32-d70daf315b0b>: train shape (2593669, 105), test shape (1141683, 104)


In [33]:
dit = {'daily': 4, 'weekly': 3, 'monthly': 2, 'new': 1,'unknow':0}
user['freq'] = user['freq'].map(dit)

In [34]:
user = user.drop(['creat_keyword','level','hot','reg_type','reg_plat'],axis=1)

In [35]:
user.head()

Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,2.0,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,T540,"T21107:1.7915097,T405:1.6123838,T4436:1.518003..."
1,M595924114,male,4.0,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"T44126,T15940,T839,T8978,T2934,T1113,T3914,T12...","T18016:2.0650618,T2384:1.2503042,T1142:1.13569..."
2,M1473482940,female,3.0,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,"T30874,T2113,T8656,T21,T523,T8,T116,T5727,T68,...","T46:1.330939,T2159:1.1296458,T379:1.1241927,T1..."
3,M578477092,male,4.0,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,"T946,T7323,T297,T2660,T36067,T53107,T2654,T507...","T15918:1.9479566,T8106:1.8578106,T4787:1.58486..."
4,M1088794709,male,3.0,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,"T582,T558,T28776,T5186,T9081,T2419,T2693,T2299...","T65:1.5992582,T867:1.3179373,T93:1.2095009,T31..."


In [36]:
user['follow_topic'] = user['follow_topic'].apply(parse_list_1)
user['inter_topic'] = user['inter_topic'].apply(parse_map)

In [37]:
def topic_interest2v(x):
    if len(x)==0:
        return np.zeros(64)
    else:
        tmp=np.zeros(64)
        for i in x:
            tmp = tmp + topic_vector_dict[i]*x[i]
        return (tmp / len(x))

In [38]:
tqdm.pandas(desc="topic2v...")
user['follow_topic']=user['follow_topic'].progress_apply(lambda x:topic2v(x))
print('finished!')

topic2v...: 100%|██████████| 1931654/1931654 [05:48<00:00, 5547.85it/s] 


finished!


In [39]:
tqdm.pandas(desc="topic_interest2v...")
user['inter_topic']=user['inter_topic'].progress_apply(lambda x:topic_interest2v(x))
user.head()

topic_interest2v...: 100%|██████████| 1931654/1931654 [04:10<00:00, 7723.19it/s] 


Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,2.0,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,"[6.125305, -1.4180568, -1.3143845, -2.9268239,...","[0.5255145018431924, 2.8507211193389996, 0.571..."
1,M595924114,male,4.0,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"[0.30575216599999994, -0.9980522930666669, 0.0...","[-1.9947685124736019, 1.3387408371544118, -0.3..."
2,M1473482940,female,3.0,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,"[0.1508028030833333, 0.7434654864583335, -0.25...","[-0.25824397617484, 0.6167275103940157, -0.075..."
3,M578477092,male,4.0,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,"[1.4605925141999998, -0.33966116570100013, 0.3...","[1.4908855224214492, 1.4438666038597823, 1.179..."
4,M1088794709,male,3.0,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,"[0.9643650885294115, 0.31516375361764704, 0.82...","[1.5879302100003165, -1.1453530874701519, -0.7..."


In [40]:
tqdm.pandas(desc="wordadd...")
user['follow_topic']=user['follow_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1931654/1931654 [00:52<00:00, 37120.64it/s]


finished!


In [41]:
tqdm.pandas(desc="wordadd...")
user['inter_topic']=user['inter_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1931654/1931654 [00:47<00:00, 40891.30it/s]


finished!


In [42]:
user.head()

Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,2.0,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,-0.14911,0.178521
1,M595924114,male,4.0,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,0.04439,-0.144847
2,M1473482940,female,3.0,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,0.020901,0.048022
3,M578477092,male,4.0,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,-0.013006,-0.074952
4,M1088794709,male,3.0,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,-0.100348,-0.234183


In [43]:

logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid','freq']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

q_lb = LabelEncoder()
q_lb.fit(list(train_label['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train_label['qid_enc'] = q_lb.transform(train_label['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train_label['uid_enc'] = u_lb.transform(train_label['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# merge user
train_label = pd.merge(train_label, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

data = pd.concat((train_label, test), axis=0, sort=True)
# del train_label, test

[2019-12-13 10:15:58,003] INFO in <ipython-input-43-9e402841ed3b>: user (1931654, 16)
[2019-12-13 10:16:22,316] INFO in <ipython-input-43-9e402841ed3b>: user unq uid             1931654
gender                3
freq                  4
uf_b1                 2
uf_b2                 2
uf_b3                 2
uf_b4                 2
uf_b5                 2
uf_c1              2561
uf_c2               291
uf_c3               428
uf_c4              1556
uf_c5                 2
score               732
follow_topic    1288996
inter_topic     1399663
dtype: int64
[2019-12-13 10:16:22,755] INFO in <ipython-input-43-9e402841ed3b>: user cat ['gender', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-13 10:16:24,536] INFO in <ipython-input-43-9e402841ed3b>: encode gender
[2019-12-13 10:16:26,441] INFO in <ipython-input-43-9e402841ed3b>: encode uf_c1
[2019-12-13 10:16:29,045] INFO in <ipython-input-43-9e402841ed3b>: encode uf_c2
[2019-12-13 10:16:30,565] INFO in <ipython-input-43-9e402841ed3b>: e

In [44]:
train_label.head()

Unnamed: 0,qid,uid,label,day,hour,q_inv_mean,q_inv_sum,q_inv_std,q_inv_count,u_inv_mean,...,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,Q2166419046,M401693808,0,3865,22,,,,,0.0,...,0,0,2113,190,261,927,1,297,-0.12217,0.021705
1,Q604029601,M2317670257,0,3862,15,,,,,0.090909,...,0,0,1519,229,0,506,1,415,-0.015598,-0.182562
2,Q2443223942,M3544409350,0,3867,4,0.375,57.0,0.485723,152.0,0.0,...,0,0,551,226,188,815,1,296,0.0,0.070942
3,Q795459266,M2818659842,0,3861,20,0.166667,1.0,0.408248,6.0,0.285714,...,0,0,1519,229,0,506,1,380,-0.302475,-0.191518
4,Q110462128,M848334644,1,3862,8,,,,,0.634146,...,0,0,2161,31,396,1438,1,719,-0.095008,-0.176807


In [45]:
test.head()

Unnamed: 0,qid,uid,day,hour,q_inv_mean,q_inv_sum,q_inv_std,q_inv_count,u_inv_mean,u_inv_sum,...,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,Q1493039281,M64135255,3870,9,0.0,0.0,0.0,2.0,0.166667,1.0,...,0,0,1190,130,201,1374,1,370,-0.15902,-0.039165
1,Q2023398782,M2536956560,3872,22,,,,,0.0,0.0,...,1,0,1190,130,394,519,1,291,0.024704,-0.116434
2,Q4151338694,M3294926344,3874,15,,,,,0.111111,1.0,...,0,0,1190,130,203,443,1,451,0.0,-0.060149
3,Q3271436624,M3744310794,3873,4,0.5,3.0,0.547723,6.0,0.0,0.0,...,0,0,1190,130,186,1036,0,416,-0.123365,-0.227317
4,Q3314287018,M1349051752,3872,19,,,,,0.0,0.0,...,0,0,381,190,334,786,1,316,-0.173229,0.07879


In [46]:
data.head()

Unnamed: 0,day,follow_topic,freq,gender,hour,inter_topic,label,q_ans_count,q_diff_qa_days_max,q_diff_qa_days_mean,...,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,uid,uid_enc
0,3865,-0.12217,3.0,2,22,0.021705,0.0,,,,...,0,0,0,2113,190,261,927,1,M401693808,1508098
1,3862,-0.015598,3.0,2,15,-0.182562,0.0,,,,...,0,0,0,1519,229,0,506,1,M2317670257,657985
2,3867,0.0,2.0,2,4,0.070942,0.0,32.0,13.0,9.53125,...,0,0,0,551,226,188,815,1,M3544409350,1272353
3,3861,-0.302475,4.0,1,20,-0.191518,0.0,3.0,234.0,222.333333,...,0,0,0,1519,229,0,506,1,M2818659842,909154
4,3862,-0.095008,3.0,0,8,-0.176807,1.0,,,,...,0,0,0,2161,31,396,1438,1,M848334644,1856019


In [47]:
# count编码
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
for feat in count_fea:
    col_name = '{}_count'.format(feat)
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data.loc[data[col_name] < 2, feat] = -1
    data[feat] += 1
    data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
    data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())
#     # 

# # 问题被回答的次数

In [48]:
# 压缩数据
t = data.dtypes
for x in t[t == 'int64'].index:
    data[x] = data[x].astype('int32')

for x in t[t == 'float64'].index:
    data[x] = data[x].astype('float32')

data['wk'] = data['day'] % 7

feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

In [49]:
# target编码
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len(train_label)][feature_cols]
y_train_all = data.iloc[:len(train_label)]['label']
test = data.iloc[len(train_label):]
# del data
assert len(test) == sub_size

logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

model_lgb = LGBMClassifier(n_estimators=3000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=100)

sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]


sub.to_csv('../example/result_1213.txt', index=None, header=None, sep='\t')

[2019-12-13 10:27:57,535] INFO in <ipython-input-49-3a526716f64a>: feature size 128
[2019-12-13 10:27:59,474] INFO in <ipython-input-49-3a526716f64a>: train shape (2593669, 122), test shape (1141683, 132)


[1]	valid_0's auc: 0.7582	valid_0's binary_logloss: 0.426574
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.775326	valid_0's binary_logloss: 0.416562
[3]	valid_0's auc: 0.78073	valid_0's binary_logloss: 0.408496
[4]	valid_0's auc: 0.784492	valid_0's binary_logloss: 0.401698
[5]	valid_0's auc: 0.785532	valid_0's binary_logloss: 0.39619
[6]	valid_0's auc: 0.787347	valid_0's binary_logloss: 0.391481
[7]	valid_0's auc: 0.790184	valid_0's binary_logloss: 0.387008
[8]	valid_0's auc: 0.792253	valid_0's binary_logloss: 0.3832
[9]	valid_0's auc: 0.794222	valid_0's binary_logloss: 0.3797
[10]	valid_0's auc: 0.795636	valid_0's binary_logloss: 0.376837
[11]	valid_0's auc: 0.796786	valid_0's binary_logloss: 0.374172
[12]	valid_0's auc: 0.797925	valid_0's binary_logloss: 0.371859
[13]	valid_0's auc: 0.799616	valid_0's binary_logloss: 0.369537
[14]	valid_0's auc: 0.800886	valid_0's binary_logloss: 0.36762
[15]	valid_0's auc: 0.8023	valid_0's binary_logloss: 0.36588

[129]	valid_0's auc: 0.829915	valid_0's binary_logloss: 0.335332
[130]	valid_0's auc: 0.829957	valid_0's binary_logloss: 0.335304
[131]	valid_0's auc: 0.830015	valid_0's binary_logloss: 0.335263
[132]	valid_0's auc: 0.830067	valid_0's binary_logloss: 0.335231
[133]	valid_0's auc: 0.830125	valid_0's binary_logloss: 0.335186
[134]	valid_0's auc: 0.83016	valid_0's binary_logloss: 0.335156
[135]	valid_0's auc: 0.830185	valid_0's binary_logloss: 0.335133
[136]	valid_0's auc: 0.830208	valid_0's binary_logloss: 0.335112
[137]	valid_0's auc: 0.830243	valid_0's binary_logloss: 0.335079
[138]	valid_0's auc: 0.830321	valid_0's binary_logloss: 0.335015
[139]	valid_0's auc: 0.830364	valid_0's binary_logloss: 0.334982
[140]	valid_0's auc: 0.830465	valid_0's binary_logloss: 0.334908
[141]	valid_0's auc: 0.830578	valid_0's binary_logloss: 0.334806
[142]	valid_0's auc: 0.830601	valid_0's binary_logloss: 0.334784
[143]	valid_0's auc: 0.83067	valid_0's binary_logloss: 0.334722
[144]	valid_0's auc: 0.8307

[256]	valid_0's auc: 0.833424	valid_0's binary_logloss: 0.332394
[257]	valid_0's auc: 0.83345	valid_0's binary_logloss: 0.332376
[258]	valid_0's auc: 0.833466	valid_0's binary_logloss: 0.332364
[259]	valid_0's auc: 0.833473	valid_0's binary_logloss: 0.332357
[260]	valid_0's auc: 0.833507	valid_0's binary_logloss: 0.332327
[261]	valid_0's auc: 0.833523	valid_0's binary_logloss: 0.332315
[262]	valid_0's auc: 0.833536	valid_0's binary_logloss: 0.332306
[263]	valid_0's auc: 0.833558	valid_0's binary_logloss: 0.332288
[264]	valid_0's auc: 0.833566	valid_0's binary_logloss: 0.33228
[265]	valid_0's auc: 0.833571	valid_0's binary_logloss: 0.332276
[266]	valid_0's auc: 0.833574	valid_0's binary_logloss: 0.332274
[267]	valid_0's auc: 0.833581	valid_0's binary_logloss: 0.332267
[268]	valid_0's auc: 0.833583	valid_0's binary_logloss: 0.332264
[269]	valid_0's auc: 0.8336	valid_0's binary_logloss: 0.332247
[270]	valid_0's auc: 0.833602	valid_0's binary_logloss: 0.332245
[271]	valid_0's auc: 0.833609

[383]	valid_0's auc: 0.835016	valid_0's binary_logloss: 0.331028
[384]	valid_0's auc: 0.835018	valid_0's binary_logloss: 0.331028
[385]	valid_0's auc: 0.835026	valid_0's binary_logloss: 0.331023
[386]	valid_0's auc: 0.835021	valid_0's binary_logloss: 0.331027
[387]	valid_0's auc: 0.835026	valid_0's binary_logloss: 0.331023
[388]	valid_0's auc: 0.83504	valid_0's binary_logloss: 0.331011
[389]	valid_0's auc: 0.835054	valid_0's binary_logloss: 0.330996
[390]	valid_0's auc: 0.835108	valid_0's binary_logloss: 0.330947
[391]	valid_0's auc: 0.835109	valid_0's binary_logloss: 0.330945
[392]	valid_0's auc: 0.835113	valid_0's binary_logloss: 0.330942
[393]	valid_0's auc: 0.835118	valid_0's binary_logloss: 0.330938
[394]	valid_0's auc: 0.83515	valid_0's binary_logloss: 0.330911
[395]	valid_0's auc: 0.835194	valid_0's binary_logloss: 0.330871
[396]	valid_0's auc: 0.835213	valid_0's binary_logloss: 0.330858
[397]	valid_0's auc: 0.835234	valid_0's binary_logloss: 0.330837
[398]	valid_0's auc: 0.8352

[510]	valid_0's auc: 0.836141	valid_0's binary_logloss: 0.33008
[511]	valid_0's auc: 0.836147	valid_0's binary_logloss: 0.330074
[512]	valid_0's auc: 0.836157	valid_0's binary_logloss: 0.330066
[513]	valid_0's auc: 0.836166	valid_0's binary_logloss: 0.330057
[514]	valid_0's auc: 0.836178	valid_0's binary_logloss: 0.330046
[515]	valid_0's auc: 0.836177	valid_0's binary_logloss: 0.330046
[516]	valid_0's auc: 0.836176	valid_0's binary_logloss: 0.330047
[517]	valid_0's auc: 0.836174	valid_0's binary_logloss: 0.330047
[518]	valid_0's auc: 0.836175	valid_0's binary_logloss: 0.330047
[519]	valid_0's auc: 0.836175	valid_0's binary_logloss: 0.330047
[520]	valid_0's auc: 0.836181	valid_0's binary_logloss: 0.330042
[521]	valid_0's auc: 0.836187	valid_0's binary_logloss: 0.330037
[522]	valid_0's auc: 0.836194	valid_0's binary_logloss: 0.330031
[523]	valid_0's auc: 0.836208	valid_0's binary_logloss: 0.330017
[524]	valid_0's auc: 0.836213	valid_0's binary_logloss: 0.330013
[525]	valid_0's auc: 0.836

[637]	valid_0's auc: 0.836756	valid_0's binary_logloss: 0.329543
[638]	valid_0's auc: 0.836764	valid_0's binary_logloss: 0.329538
[639]	valid_0's auc: 0.836769	valid_0's binary_logloss: 0.329533
[640]	valid_0's auc: 0.836777	valid_0's binary_logloss: 0.329526
[641]	valid_0's auc: 0.836776	valid_0's binary_logloss: 0.329528
[642]	valid_0's auc: 0.836785	valid_0's binary_logloss: 0.32952
[643]	valid_0's auc: 0.836815	valid_0's binary_logloss: 0.329497
[644]	valid_0's auc: 0.836828	valid_0's binary_logloss: 0.32948
[645]	valid_0's auc: 0.836832	valid_0's binary_logloss: 0.329475
[646]	valid_0's auc: 0.836841	valid_0's binary_logloss: 0.329468
[647]	valid_0's auc: 0.836855	valid_0's binary_logloss: 0.329455
[648]	valid_0's auc: 0.836863	valid_0's binary_logloss: 0.329447
[649]	valid_0's auc: 0.83687	valid_0's binary_logloss: 0.329442
[650]	valid_0's auc: 0.836882	valid_0's binary_logloss: 0.329432
[651]	valid_0's auc: 0.836889	valid_0's binary_logloss: 0.329427
[652]	valid_0's auc: 0.83689

[764]	valid_0's auc: 0.837388	valid_0's binary_logloss: 0.328994
[765]	valid_0's auc: 0.837386	valid_0's binary_logloss: 0.328996
[766]	valid_0's auc: 0.837384	valid_0's binary_logloss: 0.328997
[767]	valid_0's auc: 0.837388	valid_0's binary_logloss: 0.328992
[768]	valid_0's auc: 0.837389	valid_0's binary_logloss: 0.32899
[769]	valid_0's auc: 0.837389	valid_0's binary_logloss: 0.32899
[770]	valid_0's auc: 0.837407	valid_0's binary_logloss: 0.328978
[771]	valid_0's auc: 0.837418	valid_0's binary_logloss: 0.328971
[772]	valid_0's auc: 0.837428	valid_0's binary_logloss: 0.328959
[773]	valid_0's auc: 0.837431	valid_0's binary_logloss: 0.328956
[774]	valid_0's auc: 0.837429	valid_0's binary_logloss: 0.328956
[775]	valid_0's auc: 0.837429	valid_0's binary_logloss: 0.328957
[776]	valid_0's auc: 0.837434	valid_0's binary_logloss: 0.328953
[777]	valid_0's auc: 0.837434	valid_0's binary_logloss: 0.328953
[778]	valid_0's auc: 0.837433	valid_0's binary_logloss: 0.328954
[779]	valid_0's auc: 0.8374

[891]	valid_0's auc: 0.837885	valid_0's binary_logloss: 0.328572
[892]	valid_0's auc: 0.837885	valid_0's binary_logloss: 0.328573
[893]	valid_0's auc: 0.837892	valid_0's binary_logloss: 0.328566
[894]	valid_0's auc: 0.837899	valid_0's binary_logloss: 0.328561
[895]	valid_0's auc: 0.837901	valid_0's binary_logloss: 0.32856
[896]	valid_0's auc: 0.837903	valid_0's binary_logloss: 0.328558
[897]	valid_0's auc: 0.837903	valid_0's binary_logloss: 0.328559
[898]	valid_0's auc: 0.837901	valid_0's binary_logloss: 0.328561
[899]	valid_0's auc: 0.837898	valid_0's binary_logloss: 0.328563
[900]	valid_0's auc: 0.837898	valid_0's binary_logloss: 0.328562
[901]	valid_0's auc: 0.837906	valid_0's binary_logloss: 0.328557
[902]	valid_0's auc: 0.837911	valid_0's binary_logloss: 0.328554
[903]	valid_0's auc: 0.837918	valid_0's binary_logloss: 0.328549
[904]	valid_0's auc: 0.837922	valid_0's binary_logloss: 0.328545
[905]	valid_0's auc: 0.837926	valid_0's binary_logloss: 0.328542
[906]	valid_0's auc: 0.837

[1018]	valid_0's auc: 0.838311	valid_0's binary_logloss: 0.32819
[1019]	valid_0's auc: 0.838312	valid_0's binary_logloss: 0.32819
[1020]	valid_0's auc: 0.838314	valid_0's binary_logloss: 0.328188
[1021]	valid_0's auc: 0.838321	valid_0's binary_logloss: 0.328183
[1022]	valid_0's auc: 0.83833	valid_0's binary_logloss: 0.328176
[1023]	valid_0's auc: 0.838332	valid_0's binary_logloss: 0.328175
[1024]	valid_0's auc: 0.838335	valid_0's binary_logloss: 0.328173
[1025]	valid_0's auc: 0.838335	valid_0's binary_logloss: 0.328173
[1026]	valid_0's auc: 0.838334	valid_0's binary_logloss: 0.328175
[1027]	valid_0's auc: 0.838335	valid_0's binary_logloss: 0.328173
[1028]	valid_0's auc: 0.838342	valid_0's binary_logloss: 0.328168
[1029]	valid_0's auc: 0.838346	valid_0's binary_logloss: 0.328165
[1030]	valid_0's auc: 0.838346	valid_0's binary_logloss: 0.328165
[1031]	valid_0's auc: 0.838359	valid_0's binary_logloss: 0.328154
[1032]	valid_0's auc: 0.838366	valid_0's binary_logloss: 0.328148
[1033]	valid_

[1143]	valid_0's auc: 0.8387	valid_0's binary_logloss: 0.327868
[1144]	valid_0's auc: 0.83871	valid_0's binary_logloss: 0.32786
[1145]	valid_0's auc: 0.838707	valid_0's binary_logloss: 0.327861
[1146]	valid_0's auc: 0.838706	valid_0's binary_logloss: 0.327862
[1147]	valid_0's auc: 0.838713	valid_0's binary_logloss: 0.327857
[1148]	valid_0's auc: 0.838714	valid_0's binary_logloss: 0.327856
[1149]	valid_0's auc: 0.838713	valid_0's binary_logloss: 0.327857
[1150]	valid_0's auc: 0.83871	valid_0's binary_logloss: 0.327859
[1151]	valid_0's auc: 0.838719	valid_0's binary_logloss: 0.327852
[1152]	valid_0's auc: 0.838723	valid_0's binary_logloss: 0.32785
[1153]	valid_0's auc: 0.838728	valid_0's binary_logloss: 0.327846
[1154]	valid_0's auc: 0.838731	valid_0's binary_logloss: 0.327843
[1155]	valid_0's auc: 0.838737	valid_0's binary_logloss: 0.327839
[1156]	valid_0's auc: 0.838756	valid_0's binary_logloss: 0.327818
[1157]	valid_0's auc: 0.838773	valid_0's binary_logloss: 0.327806
[1158]	valid_0's

[1268]	valid_0's auc: 0.839178	valid_0's binary_logloss: 0.327471
[1269]	valid_0's auc: 0.839183	valid_0's binary_logloss: 0.327468
[1270]	valid_0's auc: 0.839184	valid_0's binary_logloss: 0.327468
[1271]	valid_0's auc: 0.839194	valid_0's binary_logloss: 0.32746
[1272]	valid_0's auc: 0.839197	valid_0's binary_logloss: 0.327458
[1273]	valid_0's auc: 0.839199	valid_0's binary_logloss: 0.327456
[1274]	valid_0's auc: 0.8392	valid_0's binary_logloss: 0.327455
[1275]	valid_0's auc: 0.839198	valid_0's binary_logloss: 0.327456
[1276]	valid_0's auc: 0.839204	valid_0's binary_logloss: 0.32745
[1277]	valid_0's auc: 0.839203	valid_0's binary_logloss: 0.327452
[1278]	valid_0's auc: 0.839207	valid_0's binary_logloss: 0.327449
[1279]	valid_0's auc: 0.839216	valid_0's binary_logloss: 0.327442
[1280]	valid_0's auc: 0.839215	valid_0's binary_logloss: 0.327442
[1281]	valid_0's auc: 0.839219	valid_0's binary_logloss: 0.327438
[1282]	valid_0's auc: 0.839225	valid_0's binary_logloss: 0.327435
[1283]	valid_0

[1393]	valid_0's auc: 0.83966	valid_0's binary_logloss: 0.327073
[1394]	valid_0's auc: 0.839668	valid_0's binary_logloss: 0.327064
[1395]	valid_0's auc: 0.839665	valid_0's binary_logloss: 0.327067
[1396]	valid_0's auc: 0.839671	valid_0's binary_logloss: 0.327063
[1397]	valid_0's auc: 0.839672	valid_0's binary_logloss: 0.32706
[1398]	valid_0's auc: 0.839679	valid_0's binary_logloss: 0.327054
[1399]	valid_0's auc: 0.839679	valid_0's binary_logloss: 0.327055
[1400]	valid_0's auc: 0.839679	valid_0's binary_logloss: 0.327055
[1401]	valid_0's auc: 0.839679	valid_0's binary_logloss: 0.327056
[1402]	valid_0's auc: 0.839688	valid_0's binary_logloss: 0.327048
[1403]	valid_0's auc: 0.839692	valid_0's binary_logloss: 0.327045
[1404]	valid_0's auc: 0.839689	valid_0's binary_logloss: 0.327048
[1405]	valid_0's auc: 0.83969	valid_0's binary_logloss: 0.327046
[1406]	valid_0's auc: 0.839694	valid_0's binary_logloss: 0.327044
[1407]	valid_0's auc: 0.839697	valid_0's binary_logloss: 0.327041
[1408]	valid_

[1518]	valid_0's auc: 0.839975	valid_0's binary_logloss: 0.326809
[1519]	valid_0's auc: 0.839981	valid_0's binary_logloss: 0.326805
[1520]	valid_0's auc: 0.839983	valid_0's binary_logloss: 0.326803
[1521]	valid_0's auc: 0.839981	valid_0's binary_logloss: 0.326805
[1522]	valid_0's auc: 0.839981	valid_0's binary_logloss: 0.326805
[1523]	valid_0's auc: 0.839983	valid_0's binary_logloss: 0.326805
[1524]	valid_0's auc: 0.839982	valid_0's binary_logloss: 0.326806
[1525]	valid_0's auc: 0.839986	valid_0's binary_logloss: 0.326802
[1526]	valid_0's auc: 0.839983	valid_0's binary_logloss: 0.326803
[1527]	valid_0's auc: 0.839987	valid_0's binary_logloss: 0.3268
[1528]	valid_0's auc: 0.839986	valid_0's binary_logloss: 0.326799
[1529]	valid_0's auc: 0.839992	valid_0's binary_logloss: 0.326796
[1530]	valid_0's auc: 0.83999	valid_0's binary_logloss: 0.326797
[1531]	valid_0's auc: 0.839991	valid_0's binary_logloss: 0.326796
[1532]	valid_0's auc: 0.839993	valid_0's binary_logloss: 0.326795
[1533]	valid_

[1643]	valid_0's auc: 0.840138	valid_0's binary_logloss: 0.326668
[1644]	valid_0's auc: 0.840148	valid_0's binary_logloss: 0.326661
[1645]	valid_0's auc: 0.840146	valid_0's binary_logloss: 0.326662
[1646]	valid_0's auc: 0.840145	valid_0's binary_logloss: 0.326663
[1647]	valid_0's auc: 0.840148	valid_0's binary_logloss: 0.32666
[1648]	valid_0's auc: 0.840148	valid_0's binary_logloss: 0.326661
[1649]	valid_0's auc: 0.840143	valid_0's binary_logloss: 0.326664
[1650]	valid_0's auc: 0.840147	valid_0's binary_logloss: 0.32666
[1651]	valid_0's auc: 0.840148	valid_0's binary_logloss: 0.326658
[1652]	valid_0's auc: 0.840148	valid_0's binary_logloss: 0.326658
[1653]	valid_0's auc: 0.840149	valid_0's binary_logloss: 0.326657
[1654]	valid_0's auc: 0.840159	valid_0's binary_logloss: 0.326648
[1655]	valid_0's auc: 0.840162	valid_0's binary_logloss: 0.326646
[1656]	valid_0's auc: 0.840163	valid_0's binary_logloss: 0.326646
[1657]	valid_0's auc: 0.840162	valid_0's binary_logloss: 0.326646
[1658]	valid

[1768]	valid_0's auc: 0.840352	valid_0's binary_logloss: 0.326487
[1769]	valid_0's auc: 0.840353	valid_0's binary_logloss: 0.326486
[1770]	valid_0's auc: 0.840355	valid_0's binary_logloss: 0.326483
[1771]	valid_0's auc: 0.840356	valid_0's binary_logloss: 0.326483
[1772]	valid_0's auc: 0.840354	valid_0's binary_logloss: 0.326485
[1773]	valid_0's auc: 0.840353	valid_0's binary_logloss: 0.326484
[1774]	valid_0's auc: 0.840356	valid_0's binary_logloss: 0.326482
[1775]	valid_0's auc: 0.840358	valid_0's binary_logloss: 0.326481
[1776]	valid_0's auc: 0.840357	valid_0's binary_logloss: 0.326482
[1777]	valid_0's auc: 0.840359	valid_0's binary_logloss: 0.326481
[1778]	valid_0's auc: 0.840358	valid_0's binary_logloss: 0.326481
[1779]	valid_0's auc: 0.840361	valid_0's binary_logloss: 0.326478
[1780]	valid_0's auc: 0.840359	valid_0's binary_logloss: 0.32648
[1781]	valid_0's auc: 0.840357	valid_0's binary_logloss: 0.326483
[1782]	valid_0's auc: 0.840357	valid_0's binary_logloss: 0.326483
[1783]	vali

[1893]	valid_0's auc: 0.8405	valid_0's binary_logloss: 0.326369
[1894]	valid_0's auc: 0.840496	valid_0's binary_logloss: 0.326373
[1895]	valid_0's auc: 0.8405	valid_0's binary_logloss: 0.32637
[1896]	valid_0's auc: 0.840498	valid_0's binary_logloss: 0.326371
[1897]	valid_0's auc: 0.8405	valid_0's binary_logloss: 0.32637
[1898]	valid_0's auc: 0.8405	valid_0's binary_logloss: 0.32637
[1899]	valid_0's auc: 0.840502	valid_0's binary_logloss: 0.32637
[1900]	valid_0's auc: 0.840505	valid_0's binary_logloss: 0.326368
[1901]	valid_0's auc: 0.840503	valid_0's binary_logloss: 0.326369
[1902]	valid_0's auc: 0.840508	valid_0's binary_logloss: 0.326366
[1903]	valid_0's auc: 0.840506	valid_0's binary_logloss: 0.326367
[1904]	valid_0's auc: 0.840507	valid_0's binary_logloss: 0.326366
[1905]	valid_0's auc: 0.840509	valid_0's binary_logloss: 0.326364
[1906]	valid_0's auc: 0.84051	valid_0's binary_logloss: 0.326364
[1907]	valid_0's auc: 0.840509	valid_0's binary_logloss: 0.326364
[1908]	valid_0's auc: 0

[2018]	valid_0's auc: 0.840732	valid_0's binary_logloss: 0.326187
[2019]	valid_0's auc: 0.84074	valid_0's binary_logloss: 0.32618
[2020]	valid_0's auc: 0.840743	valid_0's binary_logloss: 0.326178
[2021]	valid_0's auc: 0.840742	valid_0's binary_logloss: 0.326179
[2022]	valid_0's auc: 0.840743	valid_0's binary_logloss: 0.326178
[2023]	valid_0's auc: 0.840742	valid_0's binary_logloss: 0.326179
[2024]	valid_0's auc: 0.840741	valid_0's binary_logloss: 0.326182
[2025]	valid_0's auc: 0.840742	valid_0's binary_logloss: 0.326181
[2026]	valid_0's auc: 0.840743	valid_0's binary_logloss: 0.326179
[2027]	valid_0's auc: 0.840743	valid_0's binary_logloss: 0.326179
[2028]	valid_0's auc: 0.840745	valid_0's binary_logloss: 0.326178
[2029]	valid_0's auc: 0.840747	valid_0's binary_logloss: 0.326177
[2030]	valid_0's auc: 0.840751	valid_0's binary_logloss: 0.326173
[2031]	valid_0's auc: 0.840753	valid_0's binary_logloss: 0.326173
[2032]	valid_0's auc: 0.840753	valid_0's binary_logloss: 0.326172
[2033]	valid

[2143]	valid_0's auc: 0.841004	valid_0's binary_logloss: 0.325955
[2144]	valid_0's auc: 0.841001	valid_0's binary_logloss: 0.325956
[2145]	valid_0's auc: 0.840999	valid_0's binary_logloss: 0.325957
[2146]	valid_0's auc: 0.841003	valid_0's binary_logloss: 0.325955
[2147]	valid_0's auc: 0.841003	valid_0's binary_logloss: 0.325955
[2148]	valid_0's auc: 0.841006	valid_0's binary_logloss: 0.325952
[2149]	valid_0's auc: 0.841006	valid_0's binary_logloss: 0.325952
[2150]	valid_0's auc: 0.841008	valid_0's binary_logloss: 0.32595
[2151]	valid_0's auc: 0.841014	valid_0's binary_logloss: 0.325947
[2152]	valid_0's auc: 0.841014	valid_0's binary_logloss: 0.325947
[2153]	valid_0's auc: 0.841013	valid_0's binary_logloss: 0.325948
[2154]	valid_0's auc: 0.84101	valid_0's binary_logloss: 0.32595
[2155]	valid_0's auc: 0.84101	valid_0's binary_logloss: 0.325952
[2156]	valid_0's auc: 0.841008	valid_0's binary_logloss: 0.325954
[2157]	valid_0's auc: 0.841013	valid_0's binary_logloss: 0.325949
[2158]	valid_0

[2268]	valid_0's auc: 0.84117	valid_0's binary_logloss: 0.325809
[2269]	valid_0's auc: 0.841167	valid_0's binary_logloss: 0.32581
[2270]	valid_0's auc: 0.841167	valid_0's binary_logloss: 0.325811
[2271]	valid_0's auc: 0.841166	valid_0's binary_logloss: 0.325812
[2272]	valid_0's auc: 0.841167	valid_0's binary_logloss: 0.325809
[2273]	valid_0's auc: 0.841165	valid_0's binary_logloss: 0.325812
[2274]	valid_0's auc: 0.841162	valid_0's binary_logloss: 0.325814
[2275]	valid_0's auc: 0.841163	valid_0's binary_logloss: 0.325813
[2276]	valid_0's auc: 0.84116	valid_0's binary_logloss: 0.325815
[2277]	valid_0's auc: 0.841159	valid_0's binary_logloss: 0.325815
[2278]	valid_0's auc: 0.841159	valid_0's binary_logloss: 0.325816
[2279]	valid_0's auc: 0.841158	valid_0's binary_logloss: 0.325817
[2280]	valid_0's auc: 0.841156	valid_0's binary_logloss: 0.325819
[2281]	valid_0's auc: 0.841159	valid_0's binary_logloss: 0.325816
[2282]	valid_0's auc: 0.841157	valid_0's binary_logloss: 0.325817
[2283]	valid_

[2393]	valid_0's auc: 0.841279	valid_0's binary_logloss: 0.325701
[2394]	valid_0's auc: 0.841279	valid_0's binary_logloss: 0.325701
[2395]	valid_0's auc: 0.841279	valid_0's binary_logloss: 0.325702
[2396]	valid_0's auc: 0.841282	valid_0's binary_logloss: 0.325699
[2397]	valid_0's auc: 0.841283	valid_0's binary_logloss: 0.325698
[2398]	valid_0's auc: 0.84128	valid_0's binary_logloss: 0.325701
[2399]	valid_0's auc: 0.84128	valid_0's binary_logloss: 0.3257
[2400]	valid_0's auc: 0.841278	valid_0's binary_logloss: 0.325702
[2401]	valid_0's auc: 0.841274	valid_0's binary_logloss: 0.325706
[2402]	valid_0's auc: 0.841272	valid_0's binary_logloss: 0.325707
[2403]	valid_0's auc: 0.841272	valid_0's binary_logloss: 0.325707
[2404]	valid_0's auc: 0.841273	valid_0's binary_logloss: 0.325706
[2405]	valid_0's auc: 0.841274	valid_0's binary_logloss: 0.325705
[2406]	valid_0's auc: 0.841277	valid_0's binary_logloss: 0.325703
[2407]	valid_0's auc: 0.841277	valid_0's binary_logloss: 0.325703
[2408]	valid_0

[2518]	valid_0's auc: 0.841382	valid_0's binary_logloss: 0.325627
[2519]	valid_0's auc: 0.841381	valid_0's binary_logloss: 0.325628
[2520]	valid_0's auc: 0.841386	valid_0's binary_logloss: 0.325625
[2521]	valid_0's auc: 0.841386	valid_0's binary_logloss: 0.325624
[2522]	valid_0's auc: 0.841388	valid_0's binary_logloss: 0.325622
[2523]	valid_0's auc: 0.841387	valid_0's binary_logloss: 0.325622
[2524]	valid_0's auc: 0.841384	valid_0's binary_logloss: 0.325625
[2525]	valid_0's auc: 0.841383	valid_0's binary_logloss: 0.325624
[2526]	valid_0's auc: 0.841384	valid_0's binary_logloss: 0.325622
[2527]	valid_0's auc: 0.841385	valid_0's binary_logloss: 0.325622
[2528]	valid_0's auc: 0.841385	valid_0's binary_logloss: 0.325622
[2529]	valid_0's auc: 0.841391	valid_0's binary_logloss: 0.325618
[2530]	valid_0's auc: 0.841391	valid_0's binary_logloss: 0.325618
[2531]	valid_0's auc: 0.841391	valid_0's binary_logloss: 0.325617
[2532]	valid_0's auc: 0.841392	valid_0's binary_logloss: 0.325617
[2533]	val

[2643]	valid_0's auc: 0.841503	valid_0's binary_logloss: 0.32552
[2644]	valid_0's auc: 0.841501	valid_0's binary_logloss: 0.325521
[2645]	valid_0's auc: 0.841502	valid_0's binary_logloss: 0.32552
[2646]	valid_0's auc: 0.841501	valid_0's binary_logloss: 0.32552
[2647]	valid_0's auc: 0.841499	valid_0's binary_logloss: 0.325521
[2648]	valid_0's auc: 0.8415	valid_0's binary_logloss: 0.325521
[2649]	valid_0's auc: 0.841499	valid_0's binary_logloss: 0.325522
[2650]	valid_0's auc: 0.841498	valid_0's binary_logloss: 0.325524
[2651]	valid_0's auc: 0.8415	valid_0's binary_logloss: 0.325522
[2652]	valid_0's auc: 0.841499	valid_0's binary_logloss: 0.325523
[2653]	valid_0's auc: 0.841499	valid_0's binary_logloss: 0.325522
[2654]	valid_0's auc: 0.841501	valid_0's binary_logloss: 0.325521
[2655]	valid_0's auc: 0.841499	valid_0's binary_logloss: 0.325521
[2656]	valid_0's auc: 0.841501	valid_0's binary_logloss: 0.32552
[2657]	valid_0's auc: 0.841497	valid_0's binary_logloss: 0.325522
[2658]	valid_0's a

[2768]	valid_0's auc: 0.841585	valid_0's binary_logloss: 0.325454
[2769]	valid_0's auc: 0.841586	valid_0's binary_logloss: 0.325453
[2770]	valid_0's auc: 0.841592	valid_0's binary_logloss: 0.325448
[2771]	valid_0's auc: 0.841594	valid_0's binary_logloss: 0.325447
[2772]	valid_0's auc: 0.841596	valid_0's binary_logloss: 0.325446
[2773]	valid_0's auc: 0.841597	valid_0's binary_logloss: 0.325444
[2774]	valid_0's auc: 0.8416	valid_0's binary_logloss: 0.32544
[2775]	valid_0's auc: 0.841598	valid_0's binary_logloss: 0.325442
[2776]	valid_0's auc: 0.841599	valid_0's binary_logloss: 0.325441
[2777]	valid_0's auc: 0.841601	valid_0's binary_logloss: 0.325439
[2778]	valid_0's auc: 0.841606	valid_0's binary_logloss: 0.325436
[2779]	valid_0's auc: 0.841603	valid_0's binary_logloss: 0.325437
[2780]	valid_0's auc: 0.84161	valid_0's binary_logloss: 0.325432
[2781]	valid_0's auc: 0.841611	valid_0's binary_logloss: 0.325431
[2782]	valid_0's auc: 0.841609	valid_0's binary_logloss: 0.325432
[2783]	valid_0

[2893]	valid_0's auc: 0.841756	valid_0's binary_logloss: 0.325306
[2894]	valid_0's auc: 0.841757	valid_0's binary_logloss: 0.325307
[2895]	valid_0's auc: 0.841753	valid_0's binary_logloss: 0.325309
[2896]	valid_0's auc: 0.841759	valid_0's binary_logloss: 0.325306
[2897]	valid_0's auc: 0.841758	valid_0's binary_logloss: 0.325306
[2898]	valid_0's auc: 0.841759	valid_0's binary_logloss: 0.325305
[2899]	valid_0's auc: 0.841759	valid_0's binary_logloss: 0.325306
[2900]	valid_0's auc: 0.84176	valid_0's binary_logloss: 0.325305
[2901]	valid_0's auc: 0.841759	valid_0's binary_logloss: 0.325306
[2902]	valid_0's auc: 0.841764	valid_0's binary_logloss: 0.325303
[2903]	valid_0's auc: 0.841764	valid_0's binary_logloss: 0.325304
[2904]	valid_0's auc: 0.841766	valid_0's binary_logloss: 0.325302
[2905]	valid_0's auc: 0.841767	valid_0's binary_logloss: 0.325302
[2906]	valid_0's auc: 0.841766	valid_0's binary_logloss: 0.325302
[2907]	valid_0's auc: 0.841766	valid_0's binary_logloss: 0.325302
[2908]	vali