In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
import logging

In [2]:
import pickle
from tqdm import tqdm, tqdm_notebook, _tqdm_notebook, tqdm_pandas

In [3]:
log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))

def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

In [6]:
def parse_str(d):
    return np.array(list(map(float, d.split())))
def parse_list_1(d):
    if d == '-1':
        return [0]
    return list(map(lambda x: int(x[1:]), str(d).split(',')))
def parse_map(d):
    if d == '-1':
        return {}
    return dict([int(z.split(':')[0][1:]), float(z.split(':')[1])] for z in d.split(','))

In [7]:
base_path = '../data'

In [8]:
# 加载邀请回答数据

train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_1_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

sub = test.copy()

sub_size = len(sub)

train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])
del train['dt'], test['dt']


[2019-12-12 22:28:54,643] INFO in <ipython-input-8-12241a91563a>: invite (9489162, 4)
[2019-12-12 22:28:57,865] INFO in <ipython-input-8-12241a91563a>: test (1141683, 3)


In [9]:
# 加载问题
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'q_topic']
# del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
del ques['title_t1'],ques['desc_t1']
logging.info("ques %s", ques.shape)

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
del ques['q_dt']

[2019-12-12 22:30:30,386] INFO in <ipython-input-9-8ea131990986>: ques (1829900, 5)


In [10]:
# ques['关注话题'] = train['关注话题'].apply(parse_list_1)
ques['title_t2'] = ques['title_t2'].apply(parse_list_1)
ques['desc_t2'] = ques['desc_t2'].apply(parse_list_1)
ques['q_topic'] = ques['q_topic'].apply(parse_list_1)
# train['感兴趣话题'] = train['感兴趣话题'].apply(parse_map)
ques.head()

Unnamed: 0,qid,title_t2,desc_t2,q_topic,q_day,q_hour
0,Q2234111670,"[22414, 963, 10458]",[0],"[321, 730, 5784, 4389]",1018,5
1,Q760329790,"[12677, 16829, 15201, 6419, 101839]","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]",1745,20
2,Q741313548,"[700, 2781, 3280, 81215]","[732, 24400, 48321, 39608, 20788, 219486, 1183...",[226],2032,21
3,Q3481466230,"[3312, 1823, 1505, 638, 166, 461]","[6642, 4214, 3312, 1505, 2205, 232, 294, 7177,...","[51, 4468]",2185,15
4,Q3966197028,"[700, 895, 2253]",[0],"[54700, 81, 57, 17670, 43574]",2269,17


In [11]:
topicmap = pd.read_csv('../data/topic_vectors_64d.txt', 
                          names=['id', 'embed'], sep='\t')
topicmap['embed'] = topicmap['embed'].apply(parse_str)
topicmap['id'] = topicmap['id'].apply(lambda x: int(x[1:]))
topicmap.head()

Unnamed: 0,id,embed
0,1,"[0.16508673, -0.0037432343, -0.058245048, -0.0..."
1,2,"[1.608256, -1.0515573, -1.1897708, 1.1820835, ..."
2,3,"[3.3307428, -0.43252096, -2.1518784, -1.439003..."
3,4,"[2.4698818, -0.12998039, -0.4648351, 0.8796743..."
4,5,"[1.562477, -1.3560516, -0.3271215, -0.06341907..."


In [12]:
topicmap.shape

(100000, 2)

In [13]:
topic_vector_dict = dict(zip(np.array(topicmap['id']), np.array(topicmap['embed'])))

In [14]:
def topic2v(x):
    try:
        tmp = topic_vector_dict[x[0]]
    except:
        tmp = np.zeros(64)
    for i in x[1:]:
        tmp = tmp + topic_vector_dict[i]
    if len(tmp) == 0:
        return np.zeros(64)
    return (tmp / len(x))

In [15]:
tqdm.pandas(desc="topic2v...")
ques['q_topic']=ques['q_topic'].progress_apply(lambda x:topic2v(x))
print('finished!')

topic2v...: 100%|██████████| 1829900/1829900 [00:55<00:00, 33144.70it/s]


finished!


In [16]:
wordmap = pd.read_csv('../data/word_vectors_64d.txt', 
                          names=['id', 'embed'], sep='\t')
wordmap['embed'] = wordmap['embed'].apply(parse_str)
wordmap['id'] = wordmap['id'].apply(lambda x: int(x[1:]))
wordmap.head()

Unnamed: 0,id,embed
0,1,"[0.12561196, -0.57268924, -0.14478925, -0.0524..."
1,2,"[3.224765, 2.2482696, -0.511986, -0.5329892, -..."
2,3,"[-0.985937, 0.11307016, 0.012898494, -0.682206..."
3,4,"[-0.3367663, 0.039051324, 0.8155926, 0.8351733..."
4,5,"[0.3074205, -1.0977745, 0.7528213, 0.6299011, ..."


In [17]:
wordmap.shape
word_vector_dict = dict(zip(np.array(wordmap['id']), np.array(wordmap['embed'])))

In [18]:
def word2v(x):
    try:
        tmp = word_vector_dict[x[0]]
    except:
        tmp = np.zeros(64)
    for i in x[1:]:
        tmp = tmp + word_vector_dict[i]
    if len(tmp) == 0:
        return np.zeros(64)
    return (tmp / len(x))

In [19]:
tqdm.pandas(desc="word2v...")
ques['title_t2']=ques['title_t2'].progress_apply(lambda x:word2v(x))
print('finished!')

word2v...: 100%|██████████| 1829900/1829900 [01:23<00:00, 21992.18it/s]


finished!


In [20]:
tqdm.pandas(desc="word2v...")
ques['desc_t2']=ques['desc_t2'].progress_apply(lambda x:word2v(x))
print('finished!')

word2v...: 100%|██████████| 1829900/1829900 [01:43<00:00, 17677.80it/s]


finished!


In [21]:
ques.head()

Unnamed: 0,qid,title_t2,desc_t2,q_topic,q_day,q_hour
0,Q2234111670,"[3.4942667999999997, -0.8552949333333334, -4.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1.1211431250000001, 2.9202411, -0.0973278000...",1018,5
1,Q760329790,"[1.1900169984, -4.05578632, -3.945152239999999...","[0.040301072461538384, -1.3610320753846152, -3...","[-2.680529233333333, -5.466671466666667, -3.97...",1745,20
2,Q741313548,"[1.39550311, -0.394145575, -1.2251911, -0.3720...","[4.4959643713999995, 1.582830815, 0.0903138088...","[1.1679975, -0.9902606, 2.8614578, 2.517082, -...",2032,21
3,Q3481466230,"[0.5512061833333333, 0.0740648883333332, 1.733...","[1.5809253704166668, -1.2889901612499999, -0.0...","[-1.4455721, -1.401717435, 1.141536675, 3.5025...",2185,15
4,Q3966197028,"[-0.6215633333333334, 0.6605363, -0.6174622666...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1.281498462, 1.8468875400000002, -2.340192636...",2269,17


In [22]:
def wordadd(x):
    try:
        tmp = 0.0
    except:
        tmp = 0.0
    tmp=x.sum()
    return (tmp /64)

In [23]:
tqdm.pandas(desc="wordadd...")
ques['title_t2']=ques['title_t2'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1829900/1829900 [00:41<00:00, 44497.29it/s]


finished!


In [24]:
tqdm.pandas(desc="wordadd...")
ques['desc_t2']=ques['desc_t2'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1829900/1829900 [00:41<00:00, 44336.25it/s]


finished!


In [25]:
tqdm.pandas(desc="wordadd...")
ques['q_topic']=ques['q_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1829900/1829900 [00:38<00:00, 47340.23it/s]


finished!


In [26]:
ques.head()

Unnamed: 0,qid,title_t2,desc_t2,q_topic,q_day,q_hour
0,Q2234111670,0.020175,0.0,-0.24888,1018,5
1,Q760329790,-0.633451,-0.426139,-0.274191,1745,20
2,Q741313548,-0.327227,-0.076127,0.024638,2032,21
3,Q3481466230,-0.327119,-0.295625,-0.016809,2185,15
4,Q3966197028,-0.674786,0.0,-0.239849,2269,17


In [27]:
# 加载回答
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
del ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

ans = pd.merge(ans, ques, on='qid')
del ques

[2019-12-12 22:42:17,800] INFO in <ipython-input-27-0f04b531e5be>: ans (4513735, 18)


In [28]:
print(ans['a_day'].min())
print(ans['a_day'].max())

3807
3867


In [29]:
# 回答距提问的天数
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']

[2019-12-12 22:42:53,781] INFO in utils: Note: NumExpr detected 24 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[2019-12-12 22:42:53,787] INFO in utils: NumExpr defaulting to 8 threads.


In [30]:
# 时间窗口划分
# train
# val
train_start = 3838
train_end = 3867

val_start = 3868
val_end = 3874

label_end = 3867
label_start = label_end - 6

train_label_feature_end = label_end - 7
train_label_feature_start = train_label_feature_end - 22

train_ans_feature_end = label_end - 7
train_ans_feature_start = train_ans_feature_end - 50

val_label_feature_end = val_start - 1
val_label_feature_start = val_label_feature_end - 22

val_ans_feature_end = val_start - 1
val_ans_feature_start = val_ans_feature_end - 50

train_label_feature = train[(train['day'] >= train_label_feature_start) & (train['day'] <= train_label_feature_end)]
logging.info("train_label_feature %s", train_label_feature.shape)

val_label_feature = train[(train['day'] >= val_label_feature_start) & (train['day'] <= val_label_feature_end)]
logging.info("val_label_feature %s", val_label_feature.shape)

train_label = train[(train['day'] > train_label_feature_end)]

logging.info("train feature start %s end %s, label start %s end %s", train_label_feature['day'].min(),
             train_label_feature['day'].max(), train_label['day'].min(), train_label['day'].max())

logging.info("test feature start %s end %s, label start %s end %s", val_label_feature['day'].min(),
             val_label_feature['day'].max(), test['day'].min(), test['day'].max())

[2019-12-12 22:42:55,200] INFO in <ipython-input-30-413cbf6f4a7d>: train_label_feature (6895493, 5)
[2019-12-12 22:42:55,989] INFO in <ipython-input-30-413cbf6f4a7d>: val_label_feature (7583553, 5)
[2019-12-12 22:42:56,495] INFO in <ipython-input-30-413cbf6f4a7d>: train feature start 3838 end 3860, label start 3861 end 3867
[2019-12-12 22:42:56,541] INFO in <ipython-input-30-413cbf6f4a7d>: test feature start 3845 end 3867, label start 3868 end 3874


In [31]:
# 确定ans的时间范围
# 3807~3874
train_ans_feature = ans[(ans['a_day'] >= train_ans_feature_start) & (ans['a_day'] <= train_ans_feature_end)]

val_ans_feature = ans[(ans['a_day'] >= val_ans_feature_start) & (ans['a_day'] <= val_ans_feature_end)]

logging.info("train ans feature %s, start %s end %s", train_ans_feature.shape, train_ans_feature['a_day'].min(),
             train_ans_feature['a_day'].max())

logging.info("val ans feature %s, start %s end %s", val_ans_feature.shape, val_ans_feature['a_day'].min(),
             val_ans_feature['a_day'].max())

fea_cols = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
            'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
            'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']


def extract_feature1(target, label_feature, ans_feature):
    # 问题特征
    t1 = label_feature.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['qid', 'q_inv_mean', 'q_inv_sum', 'q_inv_std', 'q_inv_count']
    target = pd.merge(target, t1, on='qid', how='left')

    # 用户特征
    t1 = label_feature.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['uid', 'u_inv_mean', 'u_inv_sum', 'u_inv_std', 'u_inv_count']
    target = pd.merge(target, t1, on='uid', how='left')
    #
    # train_size = len(train)
    # data = pd.concat((train, test), sort=True)

    # 回答部分特征

    t1 = ans_feature.groupby('qid')['aid'].count().reset_index()
    t1.columns = ['qid', 'q_ans_count']
    target = pd.merge(target, t1, on='qid', how='left')

    t1 = ans_feature.groupby('uid')['aid'].count().reset_index()
    t1.columns = ['uid', 'u_ans_count']
    target = pd.merge(target, t1, on='uid', how='left')

    for col in fea_cols:
        t1 = ans_feature.groupby('uid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['uid', f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        target = pd.merge(target, t1, on='uid', how='left')

        t1 = ans_feature.groupby('qid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['qid', f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
        target = pd.merge(target, t1, on='qid', how='left')
        logging.info("extract %s", col)
    return target


train_label = extract_feature1(train_label, train_label_feature, train_ans_feature)
test = extract_feature1(test, val_label_feature, val_ans_feature)

[2019-12-12 22:42:59,701] INFO in <ipython-input-31-0511893e776f>: train ans feature (3700178, 25), start 3810 end 3860
[2019-12-12 22:42:59,730] INFO in <ipython-input-31-0511893e776f>: val ans feature (3992334, 25), start 3817 end 3867
[2019-12-12 22:44:42,901] INFO in <ipython-input-31-0511893e776f>: extract is_good
[2019-12-12 22:45:17,870] INFO in <ipython-input-31-0511893e776f>: extract is_rec
[2019-12-12 22:45:52,787] INFO in <ipython-input-31-0511893e776f>: extract is_dest
[2019-12-12 22:46:25,492] INFO in <ipython-input-31-0511893e776f>: extract has_img
[2019-12-12 22:46:57,920] INFO in <ipython-input-31-0511893e776f>: extract has_video
[2019-12-12 22:47:29,471] INFO in <ipython-input-31-0511893e776f>: extract word_count
[2019-12-12 22:48:02,329] INFO in <ipython-input-31-0511893e776f>: extract reci_cheer
[2019-12-12 22:48:34,474] INFO in <ipython-input-31-0511893e776f>: extract reci_uncheer
[2019-12-12 22:49:06,776] INFO in <ipython-input-31-0511893e776f>: extract reci_commen

In [32]:
# 特征提取结束
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)
assert len(test) == sub_size

# 加载用户
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic']

[2019-12-12 23:00:39,834] INFO in <ipython-input-32-d70daf315b0b>: train shape (2593669, 105), test shape (1141683, 104)


In [33]:
dit = {'daily': 4, 'weekly': 3, 'monthly': 2, 'new': 1,'unknow':0}
user['freq'] = user['freq'].map(dit)

In [34]:
user = user.drop(['creat_keyword','level','hot','reg_type','reg_plat'],axis=1)

In [35]:
user.head()

Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,2.0,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,T540,"T21107:1.7915097,T405:1.6123838,T4436:1.518003..."
1,M595924114,male,4.0,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"T44126,T15940,T839,T8978,T2934,T1113,T3914,T12...","T18016:2.0650618,T2384:1.2503042,T1142:1.13569..."
2,M1473482940,female,3.0,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,"T30874,T2113,T8656,T21,T523,T8,T116,T5727,T68,...","T46:1.330939,T2159:1.1296458,T379:1.1241927,T1..."
3,M578477092,male,4.0,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,"T946,T7323,T297,T2660,T36067,T53107,T2654,T507...","T15918:1.9479566,T8106:1.8578106,T4787:1.58486..."
4,M1088794709,male,3.0,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,"T582,T558,T28776,T5186,T9081,T2419,T2693,T2299...","T65:1.5992582,T867:1.3179373,T93:1.2095009,T31..."


In [36]:
user['follow_topic'] = user['follow_topic'].apply(parse_list_1)
user['inter_topic'] = user['inter_topic'].apply(parse_map)

In [37]:
def topic_interest2v(x):
    if len(x)==0:
        return np.zeros(64)
    else:
        tmp=np.zeros(64)
        for i in x:
            tmp = tmp + topic_vector_dict[i]*x[i]
        return (tmp / len(x))

In [38]:
tqdm.pandas(desc="topic2v...")
user['follow_topic']=user['follow_topic'].progress_apply(lambda x:topic2v(x))
print('finished!')

topic2v...: 100%|██████████| 1931654/1931654 [03:04<00:00, 10454.86it/s]


finished!


In [39]:
tqdm.pandas(desc="topic_interest2v...")
user['inter_topic']=user['inter_topic'].progress_apply(lambda x:topic_interest2v(x))
user.head()

topic_interest2v...: 100%|██████████| 1931654/1931654 [02:48<00:00, 11459.31it/s]


Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,2.0,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,"[6.125305, -1.4180568, -1.3143845, -2.9268239,...","[0.5255145018431924, 2.8507211193389996, 0.571..."
1,M595924114,male,4.0,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,"[0.30575216599999994, -0.9980522930666669, 0.0...","[-1.9947685124736019, 1.3387408371544118, -0.3..."
2,M1473482940,female,3.0,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,"[0.1508028030833333, 0.7434654864583335, -0.25...","[-0.25824397617484, 0.6167275103940157, -0.075..."
3,M578477092,male,4.0,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,"[1.4605925141999998, -0.33966116570100013, 0.3...","[1.4908855224214492, 1.4438666038597823, 1.179..."
4,M1088794709,male,3.0,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,"[0.9643650885294115, 0.31516375361764704, 0.82...","[1.5879302100003165, -1.1453530874701519, -0.7..."


In [40]:
tqdm.pandas(desc="wordadd...")
user['follow_topic']=user['follow_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1931654/1931654 [00:37<00:00, 51411.39it/s]


finished!


In [41]:
tqdm.pandas(desc="wordadd...")
user['inter_topic']=user['inter_topic'].progress_apply(lambda x:wordadd(x))
print('finished!')

wordadd...: 100%|██████████| 1931654/1931654 [00:43<00:00, 44627.38it/s]


finished!


In [42]:
user.head()

Unnamed: 0,uid,gender,freq,uf_b1,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,M1934753188,male,2.0,0,1,0,1,0,MD470265,BR470265,PV929066,CT929066,PF470265,764,-0.14911,0.178521
1,M595924114,male,4.0,0,0,0,1,1,MD195122,BR596936,PV002320,CT840234,PF470265,671,0.04439,-0.144847
2,M1473482940,female,3.0,0,1,0,1,0,MD116493,BR641329,PV170953,CT470265,PF470265,454,0.020901,0.048022
3,M578477092,male,4.0,1,1,0,1,0,MD889589,BR803759,PV545833,CT545833,PF470265,588,-0.013006,-0.074952
4,M1088794709,male,3.0,0,1,0,0,0,MD825760,BR641329,PV071037,CT470265,PF470265,361,-0.100348,-0.234183


In [43]:

logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid','freq']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

q_lb = LabelEncoder()
q_lb.fit(list(train_label['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train_label['qid_enc'] = q_lb.transform(train_label['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train_label['uid_enc'] = u_lb.transform(train_label['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

# merge user
train_label = pd.merge(train_label, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)


# del train_label, test

[2019-12-12 23:10:40,441] INFO in <ipython-input-43-d32dfab0c918>: user (1931654, 16)
[2019-12-12 23:10:52,937] INFO in <ipython-input-43-d32dfab0c918>: user unq uid             1931654
gender                3
freq                  4
uf_b1                 2
uf_b2                 2
uf_b3                 2
uf_b4                 2
uf_b5                 2
uf_c1              2561
uf_c2               291
uf_c3               428
uf_c4              1556
uf_c5                 2
score               732
follow_topic    1288996
inter_topic     1399663
dtype: int64
[2019-12-12 23:10:52,955] INFO in <ipython-input-43-d32dfab0c918>: user cat ['gender', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
[2019-12-12 23:10:54,217] INFO in <ipython-input-43-d32dfab0c918>: encode gender
[2019-12-12 23:10:55,529] INFO in <ipython-input-43-d32dfab0c918>: encode uf_c1
[2019-12-12 23:10:56,778] INFO in <ipython-input-43-d32dfab0c918>: encode uf_c2
[2019-12-12 23:10:57,823] INFO in <ipython-input-43-d32dfab0c918>: e

In [44]:
train_label.head()

Unnamed: 0,qid,uid,label,day,hour,q_inv_mean,q_inv_sum,q_inv_std,q_inv_count,u_inv_mean,...,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,Q2166419046,M401693808,0,3865,22,,,,,0.0,...,0,0,2113,190,261,927,1,297,-0.12217,0.021705
1,Q604029601,M2317670257,0,3862,15,,,,,0.090909,...,0,0,1519,229,0,506,1,415,-0.015598,-0.182562
2,Q2443223942,M3544409350,0,3867,4,0.375,57.0,0.485723,152.0,0.0,...,0,0,551,226,188,815,1,296,0.0,0.070942
3,Q795459266,M2818659842,0,3861,20,0.166667,1.0,0.408248,6.0,0.285714,...,0,0,1519,229,0,506,1,380,-0.302475,-0.191518
4,Q110462128,M848334644,1,3862,8,,,,,0.634146,...,0,0,2161,31,396,1438,1,719,-0.095008,-0.176807


In [45]:
test.head()

Unnamed: 0,qid,uid,day,hour,q_inv_mean,q_inv_sum,q_inv_std,q_inv_count,u_inv_mean,u_inv_sum,...,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score,follow_topic,inter_topic
0,Q1493039281,M64135255,3870,9,0.0,0.0,0.0,2.0,0.166667,1.0,...,0,0,1190,130,201,1374,1,370,-0.15902,-0.039165
1,Q2023398782,M2536956560,3872,22,,,,,0.0,0.0,...,1,0,1190,130,394,519,1,291,0.024704,-0.116434
2,Q4151338694,M3294926344,3874,15,,,,,0.111111,1.0,...,0,0,1190,130,203,443,1,451,0.0,-0.060149
3,Q3271436624,M3744310794,3873,4,0.5,3.0,0.547723,6.0,0.0,0.0,...,0,0,1190,130,186,1036,0,416,-0.123365,-0.227317
4,Q3314287018,M1349051752,3872,19,,,,,0.0,0.0,...,0,0,381,190,334,786,1,316,-0.173229,0.07879


In [46]:
data = pd.concat((train_label, test), axis=0, sort=True)

In [47]:
data.head()

Unnamed: 0,day,follow_topic,freq,gender,hour,inter_topic,label,q_ans_count,q_diff_qa_days_max,q_diff_qa_days_mean,...,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,uid,uid_enc
0,3865,-0.12217,3.0,2,22,0.021705,0.0,,,,...,0,0,0,2113,190,261,927,1,M401693808,1508098
1,3862,-0.015598,3.0,2,15,-0.182562,0.0,,,,...,0,0,0,1519,229,0,506,1,M2317670257,657985
2,3867,0.0,2.0,2,4,0.070942,0.0,32.0,13.0,9.53125,...,0,0,0,551,226,188,815,1,M3544409350,1272353
3,3861,-0.302475,4.0,1,20,-0.191518,0.0,3.0,234.0,222.333333,...,0,0,0,1519,229,0,506,1,M2818659842,909154
4,3862,-0.095008,3.0,0,8,-0.176807,1.0,,,,...,0,0,0,2161,31,396,1438,1,M848334644,1856019


In [48]:
# # count编码
# count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
# for feat in count_fea:
#     col_name = '{}_count'.format(feat)
#     data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
#     data.loc[data[col_name] < 2, feat] = -1
#     data[feat] += 1
#     data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
#     data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())
#     # 

# # 问题被回答的次数

In [49]:
# 压缩数据
t = data.dtypes
for x in t[t == 'int64'].index:
    data[x] = data[x].astype('int32')

for x in t[t == 'float64'].index:
    data[x] = data[x].astype('float32')

data['wk'] = data['day'] % 7

feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

In [None]:
# target编码
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len(train_label)][feature_cols]
y_train_all = data.iloc[:len(train_label)]['label']
test = data.iloc[len(train_label):]
# del data
assert len(test) == sub_size

logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
del X_train_all

model_lgb = LGBMClassifier(n_estimators=2000, n_jobs=-1, objective='binary', seed=1000, silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=100)

sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]


sub.to_csv('../example/result_1212.txt', index=None, header=None, sep='\t')

[2019-12-12 23:15:58,848] INFO in <ipython-input-50-20da7376f2c4>: feature size 119
[2019-12-12 23:15:59,770] INFO in <ipython-input-50-20da7376f2c4>: train shape (2593669, 122), test shape (1141683, 123)


[1]	valid_0's auc: 0.725702	valid_0's binary_logloss: 0.42997
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.731656	valid_0's binary_logloss: 0.422522
[3]	valid_0's auc: 0.734278	valid_0's binary_logloss: 0.416605
[4]	valid_0's auc: 0.735336	valid_0's binary_logloss: 0.411852
[5]	valid_0's auc: 0.738737	valid_0's binary_logloss: 0.407635
[6]	valid_0's auc: 0.739843	valid_0's binary_logloss: 0.404274
[7]	valid_0's auc: 0.741071	valid_0's binary_logloss: 0.401308
[8]	valid_0's auc: 0.7422	valid_0's binary_logloss: 0.398795
[9]	valid_0's auc: 0.743582	valid_0's binary_logloss: 0.396563
[10]	valid_0's auc: 0.744597	valid_0's binary_logloss: 0.394657
[11]	valid_0's auc: 0.74533	valid_0's binary_logloss: 0.393003
[12]	valid_0's auc: 0.745973	valid_0's binary_logloss: 0.391589
[13]	valid_0's auc: 0.746726	valid_0's binary_logloss: 0.390284
[14]	valid_0's auc: 0.74763	valid_0's binary_logloss: 0.389097
[15]	valid_0's auc: 0.748352	valid_0's binary_logloss: 0

[129]	valid_0's auc: 0.768433	valid_0's binary_logloss: 0.372102
[130]	valid_0's auc: 0.768495	valid_0's binary_logloss: 0.372068
[131]	valid_0's auc: 0.768517	valid_0's binary_logloss: 0.372054
[132]	valid_0's auc: 0.768552	valid_0's binary_logloss: 0.372037
[133]	valid_0's auc: 0.768601	valid_0's binary_logloss: 0.372015
[134]	valid_0's auc: 0.768632	valid_0's binary_logloss: 0.371995
[135]	valid_0's auc: 0.768646	valid_0's binary_logloss: 0.371988
[136]	valid_0's auc: 0.76867	valid_0's binary_logloss: 0.371968
[137]	valid_0's auc: 0.768725	valid_0's binary_logloss: 0.371936
[138]	valid_0's auc: 0.76879	valid_0's binary_logloss: 0.371897
[139]	valid_0's auc: 0.768841	valid_0's binary_logloss: 0.371869
[140]	valid_0's auc: 0.768861	valid_0's binary_logloss: 0.371856
[141]	valid_0's auc: 0.768898	valid_0's binary_logloss: 0.371831
[142]	valid_0's auc: 0.768936	valid_0's binary_logloss: 0.371811
[143]	valid_0's auc: 0.768961	valid_0's binary_logloss: 0.371796
[144]	valid_0's auc: 0.7690

[256]	valid_0's auc: 0.771683	valid_0's binary_logloss: 0.370239
[257]	valid_0's auc: 0.771691	valid_0's binary_logloss: 0.370231
[258]	valid_0's auc: 0.771705	valid_0's binary_logloss: 0.370223
[259]	valid_0's auc: 0.771717	valid_0's binary_logloss: 0.370217
[260]	valid_0's auc: 0.771734	valid_0's binary_logloss: 0.370207
[261]	valid_0's auc: 0.771778	valid_0's binary_logloss: 0.370185
[262]	valid_0's auc: 0.771792	valid_0's binary_logloss: 0.370178
[263]	valid_0's auc: 0.771824	valid_0's binary_logloss: 0.370163
[264]	valid_0's auc: 0.771832	valid_0's binary_logloss: 0.370159
[265]	valid_0's auc: 0.771838	valid_0's binary_logloss: 0.370155
[266]	valid_0's auc: 0.771869	valid_0's binary_logloss: 0.370141
[267]	valid_0's auc: 0.771892	valid_0's binary_logloss: 0.370131
[268]	valid_0's auc: 0.771903	valid_0's binary_logloss: 0.370125
[269]	valid_0's auc: 0.771927	valid_0's binary_logloss: 0.370111
[270]	valid_0's auc: 0.771936	valid_0's binary_logloss: 0.370105
[271]	valid_0's auc: 0.77

[383]	valid_0's auc: 0.773569	valid_0's binary_logloss: 0.369154
[384]	valid_0's auc: 0.773574	valid_0's binary_logloss: 0.369151
[385]	valid_0's auc: 0.773591	valid_0's binary_logloss: 0.36914
[386]	valid_0's auc: 0.77359	valid_0's binary_logloss: 0.369141
[387]	valid_0's auc: 0.773599	valid_0's binary_logloss: 0.369137
[388]	valid_0's auc: 0.773606	valid_0's binary_logloss: 0.369135
[389]	valid_0's auc: 0.773619	valid_0's binary_logloss: 0.369128
[390]	valid_0's auc: 0.773628	valid_0's binary_logloss: 0.369122
[391]	valid_0's auc: 0.773643	valid_0's binary_logloss: 0.369113
[392]	valid_0's auc: 0.77366	valid_0's binary_logloss: 0.369101
[393]	valid_0's auc: 0.773684	valid_0's binary_logloss: 0.369081
[394]	valid_0's auc: 0.773698	valid_0's binary_logloss: 0.369066
[395]	valid_0's auc: 0.773722	valid_0's binary_logloss: 0.36905
[396]	valid_0's auc: 0.773755	valid_0's binary_logloss: 0.369033
[397]	valid_0's auc: 0.773782	valid_0's binary_logloss: 0.369018
[398]	valid_0's auc: 0.773779

[510]	valid_0's auc: 0.77501	valid_0's binary_logloss: 0.368312
[511]	valid_0's auc: 0.775005	valid_0's binary_logloss: 0.368314
[512]	valid_0's auc: 0.775007	valid_0's binary_logloss: 0.36831
[513]	valid_0's auc: 0.775007	valid_0's binary_logloss: 0.368309
[514]	valid_0's auc: 0.775009	valid_0's binary_logloss: 0.368308
[515]	valid_0's auc: 0.775009	valid_0's binary_logloss: 0.368307
[516]	valid_0's auc: 0.775025	valid_0's binary_logloss: 0.3683
[517]	valid_0's auc: 0.775031	valid_0's binary_logloss: 0.368296
[518]	valid_0's auc: 0.775032	valid_0's binary_logloss: 0.368294
[519]	valid_0's auc: 0.775051	valid_0's binary_logloss: 0.368279
[520]	valid_0's auc: 0.775061	valid_0's binary_logloss: 0.368274
[521]	valid_0's auc: 0.775065	valid_0's binary_logloss: 0.368271
[522]	valid_0's auc: 0.775091	valid_0's binary_logloss: 0.368259
[523]	valid_0's auc: 0.775095	valid_0's binary_logloss: 0.368257
[524]	valid_0's auc: 0.775123	valid_0's binary_logloss: 0.368243
[525]	valid_0's auc: 0.775122

[637]	valid_0's auc: 0.776196	valid_0's binary_logloss: 0.367606
[638]	valid_0's auc: 0.776203	valid_0's binary_logloss: 0.367602
[639]	valid_0's auc: 0.776237	valid_0's binary_logloss: 0.367587
[640]	valid_0's auc: 0.776274	valid_0's binary_logloss: 0.36756
[641]	valid_0's auc: 0.776281	valid_0's binary_logloss: 0.367553
[642]	valid_0's auc: 0.776299	valid_0's binary_logloss: 0.367543
[643]	valid_0's auc: 0.776305	valid_0's binary_logloss: 0.36754
[644]	valid_0's auc: 0.776318	valid_0's binary_logloss: 0.367531
[645]	valid_0's auc: 0.776319	valid_0's binary_logloss: 0.367529
[646]	valid_0's auc: 0.776317	valid_0's binary_logloss: 0.36753
[647]	valid_0's auc: 0.776335	valid_0's binary_logloss: 0.36752
[648]	valid_0's auc: 0.776348	valid_0's binary_logloss: 0.367511
[649]	valid_0's auc: 0.776359	valid_0's binary_logloss: 0.367505
[650]	valid_0's auc: 0.776366	valid_0's binary_logloss: 0.367503
[651]	valid_0's auc: 0.776367	valid_0's binary_logloss: 0.367502
[652]	valid_0's auc: 0.776383

[764]	valid_0's auc: 0.777237	valid_0's binary_logloss: 0.366975
[765]	valid_0's auc: 0.777266	valid_0's binary_logloss: 0.36696
[766]	valid_0's auc: 0.777284	valid_0's binary_logloss: 0.366951
[767]	valid_0's auc: 0.777286	valid_0's binary_logloss: 0.36695
[768]	valid_0's auc: 0.777283	valid_0's binary_logloss: 0.366952
[769]	valid_0's auc: 0.777281	valid_0's binary_logloss: 0.366953
[770]	valid_0's auc: 0.777291	valid_0's binary_logloss: 0.366946
[771]	valid_0's auc: 0.77732	valid_0's binary_logloss: 0.366932
[772]	valid_0's auc: 0.777326	valid_0's binary_logloss: 0.366929
[773]	valid_0's auc: 0.77734	valid_0's binary_logloss: 0.366921
[774]	valid_0's auc: 0.777354	valid_0's binary_logloss: 0.366913
[775]	valid_0's auc: 0.777374	valid_0's binary_logloss: 0.366903
[776]	valid_0's auc: 0.777395	valid_0's binary_logloss: 0.366891
[777]	valid_0's auc: 0.777402	valid_0's binary_logloss: 0.366885
[778]	valid_0's auc: 0.777417	valid_0's binary_logloss: 0.366876
[779]	valid_0's auc: 0.777416

[891]	valid_0's auc: 0.778289	valid_0's binary_logloss: 0.366348
[892]	valid_0's auc: 0.778299	valid_0's binary_logloss: 0.366339
[893]	valid_0's auc: 0.778301	valid_0's binary_logloss: 0.366338
[894]	valid_0's auc: 0.778302	valid_0's binary_logloss: 0.366337
[895]	valid_0's auc: 0.7783	valid_0's binary_logloss: 0.366339
[896]	valid_0's auc: 0.778299	valid_0's binary_logloss: 0.36634
[897]	valid_0's auc: 0.778315	valid_0's binary_logloss: 0.366329
[898]	valid_0's auc: 0.778326	valid_0's binary_logloss: 0.366323
[899]	valid_0's auc: 0.778333	valid_0's binary_logloss: 0.366319
[900]	valid_0's auc: 0.778339	valid_0's binary_logloss: 0.366316
[901]	valid_0's auc: 0.77834	valid_0's binary_logloss: 0.366316
[902]	valid_0's auc: 0.778335	valid_0's binary_logloss: 0.366319
[903]	valid_0's auc: 0.778347	valid_0's binary_logloss: 0.366314
[904]	valid_0's auc: 0.778344	valid_0's binary_logloss: 0.366315
[905]	valid_0's auc: 0.778346	valid_0's binary_logloss: 0.366314
[906]	valid_0's auc: 0.778343

[1018]	valid_0's auc: 0.778905	valid_0's binary_logloss: 0.365986
[1019]	valid_0's auc: 0.778906	valid_0's binary_logloss: 0.365984
[1020]	valid_0's auc: 0.778916	valid_0's binary_logloss: 0.365978
[1021]	valid_0's auc: 0.778925	valid_0's binary_logloss: 0.365973
[1022]	valid_0's auc: 0.778929	valid_0's binary_logloss: 0.365973
[1023]	valid_0's auc: 0.778929	valid_0's binary_logloss: 0.365973
[1024]	valid_0's auc: 0.778937	valid_0's binary_logloss: 0.365967
[1025]	valid_0's auc: 0.778958	valid_0's binary_logloss: 0.365957
[1026]	valid_0's auc: 0.778965	valid_0's binary_logloss: 0.365954
[1027]	valid_0's auc: 0.778961	valid_0's binary_logloss: 0.365956
[1028]	valid_0's auc: 0.778976	valid_0's binary_logloss: 0.365948
[1029]	valid_0's auc: 0.778976	valid_0's binary_logloss: 0.365949
[1030]	valid_0's auc: 0.778979	valid_0's binary_logloss: 0.365946
[1031]	valid_0's auc: 0.778976	valid_0's binary_logloss: 0.365948
[1032]	valid_0's auc: 0.778982	valid_0's binary_logloss: 0.365944
[1033]	val

[1143]	valid_0's auc: 0.779482	valid_0's binary_logloss: 0.365652
[1144]	valid_0's auc: 0.779476	valid_0's binary_logloss: 0.365656
[1145]	valid_0's auc: 0.77949	valid_0's binary_logloss: 0.365646
[1146]	valid_0's auc: 0.77949	valid_0's binary_logloss: 0.365647
[1147]	valid_0's auc: 0.77949	valid_0's binary_logloss: 0.365645
[1148]	valid_0's auc: 0.779506	valid_0's binary_logloss: 0.365637
[1149]	valid_0's auc: 0.779501	valid_0's binary_logloss: 0.365639
[1150]	valid_0's auc: 0.779503	valid_0's binary_logloss: 0.365638
[1151]	valid_0's auc: 0.779509	valid_0's binary_logloss: 0.365634
[1152]	valid_0's auc: 0.779515	valid_0's binary_logloss: 0.36563
[1153]	valid_0's auc: 0.779514	valid_0's binary_logloss: 0.365631
[1154]	valid_0's auc: 0.779515	valid_0's binary_logloss: 0.365629
[1155]	valid_0's auc: 0.779514	valid_0's binary_logloss: 0.36563
[1156]	valid_0's auc: 0.779528	valid_0's binary_logloss: 0.365624
[1157]	valid_0's auc: 0.779528	valid_0's binary_logloss: 0.365623
[1158]	valid_0'

[1268]	valid_0's auc: 0.779915	valid_0's binary_logloss: 0.365403
[1269]	valid_0's auc: 0.779935	valid_0's binary_logloss: 0.365393
[1270]	valid_0's auc: 0.779947	valid_0's binary_logloss: 0.365387
[1271]	valid_0's auc: 0.77995	valid_0's binary_logloss: 0.365384
[1272]	valid_0's auc: 0.779949	valid_0's binary_logloss: 0.365384
[1273]	valid_0's auc: 0.779955	valid_0's binary_logloss: 0.365382
[1274]	valid_0's auc: 0.779961	valid_0's binary_logloss: 0.365378
[1275]	valid_0's auc: 0.779961	valid_0's binary_logloss: 0.365377
[1276]	valid_0's auc: 0.779961	valid_0's binary_logloss: 0.365377
[1277]	valid_0's auc: 0.779958	valid_0's binary_logloss: 0.365379
[1278]	valid_0's auc: 0.779963	valid_0's binary_logloss: 0.365377
[1279]	valid_0's auc: 0.779959	valid_0's binary_logloss: 0.365379
[1280]	valid_0's auc: 0.779972	valid_0's binary_logloss: 0.365374
[1281]	valid_0's auc: 0.779971	valid_0's binary_logloss: 0.365373
[1282]	valid_0's auc: 0.77998	valid_0's binary_logloss: 0.365368
[1283]	valid

[1393]	valid_0's auc: 0.780564	valid_0's binary_logloss: 0.365049
[1394]	valid_0's auc: 0.78058	valid_0's binary_logloss: 0.365042
[1395]	valid_0's auc: 0.780592	valid_0's binary_logloss: 0.365034
[1396]	valid_0's auc: 0.780598	valid_0's binary_logloss: 0.365032
[1397]	valid_0's auc: 0.780596	valid_0's binary_logloss: 0.365033
[1398]	valid_0's auc: 0.780598	valid_0's binary_logloss: 0.365032
[1399]	valid_0's auc: 0.780599	valid_0's binary_logloss: 0.365031
[1400]	valid_0's auc: 0.78062	valid_0's binary_logloss: 0.365022
[1401]	valid_0's auc: 0.780624	valid_0's binary_logloss: 0.36502
[1402]	valid_0's auc: 0.78063	valid_0's binary_logloss: 0.365015
[1403]	valid_0's auc: 0.780632	valid_0's binary_logloss: 0.365015
[1404]	valid_0's auc: 0.78064	valid_0's binary_logloss: 0.365009
[1405]	valid_0's auc: 0.780638	valid_0's binary_logloss: 0.365011
[1406]	valid_0's auc: 0.780639	valid_0's binary_logloss: 0.365011
[1407]	valid_0's auc: 0.780639	valid_0's binary_logloss: 0.36501
[1408]	valid_0's

[1518]	valid_0's auc: 0.781205	valid_0's binary_logloss: 0.364719
[1519]	valid_0's auc: 0.781212	valid_0's binary_logloss: 0.364716
[1520]	valid_0's auc: 0.781216	valid_0's binary_logloss: 0.364711
[1521]	valid_0's auc: 0.781228	valid_0's binary_logloss: 0.364707
[1522]	valid_0's auc: 0.781237	valid_0's binary_logloss: 0.364703
[1523]	valid_0's auc: 0.781247	valid_0's binary_logloss: 0.364697
[1524]	valid_0's auc: 0.781249	valid_0's binary_logloss: 0.364696
[1525]	valid_0's auc: 0.781249	valid_0's binary_logloss: 0.364696
[1526]	valid_0's auc: 0.781256	valid_0's binary_logloss: 0.364692
[1527]	valid_0's auc: 0.781256	valid_0's binary_logloss: 0.364692
[1528]	valid_0's auc: 0.781258	valid_0's binary_logloss: 0.364692
[1529]	valid_0's auc: 0.78126	valid_0's binary_logloss: 0.364692
[1530]	valid_0's auc: 0.781263	valid_0's binary_logloss: 0.364688
[1531]	valid_0's auc: 0.781265	valid_0's binary_logloss: 0.364688
[1532]	valid_0's auc: 0.781265	valid_0's binary_logloss: 0.364688
[1533]	vali

[1643]	valid_0's auc: 0.781714	valid_0's binary_logloss: 0.364439
[1644]	valid_0's auc: 0.781716	valid_0's binary_logloss: 0.364439
[1645]	valid_0's auc: 0.781715	valid_0's binary_logloss: 0.36444
[1646]	valid_0's auc: 0.781722	valid_0's binary_logloss: 0.364436
[1647]	valid_0's auc: 0.781724	valid_0's binary_logloss: 0.364437
[1648]	valid_0's auc: 0.781725	valid_0's binary_logloss: 0.364436
[1649]	valid_0's auc: 0.781728	valid_0's binary_logloss: 0.364434
[1650]	valid_0's auc: 0.78173	valid_0's binary_logloss: 0.364432
[1651]	valid_0's auc: 0.781743	valid_0's binary_logloss: 0.364425
[1652]	valid_0's auc: 0.781751	valid_0's binary_logloss: 0.364419
[1653]	valid_0's auc: 0.78175	valid_0's binary_logloss: 0.36442
[1654]	valid_0's auc: 0.781749	valid_0's binary_logloss: 0.364419
[1655]	valid_0's auc: 0.781747	valid_0's binary_logloss: 0.36442
[1656]	valid_0's auc: 0.781762	valid_0's binary_logloss: 0.364413
[1657]	valid_0's auc: 0.781776	valid_0's binary_logloss: 0.364408
[1658]	valid_0'

[1768]	valid_0's auc: 0.782238	valid_0's binary_logloss: 0.364151
[1769]	valid_0's auc: 0.782236	valid_0's binary_logloss: 0.364152
[1770]	valid_0's auc: 0.782237	valid_0's binary_logloss: 0.364153
[1771]	valid_0's auc: 0.782238	valid_0's binary_logloss: 0.364154
[1772]	valid_0's auc: 0.782246	valid_0's binary_logloss: 0.364149
[1773]	valid_0's auc: 0.782256	valid_0's binary_logloss: 0.364143
[1774]	valid_0's auc: 0.782258	valid_0's binary_logloss: 0.364141
[1775]	valid_0's auc: 0.782256	valid_0's binary_logloss: 0.364141
[1776]	valid_0's auc: 0.78227	valid_0's binary_logloss: 0.364132
[1777]	valid_0's auc: 0.78227	valid_0's binary_logloss: 0.364132
[1778]	valid_0's auc: 0.782284	valid_0's binary_logloss: 0.364125
[1779]	valid_0's auc: 0.782293	valid_0's binary_logloss: 0.364119
[1780]	valid_0's auc: 0.782295	valid_0's binary_logloss: 0.364118
[1781]	valid_0's auc: 0.782307	valid_0's binary_logloss: 0.364112
[1782]	valid_0's auc: 0.78231	valid_0's binary_logloss: 0.364109
[1783]	valid_

[1893]	valid_0's auc: 0.782628	valid_0's binary_logloss: 0.363931
[1894]	valid_0's auc: 0.78263	valid_0's binary_logloss: 0.36393
[1895]	valid_0's auc: 0.782636	valid_0's binary_logloss: 0.363929
[1896]	valid_0's auc: 0.782634	valid_0's binary_logloss: 0.363931
[1897]	valid_0's auc: 0.782633	valid_0's binary_logloss: 0.363931
[1898]	valid_0's auc: 0.782637	valid_0's binary_logloss: 0.363929
[1899]	valid_0's auc: 0.782641	valid_0's binary_logloss: 0.363928
[1900]	valid_0's auc: 0.782644	valid_0's binary_logloss: 0.363926
[1901]	valid_0's auc: 0.782639	valid_0's binary_logloss: 0.363929
[1902]	valid_0's auc: 0.782647	valid_0's binary_logloss: 0.363924
[1903]	valid_0's auc: 0.782653	valid_0's binary_logloss: 0.36392
[1904]	valid_0's auc: 0.782656	valid_0's binary_logloss: 0.363919
[1905]	valid_0's auc: 0.782655	valid_0's binary_logloss: 0.36392
[1906]	valid_0's auc: 0.78266	valid_0's binary_logloss: 0.363918
[1907]	valid_0's auc: 0.782664	valid_0's binary_logloss: 0.363915
[1908]	valid_0'