In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import gc
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool
import pickle
import os

In [2]:
# Directory that receives the submission files written at the end of the notebook.
# makedirs(exist_ok=True) is idempotent, so re-running this cell (or running two
# kernels at once) cannot fail between an "exists" check and the create call.
os.makedirs('./out/', exist_ok=True)
# Directory holding the pickled inputs read by the loading cell.
DATA_PATH = '../pkl/'

In [3]:
# Read the three pickles from DATA_PATH in a fixed order:
# the two invitation tables, then the frame stored in cbt_data.pkl.
print('load data...')
invite_info, invite_info_evaluate, data = (
    pd.read_pickle(os.path.join(DATA_PATH, pkl_name))
    for pkl_name in ('invite_info.pkl', 'invite_info_evaluate.pkl', 'cbt_data.pkl')
)
# Show which columns are available in the loaded frame.
print(data.columns.tolist())

load data...
['iday', 'ihour', 'itime', 'label', 'qid', 'uid', 'invite_answer_gap', 'gender', 'freq', 'A1', 'B1', 'C1', 'D1', 'E1', 'A2', 'B2', 'C2', 'D2', 'E2', 'score', 'num_topic_a', 'num_topic_i', 'most_topic_i', 'min_topic_iv', 'max_topic_iv', 'mean_topic_iv', 'std_topic_iv', 'num_title_sw', 'num_title_w', 'num_desc_sw', 'num_desc_w', 'num_qtopic', 'qhour', 'inv_que_gap', 'num_topic_a_com', 'num_topic_i_com', 'min_topic_iv_com', 'max_topic_iv_com', 'mean_topic_iv_com', 'std_topic_iv_com', 'user_cnt', 'question_cnt', 'question_curr_expo', 'question_history_expo', 'question_future_expo', 'user_curr_expo', 'user_history_expo', 'user_future_expo', 'prev_excellent_sum', 'prev_recommend_sum', 'prev_figure_sum', 'prev_video_sum', 'prev_num_word_sum', 'prev_num_like_sum', 'prev_num_unlike_sum', 'prev_num_comment_sum', 'prev_num_favor_sum', 'prev_num_thank_sum', 'prev_num_report_sum', 'prev_num_nohelp_sum', 'prev_num_oppose_sum', 'prev_cnt_sum', 'prev_excellent_mean', 'prev_recommend_mean'

In [4]:
# The first len(invite_info) rows of `data` become `train`, the remainder `test`.
# NOTE(review): assumes `data` was assembled with the training rows first — confirm
# against the step that produced cbt_data.pkl.
len_train = invite_info.shape[0]
train, test = data[:len_train], data[len_train:]

In [5]:
# `train` and `test` are the only pieces used from here on; drop the name `data`
# and run a garbage-collection pass (the cell displays gc.collect()'s return value).
del data
gc.collect()

40

In [6]:
# Columns that are never fed to the model (identifiers, the target, and a few
# others listed here). Names absent from `train` are simply ignored by the filter.
drop_feats = ['qid', 'uid', 'itime', 'label', 'iday', 'user_curr_expo', 'user_curr_expo_d']
excluded = set(drop_feats)
# Keep the remaining columns in their original order.
used_feats = [col for col in train.columns if col not in excluded]
print(len(used_feats))
print(used_feats)

138
['ihour', 'invite_answer_gap', 'gender', 'freq', 'A1', 'B1', 'C1', 'D1', 'E1', 'A2', 'B2', 'C2', 'D2', 'E2', 'score', 'num_topic_a', 'num_topic_i', 'most_topic_i', 'min_topic_iv', 'max_topic_iv', 'mean_topic_iv', 'std_topic_iv', 'num_title_sw', 'num_title_w', 'num_desc_sw', 'num_desc_w', 'num_qtopic', 'qhour', 'inv_que_gap', 'num_topic_a_com', 'num_topic_i_com', 'min_topic_iv_com', 'max_topic_iv_com', 'mean_topic_iv_com', 'std_topic_iv_com', 'user_cnt', 'question_cnt', 'question_curr_expo', 'question_history_expo', 'question_future_expo', 'user_history_expo', 'user_future_expo', 'prev_excellent_sum', 'prev_recommend_sum', 'prev_figure_sum', 'prev_video_sum', 'prev_num_word_sum', 'prev_num_like_sum', 'prev_num_unlike_sum', 'prev_num_comment_sum', 'prev_num_favor_sum', 'prev_num_thank_sum', 'prev_num_report_sum', 'prev_num_nohelp_sum', 'prev_num_oppose_sum', 'prev_cnt_sum', 'prev_excellent_mean', 'prev_recommend_mean', 'prev_figure_mean', 'prev_video_mean', 'prev_num_word_mean', 'pre

In [7]:
# Model inputs: the selected feature columns of each split, re-indexed from 0.
train_x, test_x = (
    split[used_feats].reset_index(drop=True) for split in (train, test)
)
# Target column of the training split, re-indexed the same way as train_x.
train_y = train['label'].reset_index(drop=True)

In [8]:
# Report (rows, columns) for the test matrix first, then the training matrix.
for matrix in (test_x, train_x):
    print(matrix.shape)

(1141683, 138)
(9489162, 138)


In [9]:
# train_x / train_y / test_x are used from here on; drop the names of the
# intermediate split frames and run a garbage-collection pass.
del train
del test
gc.collect()

20

In [10]:
# 5-fold stratified cross-validation with one CatBoost model per fold.
# For every finished fold this cell records:
#   * scores  - the model's best validation AUC,
#   * imp     - one 'score<k>' column of feature importances,
#   * preds   - the running SUM of test-set class probabilities
#               (divide by len(scores) downstream to obtain the average).
preds = np.zeros((test_x.shape[0], 2))
scores = []
imp = pd.DataFrame()
imp['feat'] = used_feats

# No column is declared categorical; defined once because it never changes per fold.
cate_features = []

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for index, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
    print('*' * 30)
    X_train, y_train, X_valid, y_valid = train_x.iloc[tr_idx], train_y.iloc[tr_idx], train_x.iloc[va_idx], train_y.iloc[va_idx]
    train_pool = Pool(X_train, y_train, cat_features=cate_features)
    eval_pool = Pool(X_valid, y_valid, cat_features=cate_features)

    # The pools are what fit() consumes; drop the names of the per-fold slices.
    del X_train, y_train, X_valid, y_valid
    gc.collect()

    cbt_model = CatBoostClassifier(iterations=10000,
                       learning_rate=0.1,
                       eval_metric='AUC',
                       use_best_model=True,
                       random_seed=42,
                       logging_level='Verbose',
                       task_type='GPU',
                       devices='0',
                       early_stopping_rounds=300,
                       loss_function='Logloss',
                       depth=12,
                       )
    cbt_model.fit(train_pool, eval_set=eval_pool, verbose=100)

    score = cbt_model.best_score_['validation']['AUC']
    # Run the (slow) test-set prediction BEFORE touching any accumulator: if the
    # cell is interrupted here, scores / imp / preds still describe the same set
    # of finished folds, so preds / len(scores) stays a correct average.
    fold_preds = cbt_model.predict_proba(test_x)

    imp['score%d' % (index+1)] = cbt_model.feature_importances_
    preds += fold_preds
    scores.append(score)
    print('fold %d round %d : score: %.6f | mean score %.6f' % (index+1, cbt_model.best_iteration_, score,np.mean(scores)))

    # Drop the per-fold model and pools before the next fold is built.
    del cbt_model, train_pool, eval_pool, fold_preds
    gc.collect()

******************************
0:	learn: 0.8084730	test: 0.8087192	best: 0.8087192 (0)	total: 316ms	remaining: 52m 38s
100:	learn: 0.8761715	test: 0.8738145	best: 0.8738145 (100)	total: 30s	remaining: 49m 2s
200:	learn: 0.8850540	test: 0.8800405	best: 0.8800405 (200)	total: 58.8s	remaining: 47m 48s
300:	learn: 0.8909656	test: 0.8833749	best: 0.8833749 (300)	total: 1m 28s	remaining: 47m 17s
400:	learn: 0.8953655	test: 0.8855072	best: 0.8855072 (400)	total: 1m 56s	remaining: 46m 33s
500:	learn: 0.8990941	test: 0.8867908	best: 0.8867911 (499)	total: 2m 26s	remaining: 46m 9s
600:	learn: 0.9023721	test: 0.8876308	best: 0.8876328 (598)	total: 2m 55s	remaining: 45m 40s
700:	learn: 0.9053734	test: 0.8883600	best: 0.8883600 (700)	total: 3m 24s	remaining: 45m 14s
800:	learn: 0.9082235	test: 0.8890114	best: 0.8890114 (800)	total: 3m 54s	remaining: 44m 52s
900:	learn: 0.9109918	test: 0.8894401	best: 0.8894423 (898)	total: 4m 24s	remaining: 44m 33s
1000:	learn: 0.9134594	test: 0.8897655	best: 0.889

KeyboardInterrupt: 

In [11]:
# Rank features by their importance in the first fold and display the top 50 rows.
imp_by_fold1 = imp.sort_values(by='score1', ascending=False)
imp_by_fold1.head(50)

Unnamed: 0,feat,score1,score2
37,question_curr_expo,7.831966,7.611003
14,score,7.792951,7.656012
28,inv_que_gap,4.679114,4.680837
118,ques_user_title_desc_textattbirnn_enc,2.900159,3.188296
0,ihour,2.854827,2.883619
36,question_cnt,2.803115,2.795075
41,user_future_expo,2.689129,2.507119
75,qtime_mean,2.365863,2.399506
35,user_cnt,2.269791,2.543107
89,unnext_itime_gap,2.175468,2.25917


In [None]:
# 

In [None]:
# Preview the first five rows of the evaluation invitation table.
invite_info_evaluate.head(5)

In [None]:
# Build the submission frame: the raw evaluation invitations (tab-separated, no
# header row) plus one 'result' column with the fold-averaged class-1 probability.
PATH = '../data/data_set_0926'
invite_info_evaluate = pd.read_csv(os.path.join(PATH, 'invite_info_evaluate_1_0926.txt'),
                          names=['question_id', 'author_id', 'invite_time'], sep='\t')
# `preds` is a sum over finished folds. With zero finished folds the division
# below would quietly produce NaN (warnings are suppressed in this notebook),
# so stop with an explicit error instead of writing an unusable submission.
if not scores:
    raise RuntimeError('no completed CV folds: nothing to average into a submission')
# assign() returns a new frame, so invite_info_evaluate is not modified through
# an alias; the new column is appended last, as before.
result = invite_info_evaluate.assign(result=preds[:, 1] / len(scores))
result.head()

In [None]:
# Write the submission as a headerless, index-free TSV whose file name carries
# the current local month, day, hour and minute (two digits each).
now = time.localtime()
stamp_fields = (now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min)
save_path = './out/result_%02d%02d%02d%02d.txt' % stamp_fields
result.to_csv(save_path, sep='\t', index=False, header=False)
print('%s saved.' % save_path)

In [None]:
# 0.874397  0.849049237632428
# 0.880489  0.844101158553199 -
# 0.878943  0.851078697864456
# 0.880510  0.854421755061172
# 0.885294  0.858160387338417
# 0.885622  0.857313282888585 -
# 0.886196  0.856...          -
# 0.887172  0.863096415680472
# 0.888104  0.862779338260129
# 0.888148  0.862893038464606
# 0.888559  0.863349443045746
# 0.888572  0.864079617350822
# 0.888655  0.863959534391522
# 0.888649  0.863831492125684
# 0.889573  0.867511284503794
# 0.890168  0.868963718646371 
# 0.890949  0.870415259776506
# 0.891941 0.870415067307698 -
# 0.891003 0.870312768496781 -
# 0.889943 0.869245658566105 -
# 0.891043 0.870431557725156
# 0.893536
# 0.891412

In [None]:
# 0.890058 13 0.870409639912773

In [None]:
# !pip list