In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import gc
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import pickle
import os

In [2]:
# !pip install xgboost

In [3]:
# Output directory for the submission file written near the end of the notebook.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a no-op when
# the directory is already there and cannot fail between the check and the create.
os.makedirs('./out/', exist_ok=True)
# Directory holding the pickled inputs read by the loading cell.
DATA_PATH = '../pkl/'

In [4]:
# Read the three pickled tables from DATA_PATH (in this order) and show the
# column names of the combined feature table.
print('load data...')
invite_info, invite_info_evaluate, data = (
    pd.read_pickle(os.path.join(DATA_PATH, pkl_name))
    for pkl_name in ('invite_info.pkl', 'invite_info_evaluate.pkl', 'cbt_data.pkl')
)
print(list(data.columns))

load data...
['iday', 'ihour', 'itime', 'label', 'qid', 'uid', 'invite_answer_gap', 'gender', 'freq', 'A1', 'B1', 'C1', 'D1', 'E1', 'A2', 'B2', 'C2', 'D2', 'E2', 'score', 'num_topic_a', 'num_topic_i', 'most_topic_i', 'min_topic_iv', 'max_topic_iv', 'mean_topic_iv', 'std_topic_iv', 'num_title_sw', 'num_title_w', 'num_desc_sw', 'num_desc_w', 'num_qtopic', 'qhour', 'inv_que_gap', 'num_topic_a_com', 'num_topic_i_com', 'min_topic_iv_com', 'max_topic_iv_com', 'mean_topic_iv_com', 'std_topic_iv_com', 'user_cnt', 'question_cnt', 'question_curr_expo', 'question_history_expo', 'question_future_expo', 'user_curr_expo', 'user_history_expo', 'user_future_expo', 'prev_excellent_sum', 'prev_recommend_sum', 'prev_figure_sum', 'prev_video_sum', 'prev_num_word_sum', 'prev_num_like_sum', 'prev_num_unlike_sum', 'prev_num_comment_sum', 'prev_num_favor_sum', 'prev_num_thank_sum', 'prev_num_report_sum', 'prev_num_nohelp_sum', 'prev_num_oppose_sum', 'prev_cnt_sum', 'prev_excellent_mean', 'prev_recommend_mean'

In [5]:
# The first len(invite_info) rows of `data` become the training part and
# everything after them becomes the test part.
len_train = invite_info.shape[0]
train, test = data[:len_train], data[len_train:]

In [6]:
# `data` has been split into `train` / `test` above, so drop the name and
# trigger a garbage-collection pass.
del data
gc.collect()

40

In [7]:
# Columns excluded from the model inputs; every other column of `train`
# is used as a feature, in the frame's own column order.
drop_feats = ['qid', 'uid', 'itime', 'label', 'iday', 'user_curr_expo', 'user_curr_expo_d']
used_feats = list(filter(lambda col: col not in drop_feats, train.columns))
print(len(used_feats), used_feats, sep='\n')

119
['ihour', 'invite_answer_gap', 'gender', 'freq', 'A1', 'B1', 'C1', 'D1', 'E1', 'A2', 'B2', 'C2', 'D2', 'E2', 'score', 'num_topic_a', 'num_topic_i', 'most_topic_i', 'min_topic_iv', 'max_topic_iv', 'mean_topic_iv', 'std_topic_iv', 'num_title_sw', 'num_title_w', 'num_desc_sw', 'num_desc_w', 'num_qtopic', 'qhour', 'inv_que_gap', 'num_topic_a_com', 'num_topic_i_com', 'min_topic_iv_com', 'max_topic_iv_com', 'mean_topic_iv_com', 'std_topic_iv_com', 'user_cnt', 'question_cnt', 'question_curr_expo', 'question_history_expo', 'question_future_expo', 'user_history_expo', 'user_future_expo', 'prev_excellent_sum', 'prev_recommend_sum', 'prev_figure_sum', 'prev_video_sum', 'prev_num_word_sum', 'prev_num_like_sum', 'prev_num_unlike_sum', 'prev_num_comment_sum', 'prev_num_favor_sum', 'prev_num_thank_sum', 'prev_num_report_sum', 'prev_num_nohelp_sum', 'prev_num_oppose_sum', 'prev_cnt_sum', 'prev_excellent_mean', 'prev_recommend_mean', 'prev_figure_mean', 'prev_video_mean', 'prev_num_word_mean', 'pre

In [8]:
# Feature matrices for both parts and the training target, each re-indexed
# with a fresh default integer index.
train_x, test_x = (part[used_feats].reset_index(drop=True) for part in (train, test))
train_y = train['label'].reset_index(drop=True)

In [9]:
# Sanity check: (rows, columns) of the test matrix, then of the training matrix.
print(test_x.shape, train_x.shape, sep='\n')

(1141683, 119)
(9489162, 119)


In [10]:
# train_x / train_y / test_x were built from these frames above; drop the
# intermediate names and trigger a garbage-collection pass.
del train, test
gc.collect()

40

In [11]:
%%time
preds = np.zeros((test_x.shape[0], 2))
scores = []
imp = pd.DataFrame()
imp['feat'] = used_feats

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for index, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
    print('*' * 30)
    X_train, y_train, X_valid, y_valid = train_x.iloc[tr_idx], train_y.iloc[tr_idx], train_x.iloc[va_idx], train_y.iloc[va_idx]

    xgb_model = xgb.XGBClassifier(
                                n_estimators=10000,
                                max_depth=12,
                                learning_rate=0.1,
                                reg_lambda=10,
                                subsample=0.9,
                                colsample_bytree=0.9,
                                missing=np.nan,
                                random_state=42,
                                tree_method='gpu_hist'  
                            )
    
#     xgb_model = xgb.XGBClassifier(
#                                 n_estimators=10000,
#                                 max_depth=12,
#                                 learning_rate=0.1,
#                                 reg_lambda=3.0,
#                                 subsample=0.9,
#                                 colsample_bytree=0.9,
#                                 missing=np.nan,
#                                 random_state=42,
#                                 tree_method='gpu_hist'  
#                             )


    xgb_model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], eval_metric='auc', early_stopping_rounds=300, verbose=100)
    
    imp['score%d' % (index+1)] = xgb_model.feature_importances_
    
    score = xgb_model.best_score
    scores.append(score)
    ##
    print('fold %d round %d : score: %.6f | mean score %.6f' % (index+1, xgb_model.best_iteration, score,np.mean(scores))) 
    preds += xgb_model.predict_proba(test_x) 
    
    del xgb_model
    del X_train, y_train, X_valid, y_valid
    gc.collect()  

******************************
[0]	validation_0-auc:0.833111
Will train until validation_0-auc hasn't improved in 300 rounds.
[100]	validation_0-auc:0.883636
[200]	validation_0-auc:0.889221
[300]	validation_0-auc:0.891031
[400]	validation_0-auc:0.891959
[500]	validation_0-auc:0.892295
[600]	validation_0-auc:0.89251
[700]	validation_0-auc:0.89268
[800]	validation_0-auc:0.892726
[900]	validation_0-auc:0.892725
[1000]	validation_0-auc:0.892709
[1100]	validation_0-auc:0.892668
Stopping. Best iteration:
[858]	validation_0-auc:0.892756

fold 1 round 858 : score: 0.892756 | mean score 0.892756
******************************
[0]	validation_0-auc:0.833722
Will train until validation_0-auc hasn't improved in 300 rounds.
[100]	validation_0-auc:0.88356
[200]	validation_0-auc:0.889005
[300]	validation_0-auc:0.890916
[400]	validation_0-auc:0.891784
[500]	validation_0-auc:0.892289
[600]	validation_0-auc:0.892521
[700]	validation_0-auc:0.892655
[800]	validation_0-auc:0.892743
[900]	validation_0-auc:0.

In [12]:
# del xgb_model
# del X_train, y_train, X_valid, y_valid

In [13]:
# Rank features by their fold-1 importance (highest first) and show the top 50 rows.
imp_ranked = imp.sort_values(by='score1', ascending=False)
imp_ranked.head(50)

Unnamed: 0,feat,score1,score2,score3,score4,score5
37,question_curr_expo,0.058739,0.057997,0.056954,0.058028,0.057382
115,user_ques_topic_dnn_enc_C,0.045099,0.044442,0.042044,0.045013,0.046981
14,score,0.034182,0.033641,0.03308,0.033289,0.032685
94,user_history_accept_rate,0.028974,0.028199,0.028166,0.028287,0.027535
118,ques_user_title_desc_textattbirnn_enc,0.027679,0.027362,0.027151,0.02685,0.026355
16,num_topic_i,0.02534,0.025082,0.026021,0.023552,0.022901
28,inv_que_gap,0.022871,0.022498,0.022345,0.022595,0.022457
93,user_history_accept,0.021146,0.020189,0.020182,0.019838,0.0196
41,user_future_expo,0.020136,0.019951,0.019628,0.020175,0.019487
36,question_cnt,0.017171,0.016542,0.016605,0.016819,0.016807


In [14]:
# 

In [15]:
# Read the raw evaluation invitations (tab-separated, no header row) and attach
# the averaged positive-class probability as a new `result` column.
PATH = '../data/data_set_0926'
invite_info_evaluate = pd.read_csv(os.path.join(PATH, 'invite_info_evaluate_1_0926.txt'),
                          names=['question_id', 'author_id', 'invite_time'], sep='\t')

# Fail early with a clear message if the file does not have one row per prediction.
if len(invite_info_evaluate) != preds.shape[0]:
    raise ValueError('evaluation file has %d rows but preds has %d'
                     % (len(invite_info_evaluate), preds.shape[0]))

# `preds` holds per-fold sums of predict_proba; dividing by the number of
# completed folds (len(scores)) gives the mean probability of class 1.
# assign() returns a new frame, so `invite_info_evaluate` keeps its raw columns
# instead of being mutated through an alias.
result = invite_info_evaluate.assign(result=preds[:, 1] / len(scores))
result.head()

Unnamed: 0,question_id,author_id,invite_time,result
0,Q1493039281,M64135255,D3870-H9,0.009926
1,Q2023398782,M2536956560,D3872-H22,0.00033
2,Q4151338694,M3294926344,D3874-H15,0.013453
3,Q3271436624,M3744310794,D3873-H4,0.233823
4,Q3314287018,M1349051752,D3872-H19,0.01292


In [16]:
# Write the submission as a headerless, index-free TSV whose file name carries
# the local month, day, hour and minute.
now = time.localtime()
save_path = './out/result_xgb_%02d%02d%02d%02d.txt' % (now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min)
result.to_csv(save_path, sep='\t', index=False, header=False)
print('%s saved.' % save_path)

./out/result_xgb_12101735.txt saved.


In [17]:
# 0.874397  0.849049237632428
# 0.880489  0.844101158553199 -
# 0.878943  0.851078697864456
# 0.880510  0.854421755061172
# 0.885294  0.858160387338417
# 0.885622  0.857313282888585 -
# 0.886196  0.856...          -
# 0.887172  0.863096415680472
# 0.888104  0.862779338260129
# 0.888148  0.862893038464606
# 0.888559  0.863349443045746
# 0.888572  0.864079617350822
# 0.888655  0.863959534391522
# 0.888649  0.863831492125684
# 0.889573  0.867511284503794
# 0.890168  0.868963718646371 

In [18]:
# 0.892180
# 0.892914

In [21]:
# Record the installed package version(s) whose name matches "xgb".
!pip list | grep xgb

xgboost                            0.90               


In [22]:
# Record the installed package version(s) whose name matches "tqdm".
!pip list | grep tqdm

tqdm                               4.40.0             
