In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import gc
import time
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss
import pickle
import os
from tqdm import tqdm

In [2]:
import os
import tensorflow as tf

# Expose only GPU 0 to this process; set before the TensorFlow session is created.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Session config with GPU allow_growth enabled, built in a single expression.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
session = tf.Session(config=config)

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Embedding, Input, BatchNormalization, Dropout, concatenate, PReLU, Flatten, Concatenate
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Reshape, Lambda, concatenate, dot, add, RepeatVector
from keras.layers import Dropout, GaussianDropout, multiply, SpatialDropout1D, BatchNormalization, subtract
from keras.utils import Sequence
from keras.callbacks import Callback
from keras import backend as K

Using TensorFlow backend.


In [3]:
# Locations read from / written to by the rest of the notebook.
DATA_PATH = '../pkl/'        # pickled feature tables (input)
WEIGHT_PATH = './weights/'   # per-fold model checkpoints (output)

# Create every directory that later cells write into. The submission file goes
# to './out/' and the checkpoint callback saves into WEIGHT_PATH; without the
# latter the first save_weights() call fails on a fresh checkout.
for _out_dir in ('./out/', WEIGHT_PATH):
    os.makedirs(_out_dir, exist_ok=True)

In [4]:
print('load data...')
# Read the three pickled tables from DATA_PATH, in the same order as before.
invite_info, invite_info_evaluate, data = (
    pd.read_pickle(os.path.join(DATA_PATH, pkl_name))
    for pkl_name in ('invite_info.pkl', 'invite_info_evaluate.pkl', 'cbt_data.pkl')
)
print(data.columns.tolist())

load data...
['iday', 'ihour', 'itime', 'label', 'qid', 'uid', 'invite_answer_gap', 'gender', 'freq', 'A1', 'B1', 'C1', 'D1', 'E1', 'A2', 'B2', 'C2', 'D2', 'E2', 'score', 'num_topic_a', 'num_topic_i', 'most_topic_i', 'min_topic_iv', 'max_topic_iv', 'mean_topic_iv', 'std_topic_iv', 'num_title_sw', 'num_title_w', 'num_desc_sw', 'num_desc_w', 'num_qtopic', 'qhour', 'inv_que_gap', 'num_topic_a_com', 'num_topic_i_com', 'min_topic_iv_com', 'max_topic_iv_com', 'mean_topic_iv_com', 'std_topic_iv_com', 'user_cnt', 'question_cnt', 'question_curr_expo', 'question_history_expo', 'question_future_expo', 'user_curr_expo', 'user_history_expo', 'user_future_expo', 'prev_excellent_sum', 'prev_recommend_sum', 'prev_figure_sum', 'prev_video_sum', 'prev_num_word_sum', 'prev_num_like_sum', 'prev_num_unlike_sum', 'prev_num_comment_sum', 'prev_num_favor_sum', 'prev_num_thank_sum', 'prev_num_report_sum', 'prev_num_nohelp_sum', 'prev_num_oppose_sum', 'prev_cnt_sum', 'prev_excellent_mean', 'prev_recommend_mean'

In [5]:
# Columns that must not be fed to the model (ids, raw time, target, ...).
drop_feats = ['qid', 'uid', 'itime', 'label', 'iday', 'user_curr_expo', 'user_curr_expo_d']
# Everything else, in the frame's column order, becomes a model feature.
used_feats = list(filter(lambda col: col not in drop_feats, data.columns))
print(len(used_feats), used_feats, sep='\n')

120
['ihour', 'invite_answer_gap', 'gender', 'freq', 'A1', 'B1', 'C1', 'D1', 'E1', 'A2', 'B2', 'C2', 'D2', 'E2', 'score', 'num_topic_a', 'num_topic_i', 'most_topic_i', 'min_topic_iv', 'max_topic_iv', 'mean_topic_iv', 'std_topic_iv', 'num_title_sw', 'num_title_w', 'num_desc_sw', 'num_desc_w', 'num_qtopic', 'qhour', 'inv_que_gap', 'num_topic_a_com', 'num_topic_i_com', 'min_topic_iv_com', 'max_topic_iv_com', 'mean_topic_iv_com', 'std_topic_iv_com', 'user_cnt', 'question_cnt', 'question_curr_expo', 'question_history_expo', 'question_future_expo', 'user_history_expo', 'user_future_expo', 'prev_excellent_sum', 'prev_recommend_sum', 'prev_figure_sum', 'prev_video_sum', 'prev_num_word_sum', 'prev_num_like_sum', 'prev_num_unlike_sum', 'prev_num_comment_sum', 'prev_num_favor_sum', 'prev_num_thank_sum', 'prev_num_report_sum', 'prev_num_nohelp_sum', 'prev_num_oppose_sum', 'prev_cnt_sum', 'prev_excellent_mean', 'prev_recommend_mean', 'prev_figure_mean', 'prev_video_mean', 'prev_num_word_mean', 'pre

In [6]:
# Features routed through embedding layers; every other used feature is numeric.
cat_feat = ['gender', 'freq', 'A1', 'B1', 'C1', 'D1', 'E1', 'A2', 'B2', 'C2', 'D2', 'E2', 'most_topic_i']
num_feat = list(filter(lambda name: name not in cat_feat, used_feats))
print('cat_feat: %d, num_feat: %d' % (len(cat_feat), len(num_feat)))

cat_feat: 13, num_feat: 107


In [7]:
# Replace each categorical column by integer codes; missing values are first
# filled with the string '-1' so they receive a code of their own.
for col in tqdm(cat_feat):
    filled = data[col].fillna('-1')
    data[col] = LabelEncoder().fit_transform(filled)

100%|██████████| 13/13 [00:10<00:00,  1.23it/s]


In [8]:
# Min-max scale every numeric column.
#   finite values -> [0.1, 1.1]   (scaled to [0, 1], then shifted by 0.1)
#   NaN           -> 0.0          (-0.1 + 0.1)
#   +/-inf        -> 1.2          ( 1.1 + 0.1)
# NaN/inf entries are zeroed before min/max are taken, so 0 takes part in the
# range whenever a column contains such entries.
# NOTE: this cell is not idempotent; re-running it rescales already-scaled data.
for f in tqdm(num_feat):
    # Work on a private copy so the frame's buffer is never written in place
    # (and so a read-only `.values` array cannot break the assignments below).
    arr = data[f].values.copy()
    na_idx = np.isnan(arr)
    inf_idx = np.isinf(arr)
    arr[na_idx] = 0
    arr[inf_idx] = 0
    max_, min_ = arr.max(), arr.min()
    span = max_ - min_
    # A constant column has span == 0; dividing by it would turn the whole
    # column into NaN (silently, since warnings are suppressed). Divide by 1
    # instead so such a column scales to 0 and ends up at 0.1.
    arr = (arr - min_) / (span if span != 0 else 1)
    arr[na_idx] = -0.1
    arr[inf_idx] = 1.1
    arr += 0.1
    data[f] = arr

100%|██████████| 107/107 [00:20<00:00,  5.13it/s]


In [9]:
# The first len(invite_info) rows of `data` are the training rows; the rest are test rows.
len_train = len(invite_info)
train, test = data.iloc[:len_train], data.iloc[len_train:]

In [10]:
# (feature name, number of distinct codes) for every categorical feature;
# the count is used as the input dimension of that feature's embedding.
single_embed_cnt = list(zip(cat_feat, map(lambda name: data[name].nunique(), cat_feat)))
single_embed_cnt

[('gender', 3),
 ('freq', 5),
 ('A1', 2),
 ('B1', 2),
 ('C1', 2),
 ('D1', 2),
 ('E1', 2),
 ('A2', 2312),
 ('B2', 255),
 ('C2', 400),
 ('D2', 1356),
 ('E2', 2),
 ('most_topic_i', 18406)]

In [11]:
# Drop the name of the combined frame and run a garbage-collection pass.
# NOTE(review): `train`/`test` were sliced from `data`; whether memory is
# actually released here depends on whether those slices are views - confirm.
del data
gc.collect()

5

In [12]:
# Model inputs / target with a fresh 0..n-1 index, so positional and
# label-based row selection coincide in the cells below.
train_x = train.loc[:, used_feats].reset_index(drop=True)
train_y = train.loc[:, 'label'].reset_index(drop=True)
test_x = test.loc[:, used_feats].reset_index(drop=True)

In [13]:
# Sanity check: (rows, features) of the test matrix, then of the train matrix.
for frame in (test_x, train_x):
    print(frame.shape)

(1141683, 120)
(9489162, 120)


In [14]:
# train_x / train_y / test_x were built from these frames above; drop the
# intermediate names so they are no longer referenced from the notebook namespace.
del train, test

In [15]:
# def DNN(single_embed_cnt, inp_dense_dim, embed_size=32):
    
#     # single embedding
#     inp_single_embed = []
#     out_single_embed = []
#     for feat_name, inp_embed_dim in single_embed_cnt:
#         inp = Input(shape=(1,))
#         inp_single_embed.append(inp)
#         x = Embedding(inp_embed_dim, embed_size)(inp)
#         x = Flatten()(x)
#         out_single_embed.append(x)
    
#     # dense
#     inp_dense = Input(shape=(inp_dense_dim,))
#     x = Dense(256)(inp_dense)
#     x = BatchNormalization()(x)
#     x = PReLU()(x)
#     out_dense = Dropout(0.2)(x)
    
    
#     # concat
#     conc = concatenate(out_single_embed + [out_dense])
#     conc = Dense(256)(conc)
#     conc = BatchNormalization()(conc)
#     conc = PReLU()(conc)
#     conc = Dropout(0.2)(conc)
#     conc = Dense(256)(conc)
#     conc = BatchNormalization()(conc)
#     conc = PReLU()(conc)
#     conc = Dropout(0.2)(conc)
#     conc = Dense(256)(conc)
#     conc = BatchNormalization()(conc)
#     conc = PReLU()(conc)
#     conc = Dropout(0.2)(conc)
#     out = Dense(1, activation="sigmoid")(conc)
#     model = Model(inputs=inp_single_embed+[inp_dense], outputs=out)
    
#     return model

In [16]:
def DNN(single_embed_cnt, inp_dense_dim, embed_size=32):
    """Build the two-branch (linear + deep) binary classifier.

    Args:
        single_embed_cnt: iterable of ``(feature_name, cardinality)`` pairs, one
            per categorical input. Only the cardinality is used; it is the
            input dimension of that feature's embedding layers.
        inp_dense_dim: width of the single dense (numeric) input vector.
        embed_size: output size of the per-feature embeddings fed to the deep
            branch.

    Returns:
        An uncompiled ``Model`` whose inputs are one ``(1,)`` tensor per
        categorical feature (in the given order) followed by the dense input,
        and whose output is a single sigmoid unit.
    """

    def _hidden_block(tensor):
        # Dense(256) -> BatchNorm -> PReLU -> Dropout(0.2)
        tensor = Dense(256)(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = PReLU()(tensor)
        return Dropout(0.2)(tensor)

    cat_inputs = []     # one Input per categorical feature
    deep_embeds = []    # flattened embed_size-wide embeddings (deep branch)
    linear_terms = []   # one scalar per input (linear branch)

    for _, cardinality in single_embed_cnt:
        cat_in = Input(shape=(1,))
        cat_inputs.append(cat_in)

        # deep branch: embed_size-dimensional embedding of the category id
        emb = Embedding(cardinality, embed_size)(cat_in)
        emb = Flatten()(emb)
        deep_embeds.append(emb)

        # linear branch: 1-dimensional embedding followed by a Dense(1)
        lin = Embedding(cardinality, 1)(cat_in)
        lin = Flatten()(lin)
        linear_terms.append(Dense(1)(lin))

    # numeric features: one linear term plus a hidden representation
    dense_in = Input(shape=(inp_dense_dim,))
    linear_terms.append(Dense(1)(dense_in))
    dense_hidden = _hidden_block(dense_in)

    # deep branch: all embeddings + dense representation through two hidden blocks
    deep = concatenate(deep_embeds + [dense_hidden])
    deep = _hidden_block(deep)
    deep = _hidden_block(deep)
    deep = Dense(1)(deep)

    # merge the linear terms with the deep score and map to a probability
    merged = concatenate(linear_terms + [deep])
    merged = BatchNormalization()(merged)
    merged = PReLU()(merged)
    merged = Dropout(0.2)(merged)
    prob = Dense(1, activation="sigmoid")(merged)

    return Model(inputs=cat_inputs + [dense_in], outputs=prob)

In [17]:
class DataSequence(Sequence):
    """Keras ``Sequence`` that serves mini-batches out of a pandas DataFrame.

    Each batch is ``(batch_x, batch_y)`` where ``batch_x`` is a list holding
    one 1-D array per categorical feature (in ``single_embed_feat`` order)
    followed by one 2-D array with all ``dense_feat`` columns, matching the
    input order of the model built by ``DNN``.

    Args:
        x: DataFrame containing every column named in ``single_embed_feat``
            and ``dense_feat``.
        y: labels aligned row-by-row (by position) with ``x``; a pandas Series
            or anything that supports integer slicing.
        single_embed_feat: names of the categorical columns.
        dense_feat: names of the numeric columns.
        batch_size: number of rows per batch (the last batch may be shorter).
    """

    def __init__(self, x, y, single_embed_feat, dense_feat, batch_size=128):
        self.x = x
        self.y = y
        self.single_embed_feat = single_embed_feat
        self.dense_feat = dense_feat
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches per epoch (ceil of rows / batch_size)."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch number ``idx`` as ``(list_of_input_arrays, labels)``."""
        start = idx * self.batch_size
        stop = start + self.batch_size

        batch = self.x.iloc[start:stop]
        batch_x = [batch[f].values for f in self.single_embed_feat] + [batch[self.dense_feat].values]

        # Slice the labels by position, exactly like the features above.
        # Fold subsets keep their original (non-contiguous) index labels, so be
        # explicit with .iloc and hand Keras a plain array, not an indexed Series.
        if hasattr(self.y, 'iloc'):
            batch_y = self.y.iloc[start:stop].values
        else:
            batch_y = self.y[start:stop]
        return batch_x, batch_y

In [18]:
class MetricsCallback(Callback):
    """Report train/validation ROC-AUC after each epoch and checkpoint the best model.

    At the end of every epoch the model predicts on the full training and
    validation sets, both AUCs are printed, and the weights are saved to
    ``os.path.join(WEIGHT_PATH, save_name)`` whenever the validation AUC beats
    the best value seen so far (which starts at 0.5).

    Args:
        x_trn, y_trn: training features (DataFrame) and labels.
        x_val, y_val: validation features (DataFrame) and labels.
        cat_feat: names of the categorical columns.
        num_feat: names of the numeric columns.
        batch_size: batch size used for the evaluation passes. It is
            independent of the training batch size.
        save_name: file name of the checkpoint inside ``WEIGHT_PATH``.
    """

    def __init__(self, x_trn, y_trn, x_val, y_val, cat_feat, num_feat, batch_size=50000, save_name='weight.h5'):
        super().__init__()
        # Use the batch size handed to this callback; it was previously ignored
        # in favour of the notebook-level BATCH_SIZE global.
        self.trn_generator = DataSequence(x_trn, y_trn, cat_feat, num_feat, batch_size)
        self.val_generator = DataSequence(x_val, y_val, cat_feat, num_feat, batch_size)
        self.y_trn = y_trn
        self.y_val = y_val
        self.save_name = save_name
        # No checkpoint is written until the validation AUC exceeds this value.
        self.best_score = 0.5

    def _auc(self, generator, y_true):
        """ROC-AUC of the current model's predictions over ``generator``."""
        y_pred = self.model.predict_generator(generator,
                                              max_queue_size=10,
                                              workers=1,
                                              use_multiprocessing=False,
                                              verbose=0)
        return roc_auc_score(y_true, y_pred)

    def on_epoch_end(self, epoch, logs=None):
        roc = self._auc(self.trn_generator, self.y_trn)
        roc_val = self._auc(self.val_generator, self.y_val)
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')

        if roc_val > self.best_score:
            self.best_score = roc_val
            self.model.save_weights(os.path.join(WEIGHT_PATH, self.save_name))

        return

In [None]:
%%time
BATCH_SIZE = 1024
N_SPLITS = 5   # number of CV folds; also the divisor when averaging test predictions
EPOCHS = 20

# Running average of the per-fold test predictions.
test_pred = np.zeros((len(test_x), 1))

kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# The test batches do not depend on the fold, so build the generator once.
# The zero vector is a placeholder for the labels the Sequence expects.
test_generator = DataSequence(test_x, np.zeros(len(test_x)), cat_feat, num_feat, batch_size=BATCH_SIZE)

for i, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
    print('-'*100)
    print('Fold %d' % i)
    # kfold.split yields positions, so select rows positionally on both x and y.
    trn_x, trn_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
    val_x, val_y = train_x.iloc[va_idx], train_y.iloc[va_idx]
    weight_name = 'main_dnn_f%d.h5' % i

    # Start every fold from a fresh graph and freshly initialised weights.
    K.clear_session()
    model = DNN(single_embed_cnt, len(num_feat), embed_size=32)
    model.compile(loss='binary_crossentropy', optimizer='adam',)
    trn_generator = DataSequence(trn_x, trn_y, cat_feat, num_feat, batch_size=BATCH_SIZE)
    history = model.fit_generator(generator=trn_generator,
                        epochs=EPOCHS,
                        verbose=1,
                        callbacks=[MetricsCallback(trn_x, trn_y, val_x, val_y, cat_feat, num_feat,
                                                   batch_size=BATCH_SIZE*4,
                                                   save_name=weight_name)],
                        max_queue_size=10,
                        workers=1,
                        use_multiprocessing=False)

    # Restore the checkpoint the callback saved for this fold before predicting.
    model.load_weights(os.path.join(WEIGHT_PATH, weight_name))

    test_pred += model.predict_generator(test_generator) / N_SPLITS

    del trn_x, trn_y, val_x, val_y
    gc.collect()

----------------------------------------------------------------------------------------------------
Fold 0
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
roc-auc: 0.8527 - roc-auc_val: 0.8506                                                                                                    
Epoch 2/20
roc-auc: 0.8622 - roc-auc_val: 0.8597                                                                                                    
Epoch 3/20
roc-auc: 0.8686 - roc-auc_val: 0.8633                                                                                                    
Epoch 4/20
roc-auc: 0.8606 - roc-auc_val: 0.8541                                                                                                    
Epoch 5/20
roc-auc: 0.8737 - roc-auc_val: 0.8644                                                                                                    
Epoch 6/20
roc-auc: 0.8766 

In [None]:
# del trn_x, trn_y, val_x, val_y
# gc.collect()

In [None]:
# Re-read the raw evaluation invitations so the submission carries the original
# id / time columns (this replaces the pickled frame loaded at the top).
PATH = '../data/data_set_0926'
invite_info_evaluate = pd.read_csv(os.path.join(PATH, 'invite_info_evaluate_1_0926.txt'),
                          names=['question_id', 'author_id', 'invite_time'], sep='\t')

# Predictions are attached by position, so the row counts must agree.
assert len(invite_info_evaluate) == len(test_pred), \
    'row mismatch: %d evaluation rows vs %d predictions' % (len(invite_info_evaluate), len(test_pred))

# test_pred has shape (n, 1); flatten it to 1-D before using it as one column.
# assign() returns a new frame, so the raw evaluation frame is left unmodified.
result = invite_info_evaluate.assign(result=np.ravel(test_pred))
result.head()

In [None]:
# Stamp the submission file name with month, day, hour and minute of the local time.
now = time.localtime()
month, day, hour, minute = now.tm_mon, now.tm_mday, now.tm_hour, now.tm_min
save_path = './out/result_dnn_%02d%02d%02d%02d.txt' % (month, day, hour, minute)
# Tab-separated, without index or header row.
result.to_csv(save_path, sep='\t', index=False, header=False)
print('%s saved.' % save_path)

In [None]:
# 0.874397  0.849049237632428
# 0.880489  0.844101158553199 -
# 0.878943  0.851078697864456
# 0.880510  0.854421755061172
# 0.885294  0.858160387338417
# 0.885622  0.857313282888585 -
# 0.886196  0.856...          -
# 0.887172  0.863096415680472
# 0.888104  0.862779338260129
# 0.888148  0.862893038464606
# 0.888559  0.863349443045746
# 0.888572  0.864079617350822
# 0.888655  0.863959534391522
# 0.888649  0.863831492125684
# 0.889573  0.867511284503794
# 0.890168  0.868963718646371 