In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import copy
import multiprocessing as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, log_loss
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
warnings.filterwarnings('ignore')
tqdm.pandas()

In [2]:
import os
import tensorflow as tf

os.environ['CUDA_VISIBLE_DEVICES'] = "0"
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

from keras.models import Sequential, Model
from keras.layers import Input, Dense, CuDNNLSTM, Bidirectional, Embedding, CuDNNGRU, Conv1D, MaxPooling1D
from keras.layers import Flatten, PReLU, Dropout, BatchNormalization, SpatialDropout1D, concatenate
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing.sequence import pad_sequences
from keras.utils import Sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.engine.topology import Layer 
# from keras.utils.training_utils import multi_gpu_model

Using TensorFlow backend.


In [3]:
tic = time.time()
DATA_PATH = '../pkl/'
SAVE_PATH = './feats/'
WEIGHT_PATH = './weights/'
if not os.path.exists(SAVE_PATH):
    print('create dir: %s' % SAVE_PATH)
    os.mkdir(SAVE_PATH)
if not os.path.exists(WEIGHT_PATH):
    print('create dir: %s' % WEIGHT_PATH)
    os.mkdir(WEIGHT_PATH)

print('user_topic_dnn_enc_B...')

user_topic_dnn_enc_B...


In [4]:
data = pd.read_pickle(os.path.join(DATA_PATH, 'invite_data.pkl'))
user_info = pd.read_pickle(os.path.join(DATA_PATH, 'user_info.pkl'))
topic = pd.read_pickle(os.path.join(DATA_PATH, 'topic.pkl'))
len_train = (~data['label'].isnull()).sum()
print(len_train)

9489162


In [5]:
data.head()

Unnamed: 0,iday,ihour,itime,label,qid,uid
0,3865,22,92782,0.0,2166419046,401693808
1,3844,11,92267,0.0,1550017551,3392373099
2,3862,15,92703,0.0,604029601,2317670257
3,3849,11,92387,0.0,2350061229,1618461867
4,3867,4,92812,0.0,2443223942,3544409350


In [8]:
user_info['topic'] = user_info['topic_a'] + user_info['topic_ik']

In [9]:
data = data.merge(user_info[['uid', 'topic']], 'left', 'uid')
# data['topic'] = data['topic'].fillna('-1')
data.head()

Unnamed: 0,iday,ihour,itime,label,qid,uid,topic
0,3865,22,92782,0.0,2166419046,401693808,"[1727, 5310, 3402, 916, 1506, 26329, 7293, 180..."
1,3844,11,92267,0.0,1550017551,3392373099,"[42595, 3, 8520, 597, 6485, 6212, 25664, 148, ..."
2,3862,15,92703,0.0,604029601,2317670257,"[610, 448, 61, 2801, 9019, 65, 233, 190, 55, 5..."
3,3849,11,92387,0.0,2350061229,1618461867,"[5, 33331, 2274, 31, 245, 516, 309, 1326, 119,..."
4,3867,4,92812,0.0,2443223942,3544409350,"[0, 4876, 2467, 245, 68, 556, 42, 8, 825, 227,..."


In [10]:
max_len = data['topic'].apply(len).max()
print(max_len)

110


In [11]:
np.random.seed(42)
def shuffle(l):
    np.random.shuffle(l)
    return l

data['topic'] = data['topic'].progress_apply(shuffle)

100%|██████████| 10630845/10630845 [00:29<00:00, 365319.38it/s]


In [12]:
data.head()

Unnamed: 0,iday,ihour,itime,label,qid,uid,topic
0,3865,22,92782,0.0,2166419046,401693808,"[16, 1734, 18098, 2794, 14572, 38, 3020, 28, 1..."
1,3844,11,92267,0.0,1550017551,3392373099,"[25664, 2959, 394, 460, 2294, 910, 112, 8520, ..."
2,3862,15,92703,0.0,604029601,2317670257,"[316, 207, 2801, 8943, 148, 553, 233, 11078, 9..."
3,3849,11,92387,0.0,2350061229,1618461867,"[1074, 320, 124, 1918, 119, 31, 820, 1096, 177..."
4,3867,4,92812,0.0,2443223942,3544409350,"[825, 4876, 556, 0, 68, 637, 42, 2467, 245, 22..."


In [13]:
topics = pad_sequences(data['topic'], maxlen=max_len)

In [14]:
topics.shape

(10630845, 110)

In [15]:
embed_weights = np.array([[0] * 64] + [list(v) for v in topic['vector'].values])

In [16]:
embed_weights.shape

(100001, 64)

In [17]:
class DataSequence(Sequence):
    
    def __init__(self, x, y, batch_size=128):
        self.x = x
        self.y = y
        self.batch_size = batch_size
         
    def __len__(self):
        return int(np.ceil(self.x.shape[0] / float(self.batch_size)))
    
    def __getitem__(self, idx):
        batch_idx = np.arange(idx * self.batch_size, min((idx + 1) * self.batch_size, self.x.shape[0]))
        batch_x = self.x[batch_idx]# .toarray() # sparse to dense
        batch_y = self.y[batch_idx]
        return batch_x, batch_y

In [18]:
class MetricsCallback(Callback):
    def __init__(self, x_trn, y_trn, x_val, y_val, batch_size=128, save_name='weight.h5'):
        self.trn_generator = DataSequence(x_trn, y_trn, BATCH_SIZE)
        self.val_generator = DataSequence(x_val, y_val, BATCH_SIZE)
        self.y_trn = y_trn
        self.y_val = y_val
        self.save_name = save_name
        self.best_score = 0.5

    def on_epoch_end(self, epoch, logs={}):
        # eval train
        y_pred = self.model.predict_generator(self.trn_generator, 
                                              max_queue_size=10, 
                                              workers=1, 
                                              use_multiprocessing=False, 
                                              verbose=0)
        roc = roc_auc_score(self.y_trn, y_pred)
        # eval valid
        y_pred_val = self.model.predict_generator(self.val_generator, 
                                              max_queue_size=10, 
                                              workers=1, 
                                              use_multiprocessing=False, 
                                              verbose=0)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')
#         print(roc_val)
#         print(log_loss(self.y_val, y_pred_val)/len(y_pred_val))
        if roc_val > self.best_score:
            self.best_score = roc_val
            self.model.save_weights(os.path.join(WEIGHT_PATH, self.save_name))
        
        return

In [19]:
def Net(in_dim, hiddens=(256,256)):
    inp = Input(shape=(max_len,))
    x = Embedding(embed_weights.shape[0], embed_weights.shape[1], weights=[embed_weights], trainable=False)(inp)
    x = Dropout(0.2)(x)
    x = Flatten()(x)
    for hs in hiddens:
        x = Dense(hs)(x)
        x = BatchNormalization()(x)
        x = PReLU()(x)
        x = Dropout(0.2)(x)
    out = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=out)
    return model

In [20]:
train_x = topics[:len_train]
test_x = topics[len_train:]
train_y = data['label'].values[:len_train]

In [21]:
print(train_x.shape)
print(test_x.shape)
print(train_y.sum() / len(train_y))

(9489162, 110)
(1141683, 110)
0.1773515933229931


In [23]:
BATCH_SIZE = 1024

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
    print('-'*100)
    print('Fold %d' % i)
    trn_x, trn_y = train_x[tr_idx], train_y[tr_idx]
    val_x, val_y = train_x[va_idx], train_y[va_idx]
    
    K.clear_session()
    model = Net(in_dim=train_x.shape[1], hiddens=(256,256,256))
    model.compile(loss='binary_crossentropy', optimizer='adam',)
    
    trn_generator = DataSequence(trn_x, trn_y, batch_size=BATCH_SIZE)
    val_generator = DataSequence(val_x, val_y, batch_size=BATCH_SIZE)
    
    history = model.fit_generator(generator=trn_generator, 
                        epochs=30, 
                        verbose=1, 
                        callbacks=[MetricsCallback(trn_x, trn_y, val_x, val_y, 
                                                   batch_size=BATCH_SIZE*4, 
                                                   save_name='user_topic_dnn_enc_B_weight_fold%d.h5' % i)], 
                        max_queue_size=100, 
                        workers=1, 
                        use_multiprocessing=False,)
    
    del trn_x, trn_y, val_x, val_y
    gc.collect()

----------------------------------------------------------------------------------------------------
Fold 0
Epoch 1/30
roc-auc: 0.6194 - roc-auc_val: 0.6028                                                                                                    
Epoch 2/30
roc-auc: 0.6495 - roc-auc_val: 0.6204                                                                                                    
Epoch 3/30
roc-auc: 0.6681 - roc-auc_val: 0.6293                                                                                                    
Epoch 4/30
roc-auc: 0.6776 - roc-auc_val: 0.6333                                                                                                    
Epoch 5/30
roc-auc: 0.6848 - roc-auc_val: 0.6362                                                                                                    
Epoch 6/30
roc-auc: 0.6919 - roc-auc_val: 0.6377                                                                                                   

roc-auc: 0.6968 - roc-auc_val: 0.6401                                                                                                    
Epoch 8/30
roc-auc: 0.7017 - roc-auc_val: 0.6415                                                                                                    
Epoch 9/30
roc-auc: 0.7042 - roc-auc_val: 0.642                                                                                                    
Epoch 10/30
roc-auc: 0.707 - roc-auc_val: 0.6427                                                                                                    
Epoch 11/30
roc-auc: 0.7091 - roc-auc_val: 0.643                                                                                                    
Epoch 12/30
roc-auc: 0.7107 - roc-auc_val: 0.6438                                                                                                    
Epoch 13/30
roc-auc: 0.7132 - roc-auc_val: 0.6439                                                                    

roc-auc: 0.7138 - roc-auc_val: 0.6446                                                                                                    
Epoch 15/30
roc-auc: 0.7157 - roc-auc_val: 0.6453                                                                                                    
Epoch 16/30
roc-auc: 0.715 - roc-auc_val: 0.6456                                                                                                    
Epoch 17/30
roc-auc: 0.7182 - roc-auc_val: 0.6452                                                                                                    
Epoch 18/30
roc-auc: 0.7199 - roc-auc_val: 0.6457                                                                                                    
Epoch 19/30
roc-auc: 0.7198 - roc-auc_val: 0.6462                                                                                                    
Epoch 20/30
roc-auc: 0.7221 - roc-auc_val: 0.6457                                                                

roc-auc: 0.7227 - roc-auc_val: 0.6457                                                                                                    
Epoch 21/30
roc-auc: 0.7225 - roc-auc_val: 0.6452                                                                                                    
Epoch 22/30
roc-auc: 0.7235 - roc-auc_val: 0.6458                                                                                                    
Epoch 23/30
roc-auc: 0.723 - roc-auc_val: 0.6462                                                                                                    
Epoch 24/30
roc-auc: 0.7239 - roc-auc_val: 0.6462                                                                                                    
Epoch 25/30
roc-auc: 0.7262 - roc-auc_val: 0.646                                                                                                    
Epoch 26/30
roc-auc: 0.7262 - roc-auc_val: 0.6468                                                                 

roc-auc: 0.7273 - roc-auc_val: 0.6464                                                                                                    
Epoch 27/30
roc-auc: 0.7276 - roc-auc_val: 0.6466                                                                                                    
Epoch 28/30
roc-auc: 0.727 - roc-auc_val: 0.647                                                                                                    
Epoch 29/30
roc-auc: 0.7267 - roc-auc_val: 0.6466                                                                                                    
Epoch 30/30
roc-auc: 0.73 - roc-auc_val: 0.6469                                                                                                    


In [24]:
# 0.4608

In [25]:
test_generator = DataSequence(test_x, np.zeros(test_x.shape[0]), batch_size=BATCH_SIZE * 4)

test_pred = np.zeros((test_x.shape[0], 1))
train_pred = np.zeros((train_x.shape[0], 1))

for i, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
    print('-'*100)
    print('Fold %d' % i)
    trn_x, trn_y = train_x[tr_idx], train_y[tr_idx]
    val_x, val_y = train_x[va_idx], train_y[va_idx]
    
    K.clear_session()
    model = Net(in_dim=train_x.shape[1], hiddens=(256,256,256))
    model.compile(loss='binary_crossentropy', optimizer='adam',)
    
    trn_generator = DataSequence(trn_x, trn_y, batch_size=BATCH_SIZE * 4)
    val_generator = DataSequence(val_x, val_y, batch_size=BATCH_SIZE * 4)
    
    model.load_weights(os.path.join(WEIGHT_PATH, 'user_topic_dnn_enc_B_weight_fold%d.h5' % i))
    
    train_pred[va_idx] = model.predict_generator(val_generator)
    test_pred += model.predict_generator(test_generator) / 5
    
    del trn_x, trn_y, val_x, val_y
    gc.collect()
    print('Predict Done.')

----------------------------------------------------------------------------------------------------
Fold 0
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 1
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 2
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 3
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 4
Predict Done.


In [26]:
arr = np.vstack([train_pred, test_pred])
feat = pd.DataFrame(arr, columns=['user_topic_dnn_enc_B'])
feat.head()

Unnamed: 0,user_topic_dnn_enc_B
0,0.118857
1,0.118567
2,0.170073
3,0.057185
4,0.144254


In [27]:
feat.to_pickle('./feats/user_topic_dnn_enc_B.pkl')