In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import copy
import multiprocessing as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, log_loss
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
warnings.filterwarnings('ignore')
tqdm.pandas()

In [2]:
import os
import tensorflow as tf

# Expose only GPU 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Let TensorFlow grow its GPU memory allocation on demand.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): this session is never explicitly handed to Keras in the visible
# code — confirm the allow_growth setting is actually picked up during training.
session = tf.compat.v1.Session(config=config)

from keras.models import Sequential, Model
from keras.layers import Input, Dense, CuDNNLSTM, Bidirectional, Embedding, CuDNNGRU, Conv1D, MaxPooling1D
from keras.layers import Flatten, PReLU, Dropout, BatchNormalization, SpatialDropout1D, concatenate
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing.sequence import pad_sequences
from keras.utils import Sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.engine.topology import Layer 
# from keras.utils.training_utils import multi_gpu_model

Using TensorFlow backend.


In [3]:
# Wall-clock start of the run.
tic = time.time()

# Input pickles are read from DATA_PATH; features and model checkpoints are
# written to SAVE_PATH and WEIGHT_PATH respectively.
DATA_PATH = '../pkl/'
SAVE_PATH = './feats/'
WEIGHT_PATH = './weights/'

# Create the output directories up front. makedirs(exist_ok=True) also creates
# missing parents and does not fail if the directory appears between the
# existence check and the creation call.
for out_dir in (SAVE_PATH, WEIGHT_PATH):
    if not os.path.exists(out_dir):
        print('create dir: %s' % out_dir)
    os.makedirs(out_dir, exist_ok=True)

print('ques_topic_dnn_enc...')

ques_topic_dnn_enc...


In [4]:
# Load the invitation records, the question metadata and the topic table
# from DATA_PATH (in that order).
data, question_info, topic = (
    pd.read_pickle(os.path.join(DATA_PATH, fname))
    for fname in ('invite_data.pkl', 'question_info.pkl', 'topic.pkl')
)
# Number of rows that carry a label.
len_train = data['label'].notnull().sum()
print(len_train)

9489162


In [5]:
# Preview the first rows of the invitation records.
data.head()

Unnamed: 0,iday,ihour,itime,label,qid,uid
0,3865,22,92782,0.0,2166419046,401693808
1,3844,11,92267,0.0,1550017551,3392373099
2,3862,15,92703,0.0,604029601,2317670257
3,3849,11,92387,0.0,2350061229,1618461867
4,3867,4,92812,0.0,2443223942,3544409350


In [6]:
# Preview the first rows of the question metadata.
question_info.head()

Unnamed: 0,qid,qtime,title_sw,title_w,desc_sw,desc_w,topic,qday,qhour
0,2234111670,24437,"[211, 204, 1715, 69, 2033, 138, 57, 138, 8, 28...","[22414, 963, 10458]",[0],[0],"[321, 730, 5784, 4389]",1018,5
1,760329790,41900,"[69, 2033, 138, 2616, 2668, 36, 2594, 1165, 20...","[12677, 16829, 15201, 6419, 101839]","[146, 982, 401, 297, 17, 2616, 2668, 36, 2594,...","[1296, 2118, 12677, 16829, 15201, 6419, 101839...","[278, 12673, 4677]",1745,20
2,741313548,48789,"[153, 662, 1218, 853, 325, 1056, 467, 398, 102...","[700, 2781, 3280, 81215]","[1956, 3583, 153, 34, 35, 1016, 586, 586, 716,...","[732, 24400, 48321, 39608, 20788, 219486, 1183...",[226],2032,21
3,3481466230,52455,"[22, 179, 57, 451, 594, 118, 882, 655, 1, 433,...","[3312, 1823, 1505, 638, 166, 461]","[323, 37, 1, 606, 1227, 29, 22, 179, 7, 44, 27...","[6642, 4214, 3312, 1505, 2205, 232, 294, 7177,...","[51, 4468]",2185,15
4,3966197028,54473,"[1622, 223, 1218, 853, 390, 220, 753, 909, 557...","[700, 895, 2253]",[0],[0],"[54700, 81, 57, 17670, 43574]",2269,17


In [7]:
# Attach each question's topic-id list to its invitation rows.
# validate='many_to_one' makes pandas raise if question_info contains duplicate
# qids; such duplicates would otherwise silently multiply invitation rows and
# break the positional train/test split that relies on len_train.
data = data.merge(
    question_info[['qid', 'topic']],
    how='left',
    on='qid',
    validate='many_to_one',
)
data.head()

Unnamed: 0,iday,ihour,itime,label,qid,uid,topic
0,3865,22,92782,0.0,2166419046,401693808,"[456, 112, 9566, 5310]"
1,3844,11,92267,0.0,1550017551,3392373099,"[2, 3095]"
2,3862,15,92703,0.0,604029601,2317670257,"[6090, 2156, 97, 456]"
3,3849,11,92387,0.0,2350061229,1618461867,[856]
4,3867,4,92812,0.0,2443223942,3544409350,"[26, 76, 17]"


In [8]:
# Length of the longest topic list over all rows; used below as the padded
# sequence width.
max_len = data['topic'].map(len).max()
print(max_len)

13


In [9]:
# Seed NumPy's global RNG so the reordering performed by shuffle() is repeatable.
np.random.seed(42)


def shuffle(l):
    """Shuffle ``l`` in place using NumPy's global RNG and return that same object.

    The argument is mutated; no copy is made.
    """
    np.random.shuffle(l)
    return l

# Randomly reorder the topic ids inside every row's list (with a progress bar).
# NOTE(review): shuffle() mutates each list in place, so any other object that
# references the same list instances (possibly question_info, or rows sharing a
# qid after the merge) would see the reordering too — confirm this is intended.
data['topic'] = data['topic'].progress_apply(shuffle)

100%|██████████| 10630845/10630845 [00:21<00:00, 494573.49it/s]


In [10]:
# Preview the rows again to inspect the topic lists after shuffling.
data.head()

Unnamed: 0,iday,ihour,itime,label,qid,uid,topic
0,3865,22,92782,0.0,2166419046,401693808,"[112, 9566, 5310, 456]"
1,3844,11,92267,0.0,1550017551,3392373099,"[2, 3095]"
2,3862,15,92703,0.0,604029601,2317670257,"[456, 2156, 97, 6090]"
3,3849,11,92387,0.0,2350061229,1618461867,[856]
4,3867,4,92812,0.0,2443223942,3544409350,"[76, 17, 26]"


In [11]:
# Turn the per-row topic-id lists into one fixed-width matrix of width max_len.
# NOTE(review): relies on the pad_sequences defaults (presumably left-padding
# with 0) — confirm 0 is never a real topic id.
topics = pad_sequences(data['topic'], maxlen=max_len)

In [12]:
# Shape of the padded topic matrix: (rows, max_len).
topics.shape

(10630845, 13)

In [13]:
# Build the embedding matrix: row 0 is an all-zero vector, followed by one row
# per entry of topic['vector'] in table order. The width is inferred from the
# vectors themselves instead of being hard-coded to 64.
# NOTE(review): assumes the topic ids stored in data['topic'] equal the 1-based
# row position in `topic`, with 0 reserved for padding — confirm.
topic_vectors = np.asarray([list(v) for v in topic['vector'].values], dtype=np.float64)
embed_weights = np.vstack([np.zeros((1, topic_vectors.shape[1])), topic_vectors])
del topic_vectors

In [14]:
# Shape of the embedding matrix: (number of topic rows + 1 zero row, vector width).
embed_weights.shape

(100001, 64)

In [15]:
class DataSequence(Sequence):
    """Keras ``Sequence`` that serves consecutive mini-batches of ``(x, y)``.

    Parameters
    ----------
    x : array-like
        Features; must expose ``shape[0]`` and support slicing along axis 0.
    y : array-like
        Targets aligned with ``x`` along axis 0.
    batch_size : int, default 128
        Rows per batch; the final batch may be shorter.
    """

    def __init__(self, x, y, batch_size=128):
        self.x = x
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(self.x.shape[0] / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch number ``idx`` as a ``(batch_x, batch_y)`` tuple."""
        start = idx * self.batch_size
        stop = min(start + self.batch_size, self.x.shape[0])
        # Batches are contiguous, so a plain slice selects the same rows as an
        # explicit index array without allocating the indices or copying data.
        batch_x = self.x[start:stop]
        batch_y = self.y[start:stop]
        return batch_x, batch_y

In [16]:
class MetricsCallback(Callback):
    """Print train/validation ROC-AUC after every epoch and checkpoint the best weights.

    Parameters
    ----------
    x_trn, y_trn : array-like
        Training features and labels, re-scored at the end of each epoch.
    x_val, y_val : array-like
        Validation features and labels.
    batch_size : int, default 128
        Batch size used when predicting for the metric computation.
    save_name : str, default 'weight.h5'
        File name (inside ``WEIGHT_PATH``) the best weights are written to.
    """

    def __init__(self, x_trn, y_trn, x_val, y_val, batch_size=128, save_name='weight.h5'):
        super().__init__()
        # Honour the caller-supplied batch_size; it used to be ignored in favour
        # of the module-level BATCH_SIZE, which is only defined in a later cell.
        self.trn_generator = DataSequence(x_trn, y_trn, batch_size)
        self.val_generator = DataSequence(x_val, y_val, batch_size)
        self.y_trn = y_trn
        self.y_val = y_val
        self.save_name = save_name
        # Weights are only written once the validation AUC exceeds this value,
        # so a run that never beats 0.5 leaves no checkpoint behind.
        self.best_score = 0.5

    def _predict(self, generator):
        """Return the attached model's predictions over ``generator``."""
        return self.model.predict_generator(generator,
                                            max_queue_size=10,
                                            workers=1,
                                            use_multiprocessing=False,
                                            verbose=0)

    def on_epoch_end(self, epoch, logs=None):
        """Score both splits, print the AUCs and save weights on a new best validation AUC."""
        # eval train
        roc = roc_auc_score(self.y_trn, self._predict(self.trn_generator))
        # eval valid
        roc_val = roc_auc_score(self.y_val, self._predict(self.val_generator))
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')
        if roc_val > self.best_score:
            self.best_score = roc_val
            self.model.save_weights(os.path.join(WEIGHT_PATH, self.save_name))

        return

In [17]:
def Net(in_dim, hiddens=(256,256)):
    """Build the feed-forward binary classifier over frozen topic embeddings.

    Parameters
    ----------
    in_dim : int
        Length of each padded topic-id sequence fed to the model.
    hiddens : sequence of int, default (256, 256)
        Width of each Dense -> BatchNorm -> PReLU -> Dropout block.

    Returns
    -------
    An uncompiled Keras ``Model`` with a single sigmoid output unit.
    """
    # Size the input from the in_dim argument; it used to be ignored in favour
    # of the module-level max_len.
    inp = Input(shape=(in_dim,))
    # Embedding rows are initialised from embed_weights and kept frozen.
    x = Embedding(embed_weights.shape[0], embed_weights.shape[1], weights=[embed_weights], trainable=False)(inp)
    x = Dropout(0.2)(x)
    x = Flatten()(x)
    for hs in hiddens:
        x = Dense(hs)(x)
        x = BatchNormalization()(x)
        x = PReLU()(x)
        x = Dropout(0.2)(x)
    out = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=out)
    return model

In [18]:
# The first len_train rows are used as the training set; the remainder is the
# test set. Labels are taken for the training rows only.
train_x, test_x = np.split(topics, [len_train])
train_y = data['label'].values[:len_train]

In [19]:
# Sanity check: shapes of both splits and the mean label of the training rows.
for split in (train_x, test_x):
    print(split.shape)
positive_rate = train_y.sum() / len(train_y)
print(positive_rate)

(9489162, 13)
(1141683, 13)
0.1773515933229931


In [21]:
BATCH_SIZE = 1024

# 5-fold stratified CV with a fixed seed, so the prediction cell can rebuild
# exactly the same folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Train one model per fold. Every fold must be trained (no early `break`):
# the prediction cell loads one weight file per fold.
for i, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
    print('-'*100)
    print('Fold %d' % i)
    trn_x, trn_y = train_x[tr_idx], train_y[tr_idx]
    val_x, val_y = train_x[va_idx], train_y[va_idx]

    # Start each fold from a fresh Keras session.
    K.clear_session()
    model = Net(in_dim=train_x.shape[1], hiddens=(256,256,256))
    model.compile(loss='binary_crossentropy', optimizer='adam',)

    trn_generator = DataSequence(trn_x, trn_y, batch_size=BATCH_SIZE)

    # MetricsCallback scores both splits after each epoch and saves the weights
    # whenever the validation AUC improves.
    history = model.fit_generator(generator=trn_generator,
                        epochs=30,
                        verbose=1,
                        callbacks=[MetricsCallback(trn_x, trn_y, val_x, val_y,
                                                   batch_size=BATCH_SIZE*4,
                                                   save_name='ques_topic_dnn_enc_B_weight_fold%d.h5' % i)],
                        max_queue_size=100,
                        workers=1,
                        use_multiprocessing=False,)

    # Release the per-fold copies before the next fold allocates its own.
    del trn_x, trn_y, val_x, val_y
    gc.collect()

----------------------------------------------------------------------------------------------------
Fold 0


KeyboardInterrupt: 

In [21]:
# 0.4608

In [22]:
# Test labels are unknown; zeros are passed only because DataSequence needs a y.
test_generator = DataSequence(test_x, np.zeros(test_x.shape[0]), batch_size=BATCH_SIZE * 4)

test_pred = np.zeros((test_x.shape[0], 1))
train_pred = np.zeros((train_x.shape[0], 1))
# Average test predictions over however many folds kfold defines, rather than
# a hard-coded 5.
n_folds = kfold.get_n_splits()

# kfold has a fixed random_state, so this reproduces the training folds.
for i, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
    print('-'*100)
    print('Fold %d' % i)
    # Only the validation part is needed for prediction; the training part is
    # not copied.
    val_x, val_y = train_x[va_idx], train_y[va_idx]

    K.clear_session()
    model = Net(in_dim=train_x.shape[1], hiddens=(256,256,256))
    model.compile(loss='binary_crossentropy', optimizer='adam',)

    val_generator = DataSequence(val_x, val_y, batch_size=BATCH_SIZE * 4)

    model.load_weights(os.path.join(WEIGHT_PATH, 'ques_topic_dnn_enc_B_weight_fold%d.h5' % i))

    # Out-of-fold predictions for the training rows, fold-averaged predictions
    # for the test rows.
    train_pred[va_idx] = model.predict_generator(val_generator)
    test_pred += model.predict_generator(test_generator) / n_folds

    del val_x, val_y
    gc.collect()
    print('Predict Done.')

----------------------------------------------------------------------------------------------------
Fold 0
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 1
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 2
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 3
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 4
Predict Done.


In [23]:
# Place the training-row predictions above the test-row predictions, then wrap
# them in a single-column feature frame.
arr = np.concatenate((train_pred, test_pred), axis=0)
feat = pd.DataFrame(data=arr, columns=['ques_topic_dnn_enc_B'])
feat.head()

Unnamed: 0,ques_topic_dnn_enc_B
0,0.116971
1,0.192841
2,0.057636
3,0.145904
4,0.239827


In [24]:
# Persist the feature under SAVE_PATH (same location as before, but via the
# shared constant instead of a repeated literal).
feat.to_pickle(os.path.join(SAVE_PATH, 'ques_topic_dnn_enc_B.pkl'))