In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
import os
import time
import copy
import multiprocessing as mp
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, log_loss
from tqdm import tqdm
from scipy import sparse, spatial
import warnings
# Silence every warning for the rest of the notebook. This also hides
# deprecation notices raised by the imported libraries.
warnings.filterwarnings('ignore')
# Register `progress_apply` on pandas objects so long `apply` calls show a tqdm bar.
tqdm.pandas()

  from collections import Mapping, defaultdict


In [2]:
import os
import tensorflow as tf

# Restrict this process to the GPU with index 0.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
# Build a TF1-style session config with `allow_growth` enabled on the GPU options.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): this session is never explicitly handed to Keras in the visible
# cells — confirm that the option actually applies to the Keras backend session.
session = tf.compat.v1.Session(config=config)

from keras.models import Sequential, Model
from keras.layers import Input, Dense, CuDNNLSTM, Bidirectional, Embedding, CuDNNGRU, Conv1D, MaxPooling1D
from keras.layers import Flatten, PReLU, Dropout, BatchNormalization, SpatialDropout1D, concatenate
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing.sequence import pad_sequences
from keras.utils import Sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.engine.topology import Layer 
# from keras.utils.training_utils import multi_gpu_model

Using TensorFlow backend.


In [3]:
# Wall-clock timestamp taken at the start of the run.
tic = time.time()

# Locations of the cached input pickles, the generated features and the
# model checkpoints.
DATA_PATH = '../pkl/'
SAVE_PATH = './feats/'
WEIGHT_PATH = './weights/'

# Create each output directory that does not exist yet, announcing it first.
for out_dir in (SAVE_PATH, WEIGHT_PATH):
    if os.path.exists(out_dir):
        continue
    print('create dir: %s' % out_dir)
    os.mkdir(out_dir)

print('ques_topic_dnn_enc...')

ques_topic_dnn_enc...


In [4]:
# Load the three cached tables from DATA_PATH: invitations, user profiles and
# the topic table.
data, user_info, topic = (
    pd.read_pickle(os.path.join(DATA_PATH, fname))
    for fname in ('invite_data.pkl', 'user_info.pkl', 'topic.pkl')
)
# Number of rows that carry a label (non-null `label`).
len_train = data['label'].notnull().sum()
print(len_train)

9489162


In [5]:
# Preview the first rows of the invitation table.
data.head()

Unnamed: 0,iday,ihour,itime,label,qid,uid
0,3865,22,92782,0.0,2166419046,401693808
1,3844,11,92267,0.0,1550017551,3392373099
2,3862,15,92703,0.0,604029601,2317670257
3,3849,11,92387,0.0,2350061229,1618461867
4,3867,4,92812,0.0,2443223942,3544409350


In [6]:
# Combine the two per-user topic columns into a single `topic` column.
# Presumably both columns hold Python lists, so `+` concatenates them row by
# row — verify against the code that builds user_info.pkl.
user_info['topic'] = user_info['topic_a'] + user_info['topic_ik']

In [7]:
# Attach every invited user's combined topic list to the invitation rows.
# A left join on `uid` keeps all rows of `data`. The cell is not idempotent:
# running it a second time would yield suffixed `topic_x` / `topic_y` columns.
data = data.merge(right=user_info[['uid', 'topic']], how='left', on='uid')
# data['topic'] = data['topic'].fillna('-1')
data.head()

Unnamed: 0,iday,ihour,itime,label,qid,uid,topic
0,3865,22,92782,0.0,2166419046,401693808,"[1727, 5310, 3402, 916, 1506, 26329, 7293, 180..."
1,3844,11,92267,0.0,1550017551,3392373099,"[42595, 3, 8520, 597, 6485, 6212, 25664, 148, ..."
2,3862,15,92703,0.0,604029601,2317670257,"[610, 448, 61, 2801, 9019, 65, 233, 190, 55, 5..."
3,3849,11,92387,0.0,2350061229,1618461867,"[5, 33331, 2274, 31, 245, 516, 309, 1326, 119,..."
4,3867,4,92812,0.0,2443223942,3544409350,"[0, 4876, 2467, 245, 68, 556, 42, 8, 825, 227,..."


In [8]:
# Length of the longest per-row topic list; later used as the padded
# sequence length.
max_len = data['topic'].map(len).max()
print(max_len)

110


In [9]:
# Seed NumPy's global RNG so that the topic shuffling performed in this cell
# is repeatable.
np.random.seed(42)
def shuffle(l):
    """Randomly permute ``l`` in place and hand the same object back.

    The permutation is drawn from NumPy's global random state, so results
    depend on any prior ``np.random.seed`` call.

    Args:
        l: mutable sequence to reorder; it is modified, not copied.

    Returns:
        The very same object that was passed in, now shuffled.
    """
    # np.random.shuffle works in place and returns None, hence the explicit return.
    np.random.shuffle(l)
    return l

# Randomise the topic order of every row (with a tqdm progress bar).
# `shuffle` mutates the list objects stored in the column in place.
# NOTE(review): rows of the same uid presumably reference one shared list after
# the merge, in which case that list is reshuffled once per row — confirm that
# this is the intended behaviour.
data['topic'] = data['topic'].progress_apply(shuffle)

100%|██████████| 10630845/10630845 [00:27<00:00, 380324.70it/s]


In [10]:
# Preview the table again to inspect the shuffled `topic` lists.
data.head()

Unnamed: 0,iday,ihour,itime,label,qid,uid,topic
0,3865,22,92782,0.0,2166419046,401693808,"[16, 1734, 18098, 2794, 14572, 38, 3020, 28, 1..."
1,3844,11,92267,0.0,1550017551,3392373099,"[25664, 2959, 394, 460, 2294, 910, 112, 8520, ..."
2,3862,15,92703,0.0,604029601,2317670257,"[316, 207, 2801, 8943, 148, 553, 233, 11078, 9..."
3,3849,11,92387,0.0,2350061229,1618461867,"[1074, 320, 124, 1918, 119, 31, 820, 1096, 177..."
4,3867,4,92812,0.0,2443223942,3544409350,"[825, 4876, 556, 0, 68, 637, 42, 2467, 245, 22..."


In [11]:
# Persist the shuffled topic order (one row per uid, first occurrence kept) so
# the exact ordering used for this run can be reloaded later.
SHUFFLE_MAP_PATH = './shuffle_map/'
# The target directory is not created anywhere else in the notebook; without
# this, to_pickle fails on a fresh checkout.
os.makedirs(SHUFFLE_MAP_PATH, exist_ok=True)

# drop_duplicates returns a new frame, so no in-place mutation is needed.
uid_topic_map_31 = data[['uid', 'topic']].drop_duplicates(subset=['uid'])
uid_topic_map_31.to_pickle(os.path.join(SHUFFLE_MAP_PATH, 'uid_topic_map_31.pkl'))
print(uid_topic_map_31.shape)

(1419265, 2)


In [12]:
# Turn the ragged topic lists into a dense id matrix with `max_len` columns.
# Padding and truncation both happen at the front (the Keras defaults, spelled
# out here), and the fill value is the default 0.
# NOTE(review): 0 also shows up as a real id inside the topic lists, so it is
# indistinguishable from padding — confirm whether ids should be shifted by one
# to line up with the extra zero row at the start of the embedding matrix.
topics = pad_sequences(data['topic'], maxlen=max_len, padding='pre', truncating='pre')

In [13]:
# Display the padded id matrix.
topics

array([[    0,     0,     0, ...,    68,    46,  5310],
       [    0,     0,     0, ...,   235,  1470,     3],
       [    0,     0,     0, ...,   740,   610,   587],
       ...,
       [    0,     0,     0, ...,  1322,  1910,     6],
       [    0,     0,     0, ...,   129,    38,    61],
       [    0,     0,     0, ...,  1796,  2800, 10431]], dtype=int32)

In [14]:
# Shape of the padded id matrix.
topics.shape

(10630845, 110)

In [15]:
# Embedding matrix: an all-zero 64-wide row at index 0, followed by the rows of
# topic['vector'] in table order.
# NOTE(review): presumably row k+1 belongs to the k-th topic of the table;
# verify that the ids in data['topic'] follow the same numbering.
embed_weights = np.array([[0] * 64, *map(list, topic['vector'].values)])

In [16]:
# Shape of the embedding matrix.
embed_weights.shape

(100001, 64)

In [17]:
class DataSequence(Sequence):
    """Keras ``Sequence`` serving aligned ``(x, y)`` mini-batches in row order.

    Args:
        x: sample container exposing ``shape[0]`` and accepting an integer
            index array.
        y: targets, indexable with the same integer index array.
        batch_size: maximum number of rows per batch; the final batch may be
            shorter.
    """

    def __init__(self, x, y, batch_size=128):
        self.x, self.y = x, y
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches, counting a trailing partial batch."""
        n_rows = self.x.shape[0]
        return int(np.ceil(n_rows / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch number ``idx`` as an ``(inputs, targets)`` pair."""
        start = idx * self.batch_size
        # Clamp the end so the last batch does not run past the data.
        stop = min(start + self.batch_size, self.x.shape[0])
        rows = np.arange(start, stop)
        return self.x[rows], self.y[rows]

In [18]:
class MetricsCallback(Callback):
    """Report train/validation ROC-AUC after each epoch and checkpoint the best weights.

    Args:
        x_trn, y_trn: training inputs and labels scored at every epoch end.
        x_val, y_val: validation inputs and labels scored at every epoch end.
        batch_size: batch size used for the evaluation predictions.
        save_name: file name (inside ``WEIGHT_PATH``) for the best weights.

    Weights are saved whenever the validation ROC-AUC exceeds the best value
    seen so far; the starting threshold is 0.5.
    """

    def __init__(self, x_trn, y_trn, x_val, y_val, batch_size=128, save_name='weight.h5'):
        super().__init__()
        # Honour the `batch_size` argument; it used to be ignored in favour of
        # the notebook-level BATCH_SIZE global.
        self.trn_generator = DataSequence(x_trn, y_trn, batch_size)
        self.val_generator = DataSequence(x_val, y_val, batch_size)
        self.y_trn = y_trn
        self.y_val = y_val
        self.save_name = save_name
        self.best_score = 0.5

    def _predict(self, generator):
        """Run the attached model over ``generator`` in a single worker."""
        return self.model.predict_generator(generator,
                                            max_queue_size=10,
                                            workers=1,
                                            use_multiprocessing=False,
                                            verbose=0)

    def on_epoch_end(self, epoch, logs=None):
        """Score both splits, print the AUCs and save weights on improvement."""
        roc = roc_auc_score(self.y_trn, self._predict(self.trn_generator))
        roc_val = roc_auc_score(self.y_val, self._predict(self.val_generator))
        # '\r' plus the trailing blanks overwrite the Keras progress-bar line.
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')
        if roc_val > self.best_score:
            self.best_score = roc_val
            self.model.save_weights(os.path.join(WEIGHT_PATH, self.save_name))

In [19]:
def Net(in_dim, hiddens=(256,)):
    """Build the (uncompiled) CNN classifier over padded topic-id sequences.

    Architecture: frozen embedding initialised from ``embed_weights`` ->
    dropout -> Conv1D(256, kernel 2) -> max pooling (4) -> flatten -> one
    Dense/BatchNorm/PReLU/Dropout stack per entry of ``hiddens`` -> single
    sigmoid unit.

    Args:
        in_dim: length of every input id sequence. It now defines the input
            shape; previously it was ignored in favour of the global
            ``max_len``.
        hiddens: widths of the dense blocks, applied in order.

    Returns:
        A Keras ``Model`` mapping ``(batch, in_dim)`` ids to ``(batch, 1)``
        probabilities.
    """
    inp = Input(shape=(in_dim,))
    x = Embedding(embed_weights.shape[0], embed_weights.shape[1], weights=[embed_weights], trainable=False)(inp)
    x = Dropout(0.2)(x)
    x = Conv1D(256, 2, padding='valid', activation='relu', strides=1)(x)
    x = MaxPooling1D(pool_size=4)(x)
    x = Flatten()(x)
    for hs in hiddens:
        x = Dense(hs)(x)
        x = BatchNormalization()(x)
        x = PReLU()(x)
        x = Dropout(0.2)(x)
    out = Dense(1, activation="sigmoid")(x)
    return Model(inputs=inp, outputs=out)

In [20]:
# Split by position: the first `len_train` rows are used for training, the
# rest for testing.
# NOTE(review): this assumes all labelled rows sit contiguously at the top of
# `data` — confirm against the code that builds invite_data.pkl.
train_x, test_x = topics[:len_train], topics[len_train:]
train_y = data['label'].values[:len_train]

In [21]:
# Sanity check: shapes of both splits, then the mean of the training labels.
print(train_x.shape, test_x.shape, sep='\n')
print(train_y.sum() / len(train_y))

(9489162, 110)
(1141683, 110)
0.1773515933229931


In [22]:
# Cross-validation setup. The training loop that follows is commented out, so
# running the notebook end to end requires the per-fold weight files to already
# exist under WEIGHT_PATH.
BATCH_SIZE = 1024

# Fixed seed: the inference cell re-splits with this same object and therefore
# iterates over the same folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# for i, (tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
#     print('-'*100)
#     print('Fold %d' % i)
#     trn_x, trn_y = train_x[tr_idx], train_y[tr_idx]
#     val_x, val_y = train_x[va_idx], train_y[va_idx]
    
#     K.clear_session()
#     model = Net(in_dim=train_x.shape[1], hiddens=(256,256,256))
#     model.compile(loss='binary_crossentropy', optimizer='adam',)
    
#     trn_generator = DataSequence(trn_x, trn_y, batch_size=BATCH_SIZE)
#     val_generator = DataSequence(val_x, val_y, batch_size=BATCH_SIZE)
    
#     history = model.fit_generator(generator=trn_generator, 
#                         epochs=30, 
#                         verbose=1, 
#                         callbacks=[MetricsCallback(trn_x, trn_y, val_x, val_y, 
#                                                    batch_size=BATCH_SIZE*4, 
#                                                    save_name='user_topic_dnn_enc_A_weight_fold%d.h5' % i)], 
#                         max_queue_size=100, 
#                         workers=1, 
#                         use_multiprocessing=False,)
    
#     del trn_x, trn_y, val_x, val_y
#     gc.collect()

In [23]:
# 0.4608

In [24]:
# Inference: rebuild the network for every fold, load that fold's saved
# weights, fill in the out-of-fold predictions for the training rows and
# average the test predictions over all folds.
BATCH_SIZE = 1024
PRED_BATCH_SIZE = BATCH_SIZE * 4
# Derive the averaging factor from the splitter instead of hard-coding 5.
n_folds = kfold.get_n_splits()

# The labels are unused for prediction; zeros only satisfy the (x, y) interface.
test_generator = DataSequence(test_x, np.zeros(test_x.shape[0]), batch_size=PRED_BATCH_SIZE)

test_pred = np.zeros((test_x.shape[0], 1))
train_pred = np.zeros((train_x.shape[0], 1))

for i, (_tr_idx, va_idx) in enumerate(kfold.split(train_x, train_y)):
    print('-'*100)
    print('Fold %d' % i)
    # Only the validation slice is needed here; copying the training slice
    # would just allocate memory that is never read.
    val_x, val_y = train_x[va_idx], train_y[va_idx]

    # Drop the previous fold's graph before building a fresh model.
    K.clear_session()
    model = Net(in_dim=train_x.shape[1], hiddens=(256,256,256))
    model.compile(loss='binary_crossentropy', optimizer='adam',)

    val_generator = DataSequence(val_x, val_y, batch_size=PRED_BATCH_SIZE)

    model.load_weights(os.path.join(WEIGHT_PATH, 'user_topic_dnn_enc_A_weight_fold%d.h5' % i))

    train_pred[va_idx] = model.predict_generator(val_generator)
    test_pred += model.predict_generator(test_generator) / n_folds

    print(roc_auc_score(val_y, train_pred[va_idx]))

    del val_x, val_y
    gc.collect()
    print('Predict Done.')

----------------------------------------------------------------------------------------------------
Fold 0
Instructions for updating:
Colocations handled automatically by placer.
0.6431040880222472
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 1
0.6431329820470291
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 2
0.6429093656776126
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 3
0.6434188656247765
Predict Done.
----------------------------------------------------------------------------------------------------
Fold 4
0.6435249952821259
Predict Done.


In [23]:
# Stack the out-of-fold train predictions on top of the fold-averaged test
# predictions and wrap them in a single-column feature frame.
arr = np.concatenate((train_pred, test_pred), axis=0)
feat = pd.DataFrame(data=arr, columns=['user_topic_dnn_enc_A'])
feat.head()

Unnamed: 0,user_topic_dnn_enc_A
0,0.082465
1,0.12736
2,0.090479
3,0.075117
4,0.158286


In [26]:
# Write the feature under the configured output directory instead of a
# hard-coded path, so changing SAVE_PATH redirects the output as well.
feat.to_pickle(os.path.join(SAVE_PATH, 'user_topic_dnn_enc_A.pkl'))