In [1]:
from os.path import join,exists
from datetime import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Model
from keras.layers import Embedding,Activation,Dense,Input,Lambda,Concatenate,Softmax,Dropout,Dot,Add,Multiply
from keras.callbacks import EarlyStopping,Callback
from keras import backend as K
from keras.regularizers import l2
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [32]:
# Columns that are skipped by the label encoding below.
not_cate_cols=['id','click','hour']

# `id` is read as a string in both files.
train_data=pd.read_csv('raw_data/train.csv',dtype={'id':'str'})
test_data=pd.read_csv('raw_data/test.csv',dtype={'id':'str'})

# Stack train on top of test, then keep a reproducible 10% sample of the rows.
traintst=pd.concat([train_data,test_data])
traintst=traintst.sample(frac=0.1,random_state=42)

# Every column not listed in `not_cate_cols` is cast to str and replaced by
# the integer codes of a LabelEncoder fitted on that (sampled) column.
for c in traintst.columns:
    if c in not_cate_cols:
        continue
    traintst[c]=LabelEncoder().fit_transform(traintst[c].astype('str'))

traintst.info(null_counts=True)
# traintst=pd.read_hdf('raw_data/traintst.hdf','avazu')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4500643 entries, 23002356 to 1525489
Data columns (total 24 columns):
C1                  4500643 non-null int64
C14                 4500643 non-null int64
C15                 4500643 non-null int64
C16                 4500643 non-null int64
C17                 4500643 non-null int64
C18                 4500643 non-null int64
C19                 4500643 non-null int64
C20                 4500643 non-null int64
C21                 4500643 non-null int64
app_category        4500643 non-null int64
app_domain          4500643 non-null int64
app_id              4500643 non-null int64
banner_pos          4500643 non-null int64
click               4042456 non-null float64
device_conn_type    4500643 non-null int64
device_id           4500643 non-null int64
device_ip           4500643 non-null int64
device_model        4500643 non-null int64
device_type         4500643 non-null int64
hour                4500643 non-null int64
id                

In [33]:
# Name of the target column.
label_col='click'

# `hour` rendered as text so that parts of it can be sliced out by position.
time=traintst['hour'].astype('str')
# Hour of day: integer value of the last two characters of the timestamp.
time_hour=time.map(lambda stamp:int(stamp[-2:]))

def get_weekday(s):
    """Return a 1..7 day index for a timestamp string that starts with ``YYMMDD``.

    The index counts calendar days from 2014-10-21 (which maps to 1) modulo 7,
    so dates exactly one week apart share an index.  Real calendar arithmetic
    is used instead of subtracting the raw ``YYMMDD`` integers, which only
    gives the right day distance while both dates are in the same month.

    Args:
        s: timestamp string; only its first six characters are read.
           The two-digit year is taken as 20YY.

    Returns:
        int in the range 1..7.

    Raises:
        ValueError: if the first six characters are not a valid date.
    """
    day=datetime(2000+int(s[:2]),int(s[2:4]),int(s[4:6]))
    reference=datetime(2014,10,21)
    # Python's % is non-negative for a positive modulus, so dates before the
    # reference day still land in 1..7.
    return (day.toordinal()-reference.toordinal())%7+1
# Day index (1..7) for every row, derived from the textual timestamp.
time_weekday=time.map(get_weekday)

# Attach both derived time features to the combined frame as new columns.
traintst['time_hour']=time_hour
traintst['time_weekday']=time_weekday

In [34]:
# Split on the raw `hour` value:
#   hour <  14103000              -> training rows
#   14103000 <= hour < 14103100   -> validation rows
#   hour >= 14103100              -> test rows
tra_data=traintst.loc[traintst.hour.lt(14103000)]
val_data=traintst.loc[traintst.hour.ge(14103000)&traintst.hour.lt(14103100)]
tst_data=traintst.loc[traintst.hour.ge(14103100)]
tra_data.shape,val_data.shape,tst_data.shape

((3620275, 26), (422181, 26), (458187, 26))

In [35]:
# (column name, number of distinct values) for every column that is not in
# `not_cate_cols`.
cols_uniq=[(c,len(traintst[c].unique())) for c in set(traintst.columns)-set(not_cate_cols)]

# Bucket the columns by cardinality and print each bucket.
# fewer than 100 distinct values
sml_cate_cols=[(name,n) for name,n in cols_uniq if n<100]
print(sml_cate_cols)
# 100 .. 999
med_cate_cols=[(name,n) for name,n in cols_uniq if 100<=n<1000]
print(med_cate_cols)
# 1000 .. 9999
big_cate_cols=[(name,n) for name,n in cols_uniq if 1000<=n<10000]
print(big_cate_cols)
# 10000 and above
sup_cate_cols=[(name,n) for name,n in cols_uniq if n>=10000]
print(sup_cate_cols)

[('device_type', 5), ('site_category', 24), ('C15', 8), ('C18', 4), ('C16', 9), ('device_conn_type', 4), ('C1', 7), ('time_weekday', 7), ('C19', 68), ('app_category', 31), ('banner_pos', 7), ('time_hour', 24), ('C21', 61)]
[('C17', 467), ('C20', 166), ('app_domain', 319)]
[('app_id', 5216), ('C14', 2673), ('site_domain', 4387), ('device_model', 6402), ('site_id', 3523)]
[('device_id', 544554), ('device_ip', 1800055)]


In [36]:
# Location of the table that accumulates one row per trained model.
train_root='train'
mi_path=join(train_root,'models_info.hdf')

# Resume from the stored table when it exists, otherwise start with an empty list.
models_info=pd.read_hdf(mi_path) if exists(mi_path) else []

In [45]:
# Keep every (column, cardinality) pair except the one for `device_ip`.
sel_cols=[(name,n) for name,n in cols_uniq if name!='device_ip']
# Column names only, in the same order as `sel_cols`.
cols=[name for name,_ in sel_cols]

# Feature matrices (and labels where used) for the three splits.
x_tra,y_tra=tra_data[cols],tra_data[label_col]
x_val,y_val=val_data[cols],val_data[label_col]
x_tst=tst_data[cols]
x_tra.shape,x_val.shape,x_tst.shape

((3620275, 22), (422181, 22), (458187, 22))

In [46]:
def auc(y_true, y_pred):
    """Keras metric wrapper around ``tf.metrics.auc``.

    Builds the TensorFlow AUC metric for the given tensors, runs the
    local-variables initializer in the current Keras session, and returns
    the second element of the pair produced by ``tf.metrics.auc``.
    """
    # Only the second element of the returned pair is used as the metric tensor.
    _,metric_tensor=tf.metrics.auc(y_true, y_pred)
    # The initializer is run after the metric op has been created.
    K.get_session().run(tf.local_variables_initializer())
    return metric_tensor

In [47]:
def lr(sel_cols):
    """Build and compile a logistic-regression model on one-hot encoded columns.

    Each input column is sliced out of the input tensor, one-hot encoded with
    a depth equal to that column's cardinality, and all encodings are
    concatenated and fed to a single sigmoid unit.

    Args:
        sel_cols: sequence of ``(column_name, n_distinct_values)`` pairs, in
            the same order as the columns of the feature matrix.

    Returns:
        The compiled Keras ``Model``, named ``lr_f<total one-hot width>``.
        ``model.summary()`` is printed as a side effect.
    """
    inputs=Input((len(sel_cols),))
    attrs=[]
    f=0
    for i in range(len(sel_cols)):
        depth=sel_cols[i][1]
        # Per-column values are handed over through `arguments` instead of
        # being captured from the loop: a plain closure over `i` is resolved
        # late, so any later re-invocation of the layer would silently use
        # the last column.
        x=Lambda(lambda t,col:t[:,col],arguments={'col':i})(inputs)
        attrs.append(Lambda(lambda t,depth:K.one_hot(K.cast(t,'int32'),depth),
                            arguments={'depth':depth})(x))
        f+=depth
    x=Concatenate()(attrs)
    output=Dense(1,activation='sigmoid',kernel_initializer='glorot_uniform')(x)

    model=Model(inputs=inputs,outputs=output)
    model.name='lr_f%d'%f
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=[auc])
    model.summary()
    return model

In [54]:
# K.clear_session()
def fm(sel_cols):
    """Build and compile a factorization-machine model.

    First-order part: one-hot encodings of every column, concatenated and
    passed through a single linear unit.  Second-order part: the dot product
    of the ``k``-dimensional embeddings of every pair of columns.  All terms
    are summed and squashed with a sigmoid.

    Args:
        sel_cols: sequence of ``(column_name, n_distinct_values)`` pairs, in
            the same order as the columns of the feature matrix.

    Returns:
        The compiled Keras ``Model``, named ``fm_k<k>``.
        ``model.summary()`` is printed as a side effect.
    """
    # embedding dimension
    k=16
    inputs=Input((len(sel_cols),))

    # per-column one-hot encodings and embeddings
    attrs_ord1=[]
    attrs_emb=[]
    for i in range(len(sel_cols)):
        depth=sel_cols[i][1]
        # Values are bound through `arguments` rather than captured from the
        # loop, so each layer keeps its own column index / depth even if it
        # is invoked again after the loop has finished.
        x=Lambda(lambda t,col:t[:,col],arguments={'col':i})(inputs)
        attrs_ord1.append(Lambda(lambda t,depth:K.one_hot(K.cast(t,'int32'),depth),
                                 arguments={'depth':depth})(x))
        attrs_emb.append(Embedding(depth+1,k,embeddings_initializer='glorot_uniform')(x))

    # 2nd-order terms: one dot product per unordered pair of columns
    attrs_ord2=[]
    for i in range(len(sel_cols)):
        for j in range(i+1,len(sel_cols)):
            attrs_ord2.append(Dot(1)([attrs_emb[i],attrs_emb[j]]))

    # linear unit over the 1st-order (one-hot) attributes
    x1=Concatenate()(attrs_ord1)
    x1=Dense(1,kernel_initializer='glorot_uniform')(x1)

    # 1st + 2nd order
    attrs_ord2.append(x1)
    x=Add()(attrs_ord2)
    output=Activation('sigmoid')(x)

    model=Model(inputs=inputs,outputs=output)
    model.name='fm_k%d'%k
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=[auc])
    model.summary()
    return model

def fm2(sel_cols):
    """Build and compile the FM variant with a learned weight per term.

    The one-hot encodings of every column and the pairwise embedding dot
    products are concatenated and fed to a single sigmoid unit, instead of
    being summed as in ``fm``.

    Args:
        sel_cols: sequence of ``(column_name, n_distinct_values)`` pairs, in
            the same order as the columns of the feature matrix.

    Returns:
        The compiled Keras ``Model``, named ``fm2_k<k>``.
        ``model.summary()`` is printed as a side effect.
    """
    k=16
    inputs=Input((len(sel_cols),))
    embs1=[]   # one-hot (1st-order) encodings
    embs2=[]   # k-dimensional embeddings
    for i in range(len(sel_cols)):
        depth=sel_cols[i][1]
        # Values are bound through `arguments` rather than captured from the
        # loop, so each layer keeps its own column index / depth.
        x=Lambda(lambda t,col:t[:,col],arguments={'col':i})(inputs)
        # The one-hot encodings go into `embs1`; the previous code appended to
        # an undefined `attrs_ord1`, which raised NameError and left `embs1`
        # empty.
        embs1.append(Lambda(lambda t,depth:K.one_hot(K.cast(t,'int32'),depth),
                            arguments={'depth':depth})(x))
        embs2.append(Embedding(depth+1,k,embeddings_initializer='glorot_uniform')(x))
    # pairwise (2nd-order) dot products
    embs3=[]
    for i in range(len(sel_cols)):
        for j in range(i+1,len(sel_cols)):
            embs3.append(Dot(1)([embs2[i],embs2[j]]))

    x=Concatenate()(embs1+embs3)
    output=Dense(1,activation='sigmoid',kernel_initializer='glorot_uniform')(x)

    model=Model(inputs=inputs,outputs=output)
    model.name='fm2_k%d'%k
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=[auc])
    model.summary()
    return model

In [49]:
def deep_fm(sel_cols):
    """Build and compile a DeepFM-style model.

    Wide part: one-hot encodings of every column plus the pairwise dot
    products of the column embeddings.  Deep part: the concatenated
    embeddings passed through two ReLU layers with dropout.  Both parts are
    concatenated and fed to a single L2-regularised sigmoid unit.

    Args:
        sel_cols: sequence of ``(column_name, n_distinct_values)`` pairs, in
            the same order as the columns of the feature matrix.

    Returns:
        The compiled Keras ``Model``, named ``deep_fm_k<k>_h<h>``.
        ``model.summary()`` is printed as a side effect.
    """
    k=4        # embedding dimension
    h=256      # width of each hidden layer
    reg=1e-5   # L2 regularisation factor
    inputs=Input((len(sel_cols),))

    # per-column one-hot encodings and embeddings
    attrs_ord1=[]
    attrs_emb=[]
    for i in range(len(sel_cols)):
        depth=sel_cols[i][1]
        # Values are bound through `arguments` rather than captured from the
        # loop, so each layer keeps its own column index / depth even if it
        # is invoked again after the loop has finished.
        x=Lambda(lambda t,col:t[:,col],arguments={'col':i})(inputs)
        attrs_ord1.append(Lambda(lambda t,depth:K.one_hot(K.cast(t,'int32'),depth),
                                 arguments={'depth':depth})(x))
        attrs_emb.append(Embedding(depth+1,k,embeddings_initializer='glorot_uniform',
                                   embeddings_regularizer=l2(reg))(x))

    # 2nd-order terms: one dot product per unordered pair of columns
    attrs_ord2=[]
    for i in range(len(sel_cols)):
        for j in range(i+1,len(sel_cols)):
            attrs_ord2.append(Dot(1)([attrs_emb[i],attrs_emb[j]]))

    # deep interactions
    d=Concatenate()(attrs_emb)
    d=Dense(h,activation='relu',kernel_initializer='glorot_uniform',kernel_regularizer=l2(reg))(d)
    d=Dropout(0.75)(d)
    d=Dense(h,activation='relu',kernel_initializer='glorot_uniform',kernel_regularizer=l2(reg))(d)
    d=Dropout(0.5)(d)
    # wide part (1st + 2nd order) joined with the deep part
    w=Concatenate()(attrs_ord1+attrs_ord2)
    x=Concatenate()([w,d])
    output=Dense(1,activation='sigmoid',kernel_initializer='glorot_uniform',kernel_regularizer=l2(reg))(x)

    model=Model(inputs=inputs,outputs=output)
    model.name='deep_fm_k%d_h%d'%(k,h)
    # NOTE(review): this model tracks 'accuracy' while the other builders in
    # this file track the `auc` metric - confirm that is intended.
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
    model.summary()
    return model

In [50]:
# # def attention_fm(sel_cols):
# k=0
# for i in range(len(sel_cols)):
#     k+=sel_cols[i][1]
# k=len(sel_cols)*(len(sel_cols)-1)//2
# input=Input((len(sel_cols),))
# embs1=[]
# embs2=[]
# for i in range(len(sel_cols)):
#     x=Lambda(lambda x:x[:,i])(input)
#     embs1.append(Embedding(sel_cols[i][1]+1,sel_cols[i][1]+1,embeddings_initializer='identity',trainable=False)(x))
#     embs2.append(Embedding(sel_cols[i][1]+1,k,embeddings_initializer='glorot_normal')(x))

# embs3=[]
# atts=[]
# t=256
# w=K.random_normal((k,t))
# h=K.random_normal((1,t))
# b=K.random_normal((1,t))
# for i in range(len(sel_cols)):
#     for j in range(i+1,len(sel_cols)):
#         p=Multiply()([embs2[i],embs2[j]])
#         a=Lambda(lambda p,arguments={'w':w}:K.dot(p,w))(p)
#         a=Activation('relu')(Add()([a,b]))
#         a=Dot(1)([h,a])
#         atts.append(a)
#         embs3.append(p)
# a=Concatenate()(atts)
# a=Softmax()(a)
# emb_pool=Add()(embs3)
# emb_pool=Multiply()([a,emb_pool])

# x=Concatenate()(embs1)
# x=Concatenate()([x,emb_pool])
# output=Dense(1,activation='sigmoid',kernel_initializer='glorot_normal')(x)

# model=Model(inputs=input,outputs=output)
# model.name='attention_fm_k%d'%k
# model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['binary_accuracy'])
# model.summary()
# #     return model

In [55]:


class LossHistory(Callback):
    """Keras callback that records the ``logs`` dict reported after every epoch."""

    def __init__(self):
        super().__init__()
        # Defined up front so train_detail() is safe even before training.
        self.logs=[]

    def on_train_begin(self, logs=None):
        """Reset the per-epoch history at the start of every ``fit`` call."""
        self.logs=[]

    def on_epoch_end(self, batch, logs=None):
        """Store a snapshot of this epoch's logs.

        ``batch`` is the first positional argument of the epoch-end hook; the
        parameter name is kept for backward compatibility.
        """
        # Copy so that later changes to the caller's dict (or to the shared
        # default) cannot alter the recorded history.
        self.logs.append(dict(logs or {}))

    def train_detail(self):
        """Return the last epoch's logs plus ``'epochs'``, the number of recorded epochs.

        A fresh dict is returned; the stored history is not modified.  With no
        recorded epoch the result is ``{'epochs': 0}``.
        """
        log=dict(self.logs[-1]) if self.logs else {}
        log['epochs']=len(self.logs)
        return log
    
class auc_callback(Callback):
    """Keras callback that prints train/validation ROC AUC after every epoch.

    The AUC is computed with ``roc_auc_score`` on the model's predictions for
    the full training and validation sets given to the constructor.
    """

    def __init__(self,x_tra,y_tra,x_val,y_val):
        # Let the base class set up the attributes it expects on every callback.
        super().__init__()
        self.x_tra = x_tra
        self.y_tra = y_tra
        self.x_val = x_val
        self.y_val = y_val
        # Defined up front so train_detail() is safe even before training.
        self.logs=[]
        self.auc=None
        self.auc_val=None

    def on_train_begin(self, logs=None):
        """Reset the per-epoch history at the start of every ``fit`` call."""
        self.logs=[]

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on both sets, print the AUCs and record the logs."""
        y_pred = self.model.predict(self.x_tra)
        self.auc = roc_auc_score(self.y_tra, y_pred)
        y_pred_val = self.model.predict(self.x_val)
        self.auc_val = roc_auc_score(self.y_val, y_pred_val)
        # '\r' plus trailing padding overwrites the progress-bar line.
        print('\rauc: %.4f - auc_val: %.4f' % (self.auc,self.auc_val),end=100*' '+'\n')
        # Copy so that later changes to the caller's dict cannot alter the history.
        self.logs.append(dict(logs or {}))

    def train_detail(self):
        """Return the last epoch's logs plus the latest ``'auc'`` and ``'auc_val'``.

        A fresh dict is returned; the stored history is not modified.  With no
        recorded epoch both AUC entries are ``None``.
        """
        log=dict(self.logs[-1]) if self.logs else {}
        log['auc']=self.auc
        log['auc_val']=self.auc_val
        return log
    
def run(models,only_compile=False):
    """Build, train and score each requested model and record its results.

    For every name in ``models`` the Keras session is cleared and the model
    is built from the module-level ``sel_cols``.  Unless ``only_compile`` is
    set, the model is then fitted on ``x_tra``/``y_tra`` with early stopping
    on the validation split, its predictions for ``x_tst`` are written to
    ``output/<utc timestamp>_<model name>.csv``, and one summary row is
    appended to the module-level ``models_info`` table, which is saved to
    ``mi_path`` after every model.

    Args:
        models: iterable of model names; ``'lr'``, ``'fm'`` and ``'deep_fm'``
            are recognised.
        only_compile: when true, models are only built and compiled (their
            summaries are printed); nothing is trained or written.

    Returns:
        List of the fitted models, in the order they were trained.

    Raises:
        ValueError: if an unrecognised model name is requested for training.
    """
    # Local import: the file only imports `join`/`exists` from os.path.
    import os

    global models_info
    tra_models=[]
    for m in models:
        K.clear_session()

        batch_s=128

        if m == 'lr':
            model=lr(sel_cols)
        elif m == 'fm':
            model=fm(sel_cols)
        elif m == 'deep_fm':
            model=deep_fm(sel_cols)
        else:
            model=None

        if only_compile:
            continue

        if model is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("unknown model %r; expected 'lr', 'fm' or 'deep_fm'"%(m,))

        m_info={'model':model.name}
        trainable_count = int(
            np.sum([K.count_params(p) for p in set(model.trainable_weights)]))
        m_info['params_count']=trainable_count

        lh=LossHistory()
        roc=auc_callback(x_tra,y_tra,x_val,y_val)
        model.fit(x_tra,y_tra,validation_data=(x_val,y_val),epochs=100,batch_size=batch_s,
                  callbacks=[EarlyStopping(min_delta=1e-4,patience=1),lh,roc])
        tra_models.append(model)

        # Make sure the target directory exists so a finished training run is
        # not lost to a missing folder.
        os.makedirs('output',exist_ok=True)
        results=tst_data[['id']].copy()
        results[label_col]=model.predict(x_tst)
        results[['id',label_col]].to_csv('output/%s_%s.csv'%(datetime.utcnow().strftime('%y%m%d_%H%M%S'),model.name),
                                         index=False,float_format='%.3f')

        m_info.update(lh.train_detail())
        m_info.update(roc.train_detail())
        if isinstance(models_info,list):
            # First recorded run(s): turn the list of dicts into a table.
            models_info=pd.DataFrame(models_info+[m_info])
        else:
            # pd.concat works on old and new pandas alike
            # (DataFrame.append no longer exists in pandas 2).
            models_info=pd.concat([models_info,pd.DataFrame([m_info])],ignore_index=True)
        os.makedirs(os.path.dirname(mi_path) or '.',exist_ok=True)
        models_info.to_hdf(mi_path,key='models')

    return tra_models

In [56]:
# Train and score only the 'fm' model; the list returned by run() is kept in `models`.
models=run(['fm'])

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 22)           0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None,)              0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None,)              0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_5 (Lambda)               (None,)              0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_7 (

Train on 3620275 samples, validate on 422181 samples
Epoch 1/100
auc: 0.7713 - auc_val: 0.7356                                                                                                    
Epoch 2/100
auc: 0.7897 - auc_val: 0.7235                                                                                                    


In [57]:
# Show the recorded runs: model name, trainable parameter count, number of
# epochs run, and the training/validation loss from the last epoch.
models_info[['model','params_count','epochs','loss','val_loss']]

Unnamed: 0,model,params_count,epochs,loss,val_loss
0,lr_f33342,33364,3,0.395412,0.400594
1,fm_16,567172,3,0.387692,0.404866
2,deep_fm_k16_h256,740591,12,0.494015,0.536752
3,deep_fm_k16_h256,740591,2,0.394316,0.553624
4,deep_fm_k4_h256,205163,3,0.398473,0.40381
5,deep_fm_k4_h256,2928982,2,0.398929,0.406616
6,fm_k16,9655775,2,0.385956,0.410166
