In [1]:
from os.path import join,exists
from datetime import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Model
from keras.layers import Embedding,Activation,Dense,Input,Lambda,Concatenate,Softmax,Dropout,Dot,Add,Multiply,Reshape
from keras.callbacks import EarlyStopping,Callback
from keras.metrics import binary_crossentropy
from keras import backend as K
from sklearn.preprocessing import LabelEncoder,StandardScaler
from tqdm import tnrange

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
%%time
# Load the feature table and the labels, standardise the integer features,
# then cut the rows into train / validation / test segments by position.
big_data=pd.read_hdf('raw_data/big.hdf',key='train')
label=pd.read_hdf('raw_data/label.hdf',key='label').values.ravel()

# Columns [0, int_fea_num) are numeric; the next cat_fea_num columns are categorical ids.
int_fea_num=13
cat_fea_num=26

s=StandardScaler()
for c in range(int_fea_num):
    # Fill missing values with the column median (Series.quantile() defaults to q=0.5).
    # The result is assigned back instead of using inplace=True on a column
    # selection, which is not guaranteed to write through to the frame.
    filled=big_data[c].fillna(big_data[c].quantile())
    # Series.reshape is deprecated/removed in pandas; reshape the underlying ndarray.
    # NOTE(review): the scaler is fit on every row (train, validation and test
    # together), so validation/test statistics leak into the scaling - confirm intended.
    big_data[c]=s.fit_transform(filled.values.reshape(-1,1)).ravel()

# Row layout of big_data: [ train | validation (val_len rows) | test (tst_len rows) ].
tst_len=6042135
val_len=6000000
tra_len=big_data.shape[0]-val_len-tst_len

x_tra=big_data.iloc[:tra_len]
x_val=big_data.iloc[tra_len:tra_len+val_len]
x_tst=big_data.iloc[tra_len+val_len:]
# Labels are sliced with the same bounds as the features so they stay aligned
# even if `label` carries more entries than the train+validation rows.
y_tra=label[:tra_len]
y_val=label[tra_len:tra_len+val_len]
print(x_tra.shape,y_tra.shape,x_val.shape,y_val.shape,x_tst.shape)

  # Remove the CWD from sys.path while we load stuff.


(39840617, 39) () (6000000, 39) () (6042135, 39)
CPU times: user 2min 41s, sys: 23.3 s, total: 3min 4s
Wall time: 3min 4s


In [20]:
# big_data.to_hdf('raw_data/big_int_proc.hdf',key='big')

In [3]:
%%time
# One-hot width per categorical column: the largest id seen in the column plus one.
emb_dim={c:big_data[c].max()+1 for c in range(int_fea_num,int_fea_num+cat_fea_num)}
# (column, width) pairs ordered from the widest column to the narrowest.
sorted_emb_dim=sorted(emb_dim.items(),key=lambda pair:pair[1],reverse=True)
print(sorted_emb_dim)
# Columns wider than one million entries are excluded from the models.
drop_cols=[col for col,width in sorted_emb_dim if width>1000000]

[(15, 11299105), (24, 9292738), (33, 7822987), (28, 6047969), (16, 2416541), (36, 303075), (38, 148165), (22, 95979), (27, 15210), (19, 12597), (23, 5724), (30, 5721), (25, 3207), (31, 2178), (13, 1460), (20, 633), (14, 585), (17, 305), (37, 105), (26, 27), (18, 24), (34, 18), (35, 15), (29, 10), (32, 4), (21, 3)]
CPU times: user 9.06 s, sys: 2.72 s, total: 11.8 s
Wall time: 11.8 s


In [4]:
# Location of the persisted per-model training summary.
train_root='train'
mi_path=join(train_root,'models_info.hdf')
# Reuse the saved summary when the file exists; otherwise start from an empty list.
models_info=pd.read_hdf(mi_path) if exists(mi_path) else []

In [5]:
def lr(input_dim):
    """Build and compile a logistic-regression model over the raw feature matrix.

    Each input column is sliced out individually. Columns listed in the
    module-level ``drop_cols`` are skipped; columns with index below
    ``int_fea_num`` are fed through as a single value; every other column is
    one-hot encoded with width ``emb_dim[i]``. All pieces are concatenated and
    passed to one sigmoid unit.

    Args:
        input_dim: number of columns in the input matrix.

    Returns:
        The compiled Keras ``Model`` named ``lr_f<k>``, where ``k`` is the
        number of columns actually used. The model summary is printed as a
        side effect.
    """
    inputs=Input((input_dim,))
    attrs=[]
    for i in range(input_dim):
        # Skip dropped columns before creating any layer for them.
        if i in drop_cols:
            continue
        # The column index and one-hot depth are bound through `arguments`
        # rather than captured from the enclosing loop, so each layer keeps its
        # own value if the function is ever called again after the loop ends.
        col=Lambda(lambda t,idx:t[:,idx],arguments={'idx':i})(inputs)
        if i < int_fea_num:
            attrs.append(Reshape((1,))(col))
        else:
            # NOTE(review): K.one_hot yields a dense tensor of width emb_dim[i];
            # for very wide columns this is heavy per batch - an Embedding of
            # output size 1 per column would be the sparse equivalent.
            attrs.append(Lambda(lambda t,depth:K.one_hot(K.cast(t,'int32'),depth),
                                arguments={'depth':int(emb_dim[i])})(col))
    f=len(attrs)
    x=Concatenate()(attrs)
    output=Dense(1,activation='sigmoid')(x)

    # Name the model at construction time instead of assigning the attribute afterwards.
    model=Model(inputs=inputs,outputs=output,name='lr_f%d'%f)
    model.compile(optimizer='adam',loss='binary_crossentropy',metrics=[binary_crossentropy])
    model.summary()
    return model

class LossHistory(Callback):
    """Keras callback that keeps the metrics dict reported at the end of each epoch."""

    def __init__(self):
        super(LossHistory,self).__init__()
        # One dict per finished epoch, in order. Initialised here so that
        # train_detail() is safe to call even before training has started.
        self.logs=[]

    def on_train_begin(self, logs=None):
        """Reset the recorded history at the start of a training run."""
        self.logs=[]

    def on_epoch_end(self, batch, logs=None):
        """Store a copy of the metrics for the epoch that just finished.

        The first positional argument is named ``batch`` for backward
        compatibility; this hook receives the epoch index in that position.
        """
        # Copy so later changes to the caller's dict cannot alter the history.
        self.logs.append(dict(logs or {}))

    def train_detail(self):
        """Return the last epoch's metrics plus an ``'epochs'`` count.

        A fresh dict is returned, so the stored history is left unmodified.
        When no epoch has finished, the result is ``{'epochs': 0}``.
        """
        log=dict(self.logs[-1]) if self.logs else {}
        log['epochs']=len(self.logs)
        return log

    
def run(models,only_compile=False,batch_size=256,epochs=100):
    """Build, train and record each requested model.

    For every name in ``models`` the backend session is cleared, the model is
    built, fitted on ``x_tra``/``y_tra`` with ``x_val``/``y_val`` as validation
    data and early stopping, and a summary row (model name, trainable
    parameter count, last-epoch metrics, epoch count) is appended to the global
    ``models_info`` table, which is then written to ``mi_path``.

    Args:
        models: iterable of model names; supported: 'lr', 'fm', 'deep_fm'.
        only_compile: when true, models are only built (not trained, recorded
            or returned).
        batch_size: batch size passed to ``fit``.
        epochs: maximum number of epochs passed to ``fit``.

    Returns:
        List of the trained models, in the order requested.

    Raises:
        ValueError: if any requested name is not a supported model. Checked
            before any model is built so a typo cannot waste a long run.
    """
    import os

    global models_info

    # Builders are wrapped in lambdas so names are only resolved when a model
    # is actually requested.
    builders={
        'lr':lambda:lr(big_data.shape[1]),
        'fm':lambda:fm(sel_cols),
        'deep_fm':lambda:deep_fm(sel_cols),
    }
    models=list(models)
    unknown=[m for m in models if m not in builders]
    if unknown:
        raise ValueError('unknown model name(s) %r; expected one of %r'%(unknown,sorted(builders)))

    tra_models=[]
    for m in models:
        # NOTE(review): clearing the session resets backend state, so models
        # returned from earlier iterations may no longer be usable - confirm.
        K.clear_session()

        model=builders[m]()

        if only_compile:
            continue

        m_info={'model':model.name}
        # De-duplicate shared weights by identity; this does not require the
        # weight objects to be hashable.
        unique_weights={id(w):w for w in model.trainable_weights}.values()
        m_info['params_count']=int(np.sum([K.count_params(w) for w in unique_weights]))

        lh=LossHistory()
        model.fit(x_tra,y_tra,validation_data=(x_val,y_val),epochs=epochs,batch_size=batch_size,
                  callbacks=[EarlyStopping(min_delta=1e-4,patience=1),lh])
        tra_models.append(model)

        m_info.update(lh.train_detail())
        if isinstance(models_info,list):
            # First recorded model in this session: promote the list to a DataFrame.
            models_info=pd.DataFrame(models_info+[m_info])
        else:
            # pd.concat replaces DataFrame.append, which newer pandas no longer provides.
            models_info=pd.concat([models_info,pd.DataFrame([m_info])],ignore_index=True)

        # Make sure the target directory exists so a finished run is not lost at save time.
        out_dir=os.path.dirname(mi_path)
        if out_dir:
            os.makedirs(out_dir,exist_ok=True)
        models_info.to_hdf(mi_path,key='models')

    return tra_models

In [19]:
# Train the logistic-regression model, then show the key columns of the run summary.
models=run(models=['lr'])
models_info.loc[:,['model','params_count','epochs','loss','val_loss']]

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 39)           0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None,)              0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_2 (Lambda)               (None,)              0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None,)              0           input_1[0][0]                    
__________________________________________________________________________________________________
lambda_4 (

Instructions for updating:
Use tf.cast instead.
Train on 39840617 samples, validate on 6000000 samples
Epoch 1/100
 1355008/39840617 [>.............................] - ETA: 21:54:54 - loss: 0.4918 - binary_crossentropy: 0.4918

KeyboardInterrupt: 