In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import config
from sklearn.model_selection import StratifiedKFold
from load_data import FeatureDict, DataPaser
from model import DCN

  from ._conv import register_converters as _register_converters


## Load data

In [2]:
def load_data(train_path='data/train.csv', test_path='data/test.csv'):
    """Read the train/test CSV files, add engineered columns, and build arrays.

    Parameters
    ----------
    train_path : str, optional
        Path of the training CSV. Defaults to ``'data/train.csv'``.
    test_path : str, optional
        Path of the test CSV. Defaults to ``'data/test.csv'``.

    Returns
    -------
    tuple
        ``(dfTrain, dfTest, X_train, y_train, X_test, ids_test)`` where the
        two frames carry the engineered columns, ``X_train`` / ``X_test`` are
        the feature values (all columns except ``id``, ``target`` and anything
        in ``config.IGNORE_COLS``), ``y_train`` is the ``target`` column of the
        training frame and ``ids_test`` is the ``id`` column of the test frame.
    """
    non_feature_cols = ('id', 'target')

    def preprocess(df):
        """Add the engineered columns to ``df`` (modified in place) and return it."""
        base_cols = [c for c in df.columns if c not in non_feature_cols]
        # Per-row count of cells equal to -1 across the original feature columns.
        df['missing_feat'] = np.sum((df[base_cols] == -1).values, axis=1)
        # Interaction term between the two named columns.
        df['ps_car_13_x_ps_reg_03'] = df['ps_car_13'] * df['ps_reg_03']
        return df

    dfTrain = preprocess(pd.read_csv(train_path))
    dfTest = preprocess(pd.read_csv(test_path))

    # Feature columns are taken from the training frame so that train and test
    # arrays share the same column order.
    feature_cols = [
        c for c in dfTrain.columns
        if c not in non_feature_cols and c not in config.IGNORE_COLS
    ]

    X_train = dfTrain[feature_cols].values
    y_train = dfTrain['target'].values
    X_test = dfTest[feature_cols].values
    ids_test = dfTest['id'].values

    return dfTrain, dfTest, X_train, y_train, X_test, ids_test

## Run model

In [3]:
def run_dcn(dfTrain, dfTest, folds, params):
    """Fit one DCN model per cross-validation fold.

    Parameters
    ----------
    dfTrain, dfTest : pandas.DataFrame
        Training and test frames passed to ``FeatureDict`` and ``DataPaser``.
    folds : iterable of (train_indices, valid_indices)
        Index pairs selecting the rows of the parsed training data used for
        fitting and for validation in each fold.
    params : dict
        Keyword arguments for ``DCN``. Updated in place with
        ``cate_feature_size``, ``field_size`` and ``numeric_feature_size``
        before the models are built.

    Returns
    -------
    None
    """
    fd = FeatureDict(dfTrain, dfTest, numeric_cols=config.NUMERIC_COLS,
                     ignore_cols=config.IGNORE_COLS, cate_cols=config.CATEGORICAL_COLS)

    data_parser = DataPaser(feat_dict=fd)
    cate_Xi_train, cate_Xv_train, numeric_Xv_train, y_train = data_parser.parse(
        df=dfTrain, has_label=True)
    # NOTE(review): the parsed test data is not used below; the call is kept
    # in case parse() has effects not visible from here — TODO confirm.
    cate_Xi_test, cate_Xv_test, numeric_Xv_test, ids_test = data_parser.parse(df=dfTest)

    params['cate_feature_size'] = fd.feat_dim
    params['field_size'] = len(cate_Xi_train[0])
    params['numeric_feature_size'] = len(config.NUMERIC_COLS)

    def _take(rows, indices):
        """Return the elements of ``rows`` at ``indices`` as a list."""
        return [rows[i] for i in indices]

    for fold_no, (trn_idx, val_idx) in enumerate(folds):
        cate_Xi_train_ = _take(cate_Xi_train, trn_idx)
        cate_Xv_train_ = _take(cate_Xv_train, trn_idx)
        numeric_Xv_train_ = _take(numeric_Xv_train, trn_idx)
        y_train_ = _take(y_train, trn_idx)

        cate_Xi_valid_ = _take(cate_Xi_train, val_idx)
        # Fix: validation feature values must come from cate_Xv_train; the
        # previous code sliced cate_Xi_train here, feeding indices as values.
        cate_Xv_valid_ = _take(cate_Xv_train, val_idx)
        numeric_Xv_valid_ = _take(numeric_Xv_train, val_idx)
        y_valid_ = _take(y_train, val_idx)

        # A fresh model is built for every fold.
        dcn = DCN(**params)
        dcn.fit(cate_Xi_train_, cate_Xv_train_, numeric_Xv_train_, y_train_,
                cate_Xi_valid_, cate_Xv_valid_, numeric_Xv_valid_, y_valid_)

## Main

In [4]:
# Load both frames plus the array views used to build the CV folds.
(dfTrain, dfTest,
 X_train, y_train,
 X_test, ids_test) = load_data()

In [5]:
# Materialise the stratified (train_idx, valid_idx) pairs once so every
# consumer iterates over the same splits.
folds = list(
    StratifiedKFold(
        n_splits=config.NUM_SPLITS,
        shuffle=True,
        random_state=config.RANDOM_SEED,
    ).split(X_train, y_train)
)

In [6]:
# Hyper-parameters forwarded to DCN(**params).
params = dict(
    embedding_size=4,
    deep_layers=[8, 8],
    dropout_deep=[0.5, 0.5, 0.5],
    deep_layers_activation=tf.nn.relu,
    epoch=30,
    batch_size=128,
    learning_rate=0.001,
    optimizer_type='adam',
    verbose=True,
    random_seed=config.RANDOM_SEED,
    cross_layer_num=3,
)

In [7]:
# Train one DCN per fold; progress is printed by the model itself.
run_dcn(
    dfTrain=dfTrain,
    dfTest=dfTest,
    folds=folds,
    params=params,
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  df = pd.concat([self.trainfile,self.testfile])


Parames: 3259
6666
6666
6666
6666
epoch:  0 loss: [12.024529]
epoch:  1 loss: [7.773973]
epoch:  2 loss: [2.8597765]
epoch:  3 loss: [1.4841796]
epoch:  4 loss: [1.1699396]
epoch:  5 loss: [1.6370908]
epoch:  6 loss: [1.8515997]
epoch:  7 loss: [2.2488427]
epoch:  8 loss: [2.4133043]
epoch:  9 loss: [3.5436616]
epoch:  10 loss: [3.5054557]
epoch:  11 loss: [2.6502101]
epoch:  12 loss: [2.3495483]
epoch:  13 loss: [2.6154437]
epoch:  14 loss: [1.1022573]
epoch:  15 loss: [2.9973662]
epoch:  16 loss: [3.8579004]
epoch:  17 loss: [6.178442]
epoch:  18 loss: [8.025207]
epoch:  19 loss: [7.136653]
epoch:  20 loss: [7.3870573]
epoch:  21 loss: [8.595674]
epoch:  22 loss: [7.3435473]
epoch:  23 loss: [6.7827497]
epoch:  24 loss: [4.636249]
epoch:  25 loss: [3.5871704]
epoch:  26 loss: [2.8329947]
epoch:  27 loss: [4.4573736]
epoch:  28 loss: [4.7281036]
epoch:  29 loss: [4.7764482]
Parames: 3259
6667
6667
6667
6667
epoch:  0 loss: [0.6093248]
epoch:  1 loss: [1.5039687]
epoch:  2 loss: [6.712