### Keras Neural Network

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import OrderedDict
from sklearn.model_selection import ParameterGrid
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
import re, string, time
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import log_loss

from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras import regularizers
from keras.utils import np_utils
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


Custom functions for loading and preprocessing data.

In [2]:
def most_freq_vects(docs, max_feature=None, percent=None, token_pattern=u'(?u)\b\w\w+\b'):
    vect = CountVectorizer(token_pattern=token_pattern)
    feat_sparse = vect.fit_transform(docs.values.astype('U'))
    freq_table = list(zip(vect.get_feature_names(), np.asarray(feat_sparse.sum(axis=0)).ravel()))
    freq_table = pd.DataFrame(freq_table, columns=['feature', 'count']).sort_values('count', ascending=False)
    if not max_feature:
        if percent:
            max_feature = int(percent * len(vect.get_feature_names()))
        else:
            max_feature = len(vect.get_feature_names())
    feat_df = pd.DataFrame(feat_sparse.todense(), columns=vect.get_feature_names())
    names = list(freq_table.feature[:max_feature])
    return feat_df[names]


def load_data():
    print('Loading features files')
    basic_feat = pd.read_json('../feat_input/basic_feat.json')
    longtime_feat = pd.read_csv('../feat_input/longtime_feat.csv')
    encoded_feat = pd.read_csv('../feat_input/feat_stats_encoding.csv')

    # apply ordinal encoding to categorical feature
    print('Ordinal encoding')
    basic_feat.display_address = basic_feat.display_address.replace(r'\r$', '', regex=True)
    basic_feat.street_address = basic_feat.street_address.replace(r'\r$', '', regex=True)
    categorical = ["display_address", "manager_id", "building_id", "street_address"]
    for f in categorical:
        if basic_feat[f].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(basic_feat[f].values))
            basic_feat[f] = lbl.transform(list(basic_feat[f].values))

    all_feat = basic_feat.merge(longtime_feat, on='listing_id')
    all_feat = all_feat.merge(encoded_feat, on='listing_id')

    print("Features document-term matrix")
    stemmer = SnowballStemmer('english')
    punct = string.punctuation
    punct = re.sub("'|-", "", punct)
    pattern = r"[0-9]|[{}]".format(punct)
    all_feat['features'] = all_feat['features'].apply(lambda x: [re.sub(pattern, "", y) for y in x])
    all_feat['features'] = all_feat['features'].apply(lambda x: [stemmer.stem(y) for y in x])
    all_feat['features'] = all_feat['features'].apply(lambda x: ['_'.join(['feature'] + y.split()) for y in x])
    all_feat['features'] = all_feat['features'].apply(lambda x: ' '.join(x))
    vect_df = most_freq_vects(all_feat['features'], max_feature=100, token_pattern=r"[^ ]+")
    
    all_feat = pd.concat([all_feat, vect_df], axis=1)
    train = all_feat[all_feat.interest_level != -1].copy()
    test = all_feat[all_feat.interest_level == -1].copy()
    y_train=train["interest_level"]

    x_train = train.drop(["interest_level","features"],axis=1)
    x_test = test.drop(["interest_level","features"],axis=1)

    return x_train, y_train, x_test, x_test.columns.values, x_test.listing_id


def _preprocess(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest


def _preprocess_log(dtrain, dtest):
    # replace np.inf to np.nan
    dtrain = dtrain.replace([np.inf, -np.inf], np.nan)
    dtest = dtest.replace([np.inf, -np.inf], np.nan)

    # impute np.nan
    dtrain_col_mean = dtrain.mean(axis=0)
    dtrain, dtest = dtrain.fillna(dtrain_col_mean), dtest.fillna(dtrain_col_mean)

    # log transform of min-zero columns
    dtrain_col_min = dtrain.min(axis=0)
    zero_min_index = dtrain_col_min[dtrain_col_min >= 0].index

    dtrain[zero_min_index] = np.log10(dtrain[zero_min_index] + 1.0)
    dtest[zero_min_index] = np.log10(dtest[zero_min_index] + 1.0)

    # perform standardization
    dtrain_col_mean, dtrain_col_std = dtrain.mean(axis=0), dtrain.std(axis=0)
    dtrain, dtest = map(lambda x: (x - dtrain_col_mean) / dtrain_col_std, (dtrain, dtest))

    return dtrain, dtest

Build layers of nn model using keras.

In [16]:
def nn_create_model(x, reg, class_num):
    model = Sequential()
    model.add(Dense(64, input_dim=x.shape[1], activation='relu', kernel_regularizer=regularizers.l2(reg)))
    model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(reg)))
    model.add(Dense(class_num, activation='softmax', kernel_regularizer=regularizers.l2(reg)))

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

Custom function run model.

In [17]:
def nn_run_model(model, dtrain, dtest, batch_size=64, nb_epochs=20, patience=5):
    if dtest:
        early_stop = EarlyStopping(monitor='val_loss', patience=patience, verbose=0, mode='auto')
        model.fit(dtrain[0], dtrain[1], batch_size=batch_size, epochs=nb_epochs,
                  callbacks=[early_stop], validation_data=dtest, verbose=0)
        y_train_pred, y_test_pred = model.predict(dtrain[0]), model.predict(dtest[0])
        y_train_loss, y_test_loss = log_loss(dtrain[1], y_train_pred), log_loss(dtest[1], y_test_pred)        
        return model, y_train_loss, y_test_loss
    else:
        model.fit(dtrain[0], dtrain[1], batch_size=batch_size, epochs=nb_epochs, verbose=2)
        y_train_pred = model.predict(dtrain[0])
        y_train_loss = log_loss(dtrain[1], y_train_pred)
        return model, y_train_loss

Train model with 5-fold cross-validation, output logloss for both train and validation data from cv.

In [18]:
def neural_network_cv(preprocess='linear', reg=0.005, batch_size=64, nb_epochs=100, patience=5):
    class_num, reg = 3, 0.005
    X_train, y_train_cls, X_test, _, _ = load_data()
    y_train = np_utils.to_categorical(y_train_cls, class_num)
    if preprocess == 'log':
        X_train, X_test = _preprocess_log(X_train, X_test)
    else:
        X_train, X_test = _preprocess(X_train, X_test)
    
    cv_scores, n_folds = [], 5
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=816)
    for i, (train_ind, val_ind) in enumerate(skf.split(X_train, y_train_cls)):
        print("Running Fold", i + 1, "/", n_folds)
        start = time.time()
        
        train_x, val_x = X_train.iloc[train_ind, :].as_matrix(), X_train.iloc[val_ind, :].as_matrix()
        train_y, val_y = y_train[train_ind, :], y_train[val_ind, :]
        clf = nn_create_model(train_x, reg, class_num)
        clf, train_loss, val_loss = nn_run_model(clf, (train_x, train_y), (val_x, val_y), batch_size, nb_epochs, patience)
        cv_scores.append([train_loss, val_loss])
        
        print("train_loss: {0:.6f}, val_loss: {1:.6f}".format(train_loss, val_loss), end="\t")
        
        end = time.time()
        m, s = divmod(end-start, 60)
        h, m = divmod(m, 60)
        print("time elapsed: %d:%02d:%02d" % (h, m, s))
        
    mean_train_loss = np.mean([cv_scores[i][0] for i in range(len(cv_scores))])
    mean_val_loss = np.mean([cv_scores[i][1] for i in range(len(cv_scores))])
    print("train_loss mean: {0:.6f}, val_loss mean: {1:.6f}".format(mean_train_loss, mean_val_loss))

In [20]:
neural_network_cv(preprocess='linear')

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5
train_loss: 0.564195, val_loss: 0.582923	time elapsed: 0:02:50
Running Fold 2 / 5
train_loss: 0.569054, val_loss: 0.570021	time elapsed: 0:02:33
Running Fold 3 / 5
train_loss: 0.564443, val_loss: 0.576351	time elapsed: 0:02:53
Running Fold 4 / 5
train_loss: 0.566299, val_loss: 0.591557	time elapsed: 0:02:49
Running Fold 5 / 5
train_loss: 0.568864, val_loss: 0.582655	time elapsed: 0:01:41
train_loss mean: 0.566571, val_loss mean: 0.580702


In [21]:
neural_network_cv(preprocess='log')

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5
train_loss: 0.540040, val_loss: 0.562592	time elapsed: 0:02:11
Running Fold 2 / 5
train_loss: 0.548724, val_loss: 0.556050	time elapsed: 0:01:32
Running Fold 3 / 5
train_loss: 0.546578, val_loss: 0.560894	time elapsed: 0:01:26
Running Fold 4 / 5
train_loss: 0.539318, val_loss: 0.560809	time elapsed: 0:02:29
Running Fold 5 / 5
train_loss: 0.541070, val_loss: 0.553596	time elapsed: 0:01:48
train_loss mean: 0.543146, val_loss mean: 0.558788


- `acc, loss` is computed on the training batches, `val_acc, val_loss` is calculated on the validation dataset from cv.
- Much better result is obtained, with log(X+1) transformation of the input features. However, the neural network is still less comparable with results from boosting.

Try other input parameters: `reg=0.01, batch_size=640, nb_epochs=5000, patience=20`

In [19]:
neural_network_cv(preprocess='log', reg=0.01, batch_size=640, nb_epochs=5000, patience=20)

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5
train_loss: 0.518329, val_loss: 0.551731	time elapsed: 0:00:31
Running Fold 2 / 5
train_loss: 0.524863, val_loss: 0.538281	time elapsed: 0:00:33
Running Fold 3 / 5
train_loss: 0.522881, val_loss: 0.546620	time elapsed: 0:00:42
Running Fold 4 / 5
train_loss: 0.518309, val_loss: 0.549502	time elapsed: 0:00:57
Running Fold 5 / 5
train_loss: 0.522674, val_loss: 0.542417	time elapsed: 0:01:51
train_loss mean: 0.521411, val_loss mean: 0.545710


In [28]:
neural_network_cv(preprocess='log', reg=0.01, batch_size=6400, nb_epochs=5000, patience=20)

Loading features files
Ordinal encoding
Features document-term matrix
Running Fold 1 / 5
train_loss: 0.505150, val_loss: 0.548612	time elapsed: 0:00:41
Running Fold 2 / 5
train_loss: 0.508658, val_loss: 0.532170	time elapsed: 0:00:54
Running Fold 3 / 5
train_loss: 0.507986, val_loss: 0.542261	time elapsed: 0:01:01
Running Fold 4 / 5
train_loss: 0.506295, val_loss: 0.545356	time elapsed: 0:00:59
Running Fold 5 / 5
train_loss: 0.508341, val_loss: 0.538506	time elapsed: 0:00:55
train_loss mean: 0.507286, val_loss mean: 0.541381


In general, neural network performs worse than xgboost by around 0.03 in logloss.