## Collect Model Output Data and Generate Features

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
product_df = pd.read_csv('../../data/processed/product_data.csv')
# Remove floats
product_df = product_df.loc[product_df['product_name'].apply(lambda x: isinstance(x, str)),:]
product_df = product_df.loc[product_df['is_ordered_history'].apply(lambda x: isinstance(x, str)),:]
product_df = product_df[["user_id", "product_id", "label"]] # usecols=['user_id', 'product_id', 'label']

In [3]:
# Check Data Shape
product_df.shape
# Check Label type
label_col = product_df['label']
label_set = set(label_col)
label_count = {i: sum(label_col == i) for i in label_set}
print(label_count)

{0: 7829, 1: 1223, -1: 262629}


In [4]:
products = pd.read_csv('../../data/raw/products.csv')
product_df = product_df.merge(products, how='left', on='product_id')

orders = pd.read_csv('../../data/raw/orders.csv')
orders = orders[orders['eval_set'].isin({'train', 'test'})]
product_df = product_df.merge(orders[['user_id', 'order_id']], how='left', on='user_id').reset_index(drop=True)
product_df['is_none'] = (product_df['product_id'] == 0).astype(int)

In [5]:
# nn feature representations
prefix = 'rnn_product'
h_df = pd.DataFrame(np.load('../rnn_product/predictions/final_states.npy')).add_prefix('{}_h'.format(prefix))
h_df['user_id'] = np.load('../rnn_product/predictions/user_ids.npy')
h_df['product_id'] = np.load('../rnn_product/predictions/product_ids.npy')
product_df = product_df.merge(h_df, how='left', on=['user_id', 'product_id'])

In [6]:
drop_cols = [
    'label',
    'user_id',
    'product_id',
    'order_id',
    'product_name',
    'aisle_id',
    'department_id',
]
user_id = product_df['user_id']
product_id = product_df['product_id']
order_id = product_df['order_id']
label = product_df['label']

product_df.drop(drop_cols, axis=1, inplace=True)
features = product_df.values
feature_names = product_df.columns.values
feature_maxs = features.max(axis=0)
feature_mins = features.min(axis=0)
feature_means = features.mean(axis=0)

In [7]:
# Save features
if not os.path.isdir('data'):
    os.makedirs('data')

np.save('data/user_id.npy', user_id)
np.save('data/product_id.npy', product_id)
np.save('data/order_id.npy', order_id)
np.save('data/features.npy', features)
np.save('data/feature_names.npy', product_df.columns)
np.save('data/feature_maxs.npy', feature_maxs)
np.save('data/feature_mins.npy', feature_mins)
np.save('data/feature_means.npy', feature_means)
np.save('data/label.npy', label)

## NN Blend 

In [9]:
import os

import numpy as np
import tensorflow as tf

import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

from data_frame import DataFrame
from tf_base_model import TFBaseModel
from tf_utils import dense_layer, log_loss


class DataReader(object):

    def __init__(self, data_dir):
        data_cols = [
            'order_id',
            'product_id',
            'features',
            'label'
        ]
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]
        df = DataFrame(columns=data_cols, data=data)
        self.data_dim = df['features'].shape[1]

        print(df.shapes())
        print('loaded data')

        # Since we don't have the true label for Kaggle test dataset
        # we generate the test dataset by splitting the training dataset
        ####################
        df = df.mask(df['label'] != -1)
        self.train_val_df, self.test_df = df.train_test_split(train_size=0.8)
        self.train_df, self.val_df = self.train_val_df.train_test_split(train_size=0.9)
        ####################

        print( 'train size', len(self.train_df))
        print( 'val size', len(self.val_df))
        print( 'test size', len(self.test_df))

        self.feature_means = np.load(os.path.join(data_dir, 'feature_means.npy'))
        self.feature_maxs = np.load(os.path.join(data_dir, 'feature_maxs.npy'))
        self.feature_mins = np.load(os.path.join(data_dir, 'feature_mins.npy'))

    def train_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def val_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=10000,
            is_test=False
        )

    def test_batch_generator(self, batch_size):
        return self.batch_generator(
            batch_size=batch_size,
            df=self.test_df,
            shuffle=False,
            num_epochs=1,
            is_test=True
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        batch_gen = df.batch_generator(batch_size, shuffle=shuffle, num_epochs=num_epochs, allow_smaller_final_batch=is_test)
        for batch in batch_gen:
            batch['features'] = np.nan_to_num((batch['features'] - self.feature_means) / (self.feature_maxs - self.feature_mins))
            yield batch

In [10]:
class nn(TFBaseModel):

    def __init__(self, hidden_units=500, **kwargs):
        self.hidden_units = hidden_units
        super(nn, self).__init__(**kwargs)

    def calculate_loss(self):
        self.order_id = tf.placeholder(tf.int32, [None])
        self.product_id = tf.placeholder(tf.int32, [None])
        self.features = tf.placeholder(tf.float32, [None, self.reader.data_dim])
        self.label = tf.placeholder(tf.int32, [None])

        h = dense_layer(self.features, self.hidden_units, activation=tf.nn.relu, scope='dense1')
        h = tf.concat([h, self.features], axis=1)
        y_hat = tf.squeeze(dense_layer(h, 1, activation=tf.nn.sigmoid, scope='dense2'), 1)
        loss = log_loss(self.label, y_hat)

        self.prediction_tensors = {
            'order_ids': self.order_id,
            'product_ids': self.product_id,
            'predictions': y_hat,
            'labels': self.label
        }

        return loss

In [11]:
base_dir = './'

dr = DataReader(data_dir=os.path.join(base_dir, 'data'))

# A tiny one for this small dataset
nn = nn(
    reader=dr,
    log_dir=os.path.join(base_dir, 'logs_nn'),
    checkpoint_dir=os.path.join(base_dir, 'checkpoints_nn'),
    prediction_dir=os.path.join(base_dir, 'predictions_nn'),
    optimizer='adam',
    learning_rate=.005,
    hidden_units=64,
    batch_size=128,
    num_training_steps=2000,
    early_stopping_steps=300,
    warm_start_init_step=0,
    regularization_constant=0.0,
    keep_prob=1.0,
    enable_parameter_averaging=False,
    num_restarts=0,
    min_steps_to_checkpoint=100,
    log_interval=20,
    num_validation_batches=2,
)
nn.fit()
nn.restore()
nn.predict()


new run with parameters:
{'batch_size': 128,
 'checkpoint_dir': './checkpoints_nn',
 'early_stopping_steps': 300,
 'enable_parameter_averaging': False,
 'grad_clip': 5,
 'hidden_units': 64,
 'keep_prob_scalar': 1.0,
 'learning_rate': 0.005,
 'log_dir': './logs_nn',
 'log_interval': 20,
 'loss_averaging_window': 100,
 'min_steps_to_checkpoint': 100,
 'num_restarts': 0,
 'num_training_steps': 2000,
 'num_validation_batches': 2,
 'optimizer': 'adam',
 'prediction_dir': './predictions_nn',
 'reader': <__main__.DataReader object at 0x2aedb8ee3940>,
 'regularization_constant': 0.0,
 'warm_start_init_step': 0}


order_id         (289024,)
product_id       (289024,)
features      (289024, 51)
label            (289024,)
dtype: object
loaded data
train size 7024
val size 781
test size 1952


all parameters:
[('dense1/weights:0', [51, 64]),
 ('dense1/biases:0', [64]),
 ('dense2/weights:0', [115, 1]),
 ('dense2/biases:0', [1]),
 ('Variable:0', []),
 ('Variable_1:0', []),
 ('beta1_power:0', []),
 ('beta2_power:0', []),
 ('dense1/weights/Adam:0', [51, 64]),
 ('dense1/weights/Adam_1:0', [51, 64]),
 ('dense1/biases/Adam:0', [64]),
 ('dense1/biases/Adam_1:0', [64]),
 ('dense2/weights/Adam:0', [115, 1]),
 ('dense2/weights/Adam_1:0', [115, 1]),
 ('dense2/biases/Adam:0', [1]),
 ('dense2/biases/Adam_1:0', [1])]
trainable parameters:
[('dense1/weights:0', [51, 64]),
 ('dense1/biases:0', [64]),
 ('dense2/weights:0', [115, 1]),
 ('dense2/biases:0', [1])]
trainable parameter count:
3444
[[step        0]]     [[train]]     loss: 0.69173574       [[val]]     loss: 0.69281209       
[[step       20]]     [[train]]     loss: 0.59147092       [[val]]     loss: 0.59358677       
[[step       40]]     [[train]]     loss: 0.48803423       [[val]]     loss: 0.49711335       
[[step       60]]    

built graph


[[step      100]]     [[train]]     loss: 0.3528045        [[val]]     loss: 0.36534492       
[[step      120]]     [[train]]     loss: 0.28968921       [[val]]     loss: 0.30163035       
saving model to ./checkpoints_nn/model
[[step      140]]     [[train]]     loss: 0.26592089       [[val]]     loss: 0.27718664       
saving model to ./checkpoints_nn/model
[[step      160]]     [[train]]     loss: 0.25677417       [[val]]     loss: 0.27325503       
saving model to ./checkpoints_nn/model
[[step      180]]     [[train]]     loss: 0.25658396       [[val]]     loss: 0.27246476       
saving model to ./checkpoints_nn/model
[[step      200]]     [[train]]     loss: 0.25885595       [[val]]     loss: 0.27242188       
saving model to ./checkpoints_nn/model
[[step      220]]     [[train]]     loss: 0.25665928       [[val]]     loss: 0.27326019       
[[step      240]]     [[train]]     loss: 0.25233751       [[val]]     loss: 0.27257384       
[[step      260]]     [[train]]     loss: 0.2

INFO:tensorflow:Restoring parameters from ./checkpoints_nn/model-320


saving order_ids with shape (1952,) to ./predictions_nn/order_ids.npy
saving product_ids with shape (1952,) to ./predictions_nn/product_ids.npy
saving predictions with shape (1952,) to ./predictions_nn/predictions.npy
saving labels with shape (1952,) to ./predictions_nn/labels.npy


0


## GBM Blend

In [13]:
import gc
import os
import pprint as pp

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [14]:
# Data Preprocessing
order_id = np.load('data/order_id.npy')
product_id = np.load('data/product_id.npy')
features = np.load('data/features.npy')
feature_names = np.load('data/feature_names.npy', allow_pickle=True)
label = np.load('data/label.npy')

product_df = pd.DataFrame(data=features, columns=feature_names)
product_df['order_id'] = order_id
product_df['product_id'] = product_id
product_df['label'] = label

del order_id, product_id, features, feature_names, label
gc.collect()

drop_cols = [i for i in product_df.columns if i.startswith('sgns') or i.startswith('nnmf')]
drop_cols += ['order_id', 'product_id', 'label']

# training

#self.train_val_df, self.test_df = df.train_test_split(train_size=0.8)
#self.train_df, self.val_df = self.train_val_df.train_test_split(train_size=0.9)
df = product_df[product_df['label'] != -1]
train_val_df, test_df = train_test_split(df, train_size=0.8)
train_df, val_df = train_test_split(train_val_df, train_size=0.9)
del product_df
gc.collect()

Y_train, Y_val = train_df['label'].astype(int).astype(float), val_df['label'].astype(int).astype(float)
X_train, X_val = train_df.drop(drop_cols, axis=1), val_df.drop(drop_cols, axis=1)
del train_df
gc.collect()

test_orders = test_df['order_id']
test_products = test_df['product_id']
test_labels = test_df['label']
X_test = test_df.drop(drop_cols, axis=1)
del test_df
gc.collect()

0

In [15]:
# Training
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'learning_rate': .01,
    'num_leaves': 32,
    'max_depth': 6,
    'feature_fraction': 0.35,
    'bagging_fraction': 0.5,
    'bagging_freq': 2,
    'early_stopping_round': 500
}
rounds = 15000
d_train = lgb.Dataset(X_train, label=Y_train, silent=True)
d_valid = lgb.Dataset(X_val, label=Y_val, silent=True)
del X_train, X_val, Y_train, Y_val

valid_sets = [d_train, d_valid]
valid_names = ['train', 'valid']
gbdt = lgb.train(params, d_train, rounds, valid_sets=valid_sets, valid_names=valid_names, verbose_eval=20)



[LightGBM] [Info] Number of positive: 877, number of negative: 6147
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8451
[LightGBM] [Info] Number of data points in the train set: 7024, number of used features: 51
Training until validation scores don't improve for 500 rounds
[20]	train's binary_logloss: 0.336001	valid's binary_logloss: 0.346389
[40]	train's binary_logloss: 0.310622	valid's binary_logloss: 0.324106
[60]	train's binary_logloss: 0.291408	valid's binary_logloss: 0.307751
[80]	train's binary_logloss: 0.276505	valid's binary_logloss: 0.295736
[100]	train's binary_logloss: 0.26455	valid's binary_logloss: 0.286133
[120]	train's binary_logloss: 0.255257	valid's binary_logloss: 0.27897
[140]	train's binary_logloss: 0.247238	valid's binary_logloss: 0.273076
[160]	train's binary_logloss: 0.240503	valid's binary_logloss: 0.268895
[180]	train's binary_logloss: 0.234675	valid's binar

In [16]:
# Evaluate and Predict
features = gbdt.feature_name()
importance = list(gbdt.feature_importance())
importance = zip(features, importance)
importance = sorted(importance, key=lambda x: x[1])
total = sum(j for i, j in importance)
importance = [(i, float(j)/total) for i, j in importance]
pp.pprint(importance)

test_preds = gbdt.predict(X_test, num_iteration=gbdt.best_iteration)

dirname = 'predictions_gbm'
if not os.path.isdir(dirname):
    os.makedirs(dirname)

np.save(os.path.join(dirname, 'order_ids.npy'), test_orders)
np.save(os.path.join(dirname, 'product_ids.npy'), test_products)
np.save(os.path.join(dirname, 'predictions.npy'), test_preds)
np.save(os.path.join(dirname, 'labels.npy'), test_labels)

[('rnn_product_h7', 0.0),
 ('rnn_product_h9', 0.0),
 ('rnn_product_h10', 0.0),
 ('rnn_product_h12', 0.0),
 ('rnn_product_h17', 0.0),
 ('rnn_product_h18', 0.0),
 ('rnn_product_h21', 0.0),
 ('rnn_product_h28', 0.0),
 ('rnn_product_h30', 0.0),
 ('rnn_product_h33', 0.0),
 ('rnn_product_h36', 0.0),
 ('rnn_product_h39', 0.0),
 ('rnn_product_h47', 0.0),
 ('rnn_product_h48', 0.0),
 ('rnn_product_h49', 0.0),
 ('rnn_product_h5', 0.00018249285236328244),
 ('rnn_product_h0', 0.0012774499665429771),
 ('is_none', 0.012348683009915444),
 ('rnn_product_h44', 0.02238578988989598),
 ('rnn_product_h2', 0.023054930348561348),
 ('rnn_product_h6', 0.02311576129934911),
 ('rnn_product_h34', 0.02311576129934911),
 ('rnn_product_h15', 0.024940689822981935),
 ('rnn_product_h46', 0.024940689822981935),
 ('rnn_product_h29', 0.02694811119897804),
 ('rnn_product_h31', 0.027252265952916844),
 ('rnn_product_h1', 0.02755642070685565),
 ('rnn_product_h35', 0.027982237362369974),
 ('rnn_product_h14', 0.02804306831315773

## F-score Evaluation

In [17]:
from multiprocessing import Pool, cpu_count
import numpy as np
import pandas as pd

In [18]:
# Load Data
gbm_df = pd.DataFrame({
    'order_id': np.load('predictions_gbm/order_ids.npy'),
    'product_id': np.load('predictions_gbm/product_ids.npy'),
    'prediction_gbm': np.load('predictions_gbm/predictions.npy'),
    'label': np.load('predictions_gbm/labels.npy')
})

nn_df = pd.DataFrame({
    'order_id': np.load('predictions_nn/order_ids.npy'),
    'product_id': np.load('predictions_nn/product_ids.npy'),
    'prediction_nn': np.load('predictions_nn/predictions.npy'),
})

In [19]:
# Combine the prediction
# pred_df = gbm_df.merge(nn_df, how='left', on=['order_id', 'product_id'])
# pred_df['prediction'] = .9*pred_df['prediction_gbm'] + .1*pred_df['prediction_nn']

In [20]:
# Calculate F-score
def cal_f_value(pred, label, model):
    TP = sum((label==1) & (pred==1))
    FP = sum((pred==1) & (label==0))
    FN = sum((pred==0) & (label==1))
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    F_value = 2 / (1/recall + 1/precision)
    summary = [model, recall, precision, F_value]
    summary_name = ["model", "recall", "precision", "F_value"]
    return(pd.DataFrame(dict(zip(summary_name, summary)), index=[0]))

true_label = gbm_df['label']
# gbm
pred_gbm = (gbm_df['prediction_gbm'] > 0.5).astype(int)
print(cal_f_value(pred_gbm, true_label, "GBM"))
# nn
pred_nn = (nn_df['prediction_nn'] > 0.5).astype(int)
print(cal_f_value(pred_nn, true_label, "NN"))

  model    recall  precision   F_value
0   GBM  0.175781   0.555556  0.267062


ZeroDivisionError: division by zero

In [22]:
pred_nn.unique # All Zeros

<bound method Series.unique of 0       0
1       0
2       0
3       0
4       0
       ..
1947    0
1948    0
1949    0
1950    0
1951    0
Name: prediction_nn, Length: 1952, dtype: int64>