# DNN classifier

reference data: https://labs.criteo.com/2013/12/download-terabyte-click-logs-2/

In [1]:
import pandas as pd
import numpy as np
import pickle
from os import path
from matplotlib import pyplot as plt
from datetime import datetime

import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import PreprocessingLayer, TextVectorization

### 1. Data batching

#### 1.1. Load dataset

In [2]:
LBL_COLUMN = ['lbl']
CAT_COLUMNS = [f'cat{i}' for i in range(26)]
NUM_COLUMNS = [f'num{i}' for i in range(13)]
COLUMNS = LBL_COLUMN + NUM_COLUMNS + CAT_COLUMNS
FEATURE_COLUMNS = NUM_COLUMNS + CAT_COLUMNS
COLUMN_DEFAULTS = [0]*14 + ['thisisdefault']*26

In [3]:
dataset = tf.data.experimental.make_csv_dataset(
    file_pattern='data/dac/sample_train.txt', 
    batch_size=200,
    num_epochs=1,
    column_defaults=COLUMN_DEFAULTS,
    column_names=COLUMNS, 
    label_name='lbl', 
    field_delim='\t',
    shuffle=True
)\
.shuffle(10, reshuffle_each_iteration=False)

#### 1.2. Conbine the columes

In [4]:
class FeaturesExtraction:
    
    def __init__(self, num_col, feature_type=None):
        self.num_col = num_col
        self.feature_type = feature_type

    def __call__(self, features, labels):
        numeric_features = [features.pop(col) for col in self.num_col]
        numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
        numeric_features = tf.stack(numeric_features, axis=-1)
        
        if self.feature_type == 'cat':
            return features
        if self.feature_type == 'numeric':
            return numeric_features
        if self.feature_type == 'no_label':
            features['numeric'] = numeric_features
            return features
        else:
            features['numeric'] = numeric_features
            return features, labels


In [5]:
packed_dataset = dataset.map(FeaturesExtraction(num_col=NUM_COLUMNS))

#### 1.3. Training / validation split

In [6]:
def is_validate(idx, data):
    return idx % 5 == 0

def is_train(idx, data):
    return not is_validate(idx, data)

recover = lambda idx, data: data

validate_dataset = packed_dataset.enumerate()\
.filter(is_validate)\
.map(recover)

train_dataset = packed_dataset.enumerate()\
.filter(is_train)\
.map(recover)

### 2. Data processing

In [7]:
class DataProcessingLayer(PreprocessingLayer):
    
    def __init__(self, ls_cat_col, num_col, vocabulary=None, **kwargs):
        
        super(DataProcessingLayer, self).__init__(**kwargs)
        self._ls_cat_col = ls_cat_col
        self._num_col = num_col
        self._dict_cat_vocab = vocabulary
        
        self._dict_vectorization_layer = dict()
        for key in ls_cat_col:
            self._dict_vectorization_layer.update({
                key: TextVectorization(output_sequence_length=1)
            })
        self.processing_layer = None
    
    def adapt(self, data):
        
        for cat_col in self._ls_cat_col:
            print(f'adapting col: {cat_col}')
            tmp_dataset = data.map(lambda feature, label: feature.pop(cat_col))
            self._dict_vectorization_layer[cat_col].adapt(tmp_dataset)
        
        self._dict_cat_vocab = dict()
        for cat_col in self._ls_cat_col:
            self._dict_cat_vocab.update({
                cat_col: self._dict_vectorization_layer[cat_col].get_vocabulary()
            })
                
    def build(self, input_shape):
        
        ls_feature_cat_col = list()
        ls_feature_num_col = list()
        
        for cat_col, vocab in self._dict_cat_vocab.items():
            tf_cat_vocab = tf.feature_column.categorical_column_with_vocabulary_list(
                key=cat_col, vocabulary_list=vocab)
            ls_feature_cat_col.append(tf.feature_column.indicator_column(tf_cat_vocab))
        
        tf_feature_num = tf.feature_column.numeric_column(self._num_col, shape=input_shape[self._num_col].as_list()[-1])
        ls_feature_num_col.append(tf_feature_num)
        
        self.processing_layer = tf.keras.layers.DenseFeatures(ls_feature_cat_col + ls_feature_num_col)
        
    def call(self, inputs):
        
        return self.processing_layer(inputs)
    
    def get_config(self):
        
        config = super(DataProcessingLayer, self).get_config()
        config.update({
            'ls_cat_col': self._ls_cat_col,
            'num_col': self._num_col,
            'vocabulary': self._dict_cat_vocab
        })
        
        return config
    
    @classmethod
    def from_config(cls, config):
        
        return cls(**config)
        

In [8]:
def build_data_processing_layer(file_path, dataset, re_build=True):

    if path.exists(file_path) and re_build == False:

        with open(file_path, 'rb') as f:
            serialized_layer = pickle.load(f) 
        layer = tf.keras.layers.deserialize(
            serialized_layer, custom_objects={'DataProcessingLayer': DataProcessingLayer}
        )
        
        print(f'Loaded the saved layer from {file_path}')

        return layer

    else:

        layer = DataProcessingLayer(
            ls_cat_col=CAT_COLUMNS,
            num_col='numeric',
            name='data_processing_layer'
        )
        layer.adapt(data=dataset)
        serialized_layer = tf.keras.layers.serialize(layer)
        with open(file_path, 'wb') as f:
            pickle.dump(serialized_layer, f)
            
        print(f'Saved the layer to {file_path}')
        
        return layer

### 3. ANN Model Building

In [9]:
class ANNModel(tf.keras.Model):
    
    def __init__(self, num_hiden_layer, ls_hiden_unit, re_build_onhot=False, **kwargs):
        
        super(ANNModel, self).__init__(**kwargs)
        
        assert num_hiden_layer == len(ls_hiden_unit), f'num_hiden_layer != len(ls_hiden_unit)'
        self.num_hiden_layer = num_hiden_layer
        
        # processing layer
        self.data_processing_layer = build_data_processing_layer(
            file_path='saved/layer/DataProcessingLayer.pkl',
            dataset=train_dataset,
            re_build=re_build_onhot
        )
        
        # fully connected hiden layers
        self.ls_hiden_layer = list()
        for i in range(num_hiden_layer):
            self.ls_hiden_layer.append(
                tf.keras.layers.Dense(
                    ls_hiden_unit[i], 
                    activation='relu', 
                    name=f'hiden_layer_{i}'))
            
        # output layer
        self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output_layer')
        
    def call(self, inputs):
        
        x = self.data_processing_layer(inputs)
        for i in range(self.num_hiden_layer):
            x = self.ls_hiden_layer[i](x)
        return self.output_layer(x)
    
    def train_step(self, data):
        
        # Unpack the data
        x, y = data

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True) 
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}
    
    def test_step(self, data):
        
        # Unpack the data
        x, y = data
        
        # Compute predictions
        y_pred = self(x, training=False)
        
        # Updates the metrics tracking the loss
        self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        
        # Update the metrics.
        self.compiled_metrics.update_state(y, y_pred)
        
        # Return a dict mapping metric names to current value.
        # Note that it will include the loss (tracked in self.metrics).
        return {m.name: m.result() for m in self.metrics}

In [10]:
ann_model = ANNModel(
    num_hiden_layer=2,
    ls_hiden_unit=[128,128],
    name='binary_classifer'
)

METRICS = [
      tf.keras.metrics.TruePositives(name='tp'),
      tf.keras.metrics.FalsePositives(name='fp'),
      tf.keras.metrics.TrueNegatives(name='tn'),
      tf.keras.metrics.FalseNegatives(name='fn'), 
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall'),
      tf.keras.metrics.AUC(name='auc'),
]

ann_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=METRICS)

Loaded the saved layer from saved/layer/DataProcessingLayer.pkl


### 5. Training

In [11]:
%rm -rf saved/logs

In [12]:
checkpoint_path = 'saved/model/checkpoint'
checkpoint_dir = path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=0)

# Create a callback for TensorBoard
logdir = "saved/logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

In [13]:
model_history = ann_model.fit(
    train_dataset, 
    validation_data=validate_dataset, 
    callbacks=[cp_callback, tensorboard_callback],
    epochs=20,
    verbose=0,
    workers=4)

print(f'Average test loss: {np.average(model_history.history["loss"])})')

Average test loss: 0.6707712322473526)


In [14]:
ann_model.summary()

Model: "binary_classifer"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
data_processing_layer (DataP multiple                  0         
_________________________________________________________________
hiden_layer_0 (Dense)        multiple                  271872    
_________________________________________________________________
hiden_layer_1 (Dense)        multiple                  16512     
_________________________________________________________________
output_layer (Dense)         multiple                  129       
Total params: 288,513
Trainable params: 288,513
Non-trainable params: 0
_________________________________________________________________


### 6. Evaluation

#### 6.1 Load testing dataset

In [15]:
testing_dataset = tf.data.experimental.make_csv_dataset(
    file_pattern='data/dac/sample_test.txt', 
    batch_size=200,
    num_epochs=1,
    column_defaults=COLUMN_DEFAULTS,
    column_names=COLUMNS, 
    label_name='lbl', 
    field_delim='\t',
    shuffle=True
).shuffle(10, reshuffle_each_iteration=False)

In [16]:
testing_packed_dataset = testing_dataset.map(FeaturesExtraction(num_col=NUM_COLUMNS))

In [17]:
ann_model.evaluate(testing_packed_dataset, return_dict=True, verbose=0)

{'loss': 0.6679402589797974,
 'tp': 793.0,
 'fp': 0.0,
 'tn': 9513.0,
 'fn': 1645.0,
 'accuracy': 0.8623546361923218,
 'precision': 1.0,
 'recall': 0.32526659965515137,
 'auc': 0.6626332998275757}