# DNN classifier

reference data: https://labs.criteo.com/2013/12/download-terabyte-click-logs-2/

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import PreprocessingLayer, TextVectorization

### 1. Data batching

#### 1.1. Load dataset

In [2]:
# Column names: one label, 13 numeric features, 26 categorical features.
LBL_COLUMN = ['lbl']
NUM_COLUMNS = ['num%d' % idx for idx in range(13)]
CAT_COLUMNS = ['cat%d' % idx for idx in range(26)]
FEATURE_COLUMNS = NUM_COLUMNS + CAT_COLUMNS
COLUMNS = LBL_COLUMN + FEATURE_COLUMNS
# Per-column defaults, in COLUMNS order: 0 for the label and numeric columns,
# a sentinel string for the categorical ones.
COLUMN_DEFAULTS = (
    [0] * (len(LBL_COLUMN) + len(NUM_COLUMNS))
    + ['thisisdefault'] * len(CAT_COLUMNS)
)

In [3]:
# Stream the tab-separated sample file as batches of (features dict, label).
dataset = tf.data.experimental.make_csv_dataset(
    file_pattern='data/dac/sample.txt',
    batch_size=20,
    num_epochs=1,
    column_defaults=COLUMN_DEFAULTS,
    column_names=COLUMNS,
    label_name='lbl',
    field_delim='\t',
    # `header` defaults to True, which makes the reader discard the first line
    # of the file as a header row. Column names are supplied explicitly above,
    # so the file is assumed to start directly with data -- TODO confirm that
    # data/dac/sample.txt has no header line.
    header=False,
)

#### 1.2. Combine the columns

In [4]:
class FeaturesExtraction:
    """Callable that stacks the numeric columns of a batch into one tensor.

    Written to be passed to ``Dataset.map`` on a dataset that yields
    ``(features, labels)``, where ``features`` is a dict of column tensors.

    Args:
        num_col: Names of the columns to cast to float32 and stack.
        feature_type: Selects what ``__call__`` returns:
            ``'cat'`` -> only the columns not listed in ``num_col``;
            ``'numeric'`` -> only the stacked numeric tensor;
            anything else (default ``None``) -> ``(features, labels)`` with
            the stacked tensor stored under the ``'numeric'`` key.
    """

    def __init__(self, num_col, feature_type=None):
        self.num_col = num_col
        self.feature_type = feature_type

    def __call__(self, features, labels):
        """Split ``features`` into numeric and remaining columns.

        Args:
            features: Dict mapping column name to tensor; must contain every
                column named in ``num_col``.
            labels: Label tensor; passed through untouched in the default mode
                and dropped in the ``'cat'`` / ``'numeric'`` modes.

        Returns:
            See ``feature_type`` in the class docstring.
        """
        # Work on a shallow copy so the caller's mapping is never mutated.
        remaining = features.copy()
        numeric_features = tf.stack(
            [tf.cast(remaining.pop(col), tf.float32) for col in self.num_col],
            axis=-1,
        )

        if self.feature_type == 'cat':
            return remaining
        if self.feature_type == 'numeric':
            return numeric_features

        remaining['numeric'] = numeric_features
        return remaining, labels


In [4]:
# Replace the separate numeric columns with a single stacked 'numeric' feature.
packed_dataset = dataset.map(map_func=FeaturesExtraction(num_col=NUM_COLUMNS))

### 2. Data processing

In [5]:
class DataProcessingLayer(PreprocessingLayer):
    """Turn a dict of raw feature tensors into a single dense feature tensor.

    Each categorical column gets its own ``TextVectorization`` layer, which is
    used to learn that column's vocabulary during ``adapt``. When the layer is
    built, those vocabularies become indicator feature columns and are combined
    with one numeric feature column inside a ``DenseFeatures`` layer.

    Args:
        ls_cat_col: Names of the categorical input columns.
        num_col: Key of the numeric input tensor in the input dict.
        name: Optional layer name.
    """

    def __init__(self, ls_cat_col, num_col, name=None):
        super(DataProcessingLayer, self).__init__(name=name)
        self._ls_cat_col = ls_cat_col
        self._num_col = num_col
        # One vocabulary learner per categorical column.
        self._dict_vectorization_layer = {
            key: TextVectorization(output_sequence_length=1)
            for key in ls_cat_col
        }
        # Created in `build`, once vocabularies and input shapes are known.
        self.processing_layer = None

    def adapt(self, data):
        """Learn the vocabulary of every categorical column.

        Args:
            data: Dataset yielding ``(features, label)`` pairs, where
                ``features`` is a dict containing every categorical column.
        """
        for cat_col in self._ls_cat_col:
            # Plain lookup instead of `pop`: only the column is needed, and the
            # feature dict handed to the map function is left untouched.
            tmp_dataset = data.map(lambda feature, label: feature[cat_col])
            self._dict_vectorization_layer[cat_col].adapt(tmp_dataset)

    def build(self, input_shape):
        """Create the ``DenseFeatures`` layer from the learned vocabularies.

        Args:
            input_shape: Dict of input shapes; the entry under ``num_col``
                supplies the width of the numeric feature column.
        """
        ls_feature_cat_col = []
        for cat_col, vectorizer in self._dict_vectorization_layer.items():
            tf_cat_vocab = tf.feature_column.categorical_column_with_vocabulary_list(
                key=cat_col, vocabulary_list=vectorizer.get_vocabulary())
            ls_feature_cat_col.append(tf.feature_column.indicator_column(tf_cat_vocab))

        # Last dimension of the numeric input is the number of stacked columns.
        numeric_width = input_shape[self._num_col].as_list()[-1]
        ls_feature_num_col = [
            tf.feature_column.numeric_column(self._num_col, shape=numeric_width)
        ]

        self.processing_layer = tf.keras.layers.DenseFeatures(
            ls_feature_cat_col + ls_feature_num_col)

    def call(self, inputs):
        """Apply the ``DenseFeatures`` layer created in ``build`` to ``inputs``."""
        return self.processing_layer(inputs)
        

In [6]:
# The layer reads the categorical columns by name and the numeric tensor
# under the 'numeric' key.
data_processing_layer = DataProcessingLayer(
    CAT_COLUMNS, 'numeric', name='data_processing_layer')

In [7]:
# Learn the per-column vocabularies from the packed dataset.
data_processing_layer.adapt(packed_dataset)

### 3. ANN Model Building

In [8]:
# One symbolic input per feature, keyed by feature name.
ann_model_input = {
    'numeric': tf.keras.Input(
        (len(NUM_COLUMNS),), dtype=tf.dtypes.float32, name='numeric_input')
}
for cat_column in CAT_COLUMNS:
    ann_model_input[cat_column] = tf.keras.Input(
        (1,), dtype=tf.dtypes.string, name=f'{cat_column}_input')

x = data_processing_layer(ann_model_input)
x = tf.keras.layers.Dense(128, activation="relu", name='hiden_layer_1')(x)
x = tf.keras.layers.Dense(128, activation="relu", name='hiden_layer_2')(x)
# Sigmoid output: the model emits probabilities, not logits.
ann_model_output = tf.keras.layers.Dense(1, activation='sigmoid', name='output_layer')(x)

ann_model = tf.keras.Model(
    inputs=ann_model_input,
    outputs=ann_model_output,
    name='binary_classifer')

ann_model.compile(
    # The output layer already applies a sigmoid, so the loss must treat the
    # predictions as probabilities. `from_logits=True` would interpret those
    # probabilities as logits and distort the loss.
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy'])

In [9]:
# Print each layer with its output shape, parameter count, and connections.
ann_model.summary()

Model: "binary_classifer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
cat0_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
cat1_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
cat10_input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
cat11_input (InputLayer)        [(None, 1)]          0                                            
___________________________________________________________________________________

### 4. Training

In [10]:
# Train for 20 epochs on the packed dataset; the call's return value is the cell output.
ann_model.fit(x=packed_dataset, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x13d9b5ac8>