# Adding a new model to the Data Labeler

In this example, we define a new model to be used with the Data Labeler component of the Data Profiler. In particular, a character-level LSTM model is implemented, then integrated into the DataLabeler pipeline to be trained with a tabular dataset.

First, let's import the libraries needed for this example.

In [None]:
import os
import sys
import json
import pandas as pd
sys.path.insert(0, '..')
import dataprofiler as dp

## Dataset

We use the AWS honeypot dataset from the test folder for this example. First, let's look at the data using the Data Reader class of the Data Profiler.

In [None]:
# read the csv through the Data Profiler's reader; the parsed content is
# exposed on the `.data` attribute
data_path = "../dataprofiler/tests/data/csv/aws_honeypot_marx_geo.csv"
data = dp.Data(data_path)
df_data = data.data
# preview the first few rows
df_data.head()

In [None]:
# the column 'comment' is changed to UNKNOWN, as the data labeler requires at least one column with label UNKNOWN
df = data.data.rename({'comment': 'UNKNOWN'}, axis=1)

# shuffle the rows, then split them into training and test sets
split_ratio = 0.2  # fraction of the rows held out for testing
df = df.sample(frac=1).reset_index(drop=True)
split_index = int((1 - split_ratio) * len(df))  # first row of the test set
data_train = df[:split_index]
data_test = df[split_index:]

# make sure the output directory 'data_labeler_saved' exists; exist_ok avoids
# the separate existence check and is a no-op when it is already there
os.makedirs('data_labeler_saved', exist_ok=True)

## Implement a new LSTM model

This model inherits from the CharacterLevelCnnModel class, with modifications to the following functions:

`__init__`: adds new parameters for the LSTM model

`_validate_parameters`: checks the new parameters for the LSTM model

`_construct_model`: constructs the new architecture for the LSTM model

In [None]:
import tensorflow as tf
import numpy as np
from dataprofiler.labelers.character_level_cnn_model import CharacterLevelCnnModel, F1Score, \
                                                            create_glove_char, build_embd_dictionary
from dataprofiler.labelers.base_model import BaseModel

class CharacterLevelLstmModel(CharacterLevelCnnModel):
    """
    Character-level LSTM model for the Data Labeler.

    Inherits from CharacterLevelCnnModel and overrides `__init__`,
    `_validate_parameters` and `_construct_model` so that the network is
    built from LSTM layers and accepts the LSTM-specific parameters.
    """

    # boolean if the label mapping requires the mapping for index 0 reserved
    requires_zero_mapping = True

    def __init__(self, label_mapping=None, parameters=None):
        """
        LSTM Model Initializer. initialize epoch_id

        :param label_mapping: label mapping, forwarded unchanged to
            BaseModel.__init__
        :param parameters: optional dict of model parameters. Keys that are
            missing are filled with the defaults listed below; 'pad_label'
            is always set to 'PAD'. The dict passed in is copied and never
            modified.
        :return: None
        """

        # parameter initialization; work on a copy so that filling in the
        # defaults does not mutate the caller's dict
        parameters = dict(parameters) if parameters else {}
        parameters.setdefault('max_length', 3400)
        parameters.setdefault('max_char_encoding_id', 127)
        parameters.setdefault('dim_embed', 64)
        parameters.setdefault('size_fc', [32, 32])
        parameters.setdefault('dropout', 0.1)
        parameters.setdefault('size_lstm', [64])
        parameters.setdefault('rec_dropout', 0.1)
        parameters.setdefault('activation', "tanh")
        parameters.setdefault('recurrent_activation', "sigmoid")
        parameters.setdefault('default_label', "UNKNOWN")
        parameters['pad_label'] = 'PAD'
        self._epoch_id = 0

        # reconstruct flags for model
        self._model_num_labels = 0
        self._model_default_ind = -1

        BaseModel.__init__(self, label_mapping, parameters)

    def _validate_parameters(self, parameters):
        """
        Validate the parameters sent in. Raise error if invalid parameters are
        present.

        Only the keys present in `parameters` are checked; absent keys are
        not reported.

        :param parameters: dict mapping parameter name to value
        :raises ValueError: if any value is invalid or any key is not an
            accepted parameter; the message lists one problem per line
        :return: None
        """
        errors = []
        list_of_necessary_params = ['max_length', 'max_char_encoding_id',
                                    'dim_embed', 'size_fc', 'dropout',
                                    'size_lstm', 'rec_dropout', 'activation',
                                    'recurrent_activation', 'default_label',
                                    'pad_label']
        # Make sure the parameters that are present are valid.
        for param in parameters:
            value = parameters[param]
            if param in ['max_length', 'max_char_encoding_id', 'dim_embed']:
                # NOTE(review): 0 passes this check although the message
                # says "greater than 0" -- confirm which one is intended.
                if not isinstance(value, (int, float)) or value < 0:
                    errors.append(param + " must be a valid integer or float "
                                          "greater than 0.")
            elif param in ['dropout', 'rec_dropout']:
                if not isinstance(value, (int, float)) \
                        or value < 0 or value > 1:
                    errors.append(param + " must be a valid integer or float "
                                          "from 0 to 1.")
            elif param in ['size_fc', 'size_lstm']:
                # layer sizes: a non-empty list containing only ints
                if not isinstance(value, list) or len(value) == 0 \
                        or not all(isinstance(item, int) for item in value):
                    errors.append(param + " must be a non-empty list of "
                                          "integers.")
            elif param in ['default_label', 'pad_label', 'activation',
                           'recurrent_activation']:
                if not isinstance(value, str):
                    errors.append(str(param) + " must be a string.")

        # Error if there are extra parameters thrown in
        for param in parameters:
            if param not in list_of_necessary_params:
                errors.append(param + " is not an accepted parameter.")
        if errors:
            raise ValueError('\n'.join(errors))

    def _construct_model(self):
        """
        Model constructor for the data labeler. This also serves as a weight
        reset.

        Builds, in order: a string input, the character encoding layer, an
        embedding layer, the LSTM layers ('size_lstm'), the fully connected
        layers ('size_fc') and a softmax layer over the labels. The final
        model has three outputs: the softmax scores, their argmax, and the
        output of the inherited argmax threshold layer. The compiled model
        is stored in `self._model`.

        :return: None
        """
        num_labels = self.num_labels
        default_ind = self.label_mapping[self._parameters['default_label']]

        # Reset model
        tf.keras.backend.clear_session()

        # generate glove embedding
        create_glove_char(self._parameters['dim_embed'])

        # generate model
        self._model = tf.keras.models.Sequential()

        # default parameters
        max_length = self._parameters['max_length']
        max_char_encoding_id = self._parameters['max_char_encoding_id']

        # Encoding layer: delegates to the inherited _char_encoding_layer
        def encoding_function(input_str):
            char_in_vector = CharacterLevelLstmModel._char_encoding_layer(
                input_str, max_char_encoding_id, max_length)
            return char_in_vector

        self._model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.string))

        self._model.add(
            tf.keras.layers.Lambda(encoding_function,
                                   output_shape=tuple([max_length])))

        # Create a pre-trained weight matrix
        # character encoding indices range from 0 to max_char_encoding_id,
        # we add one extra index for out-of-vocabulary character
        # NOTE(review): this path is relative to the current working
        # directory -- presumably the notebook's folder; confirm before
        # running from elsewhere.
        embed_file = os.path.join(
            "../dataprofiler/labelers", "embeddings/glove-reduced-{}D.txt".format(
                self._parameters['dim_embed']))
        embedding_matrix = np.zeros((max_char_encoding_id + 2,
                                     self._parameters['dim_embed']))
        embedding_dict = build_embd_dictionary(embed_file)

        input_shape = tuple([max_length])
        # Fill in the weight matrix: let pad and space be 0s
        # (row 0 is left as zeros; character c is stored at row ord(c) + 1)
        for ascii_num in range(max_char_encoding_id):
            if chr(ascii_num) in embedding_dict:
                embedding_matrix[ascii_num + 1] = embedding_dict[chr(ascii_num)]

        self._model.add(tf.keras.layers.Embedding(
            max_char_encoding_id + 2,
            self._parameters['dim_embed'],
            weights=[embedding_matrix],
            input_length=input_shape[0],
            trainable=True))

        # Add the lstm layers; return_sequences=True keeps one output per
        # input position, so the layers below produce a score per character
        for size in self._parameters['size_lstm']:
            self._model.add(
                tf.keras.layers.LSTM(units=size,
                                     recurrent_dropout=self._parameters['rec_dropout'],
                                     activation=self._parameters['activation'],
                                     recurrent_activation=self._parameters['recurrent_activation'],
                                     return_sequences=True))
            if self._parameters['dropout']:
                self._model.add(tf.keras.layers.Dropout(self._parameters['dropout']))

        # Add the fully connected layers
        for size in self._parameters['size_fc']:
            self._model.add(
                tf.keras.layers.Dense(units=size, activation='relu'))
            if self._parameters['dropout']:
                self._model.add(
                    tf.keras.layers.Dropout(self._parameters['dropout']))

        # Add the final Softmax layer
        self._model.add(
            tf.keras.layers.Dense(num_labels, activation='softmax'))

        # Index of the highest-scoring label, taken from the softmax output
        argmax_layer = tf.keras.backend.argmax(self._model.output)

        # Create confidence layers (inherited _argmax_threshold_layer)
        final_predicted_layer = CharacterLevelLstmModel._argmax_threshold_layer(
            num_labels, threshold=0.0, default_ind=default_ind)

        # outputs: [softmax scores, argmax, thresholded prediction]
        argmax_outputs = self._model.outputs + \
                         [argmax_layer,
                          final_predicted_layer(argmax_layer, self._model.output)]
        self._model = tf.keras.Model(self._model.inputs, argmax_outputs)

        # Compile the model; loss and metrics are attached to the softmax
        # output only, looked up by its layer name
        softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]
        losses = {softmax_output_layer_name: "categorical_crossentropy"}

        # use f1 score metric
        f1_score_training = F1Score(num_classes=num_labels, average='micro')
        metrics = {softmax_output_layer_name: ['acc', f1_score_training]}

        self._model.compile(loss=losses,
                            optimizer="adam",
                            metrics=metrics)

        # record the state the model was built with
        self._epoch_id = 0
        self._model_num_labels = num_labels
        self._model_default_ind = default_ind


## Integrate the LSTM model into the DataLabeler

The above LSTM model is loaded along with Preprocessor and Postprocessor to build a new DataLabeler, which is then trained on the given dataset.

In [None]:
# reshape the training frame into one (label, value) row per cell; the
# column names of the training data become the labels
value_label_df = data_train.reset_index(drop=True).melt()
value_label_df.columns = [1, 0]  # labels=1, values=0 in that order
value_label_df = value_label_df.astype(str)
labels = value_label_df[1].unique().tolist()

# assemble the three components of the new data labeler:
# preprocessor -> new lstm model -> postprocessor
preprocessor = dp.labelers.data_processing.StructCharPreprocessor()
model = CharacterLevelLstmModel(labels)
postprocessor = dp.labelers.data_processing.StructCharPostprocessor()
data_labeler = dp.labelers.base_data_labeler.TrainableDataLabeler.load_with_components(
    preprocessor, model, postprocessor)

# fit on the values (x) against their labels (y), then save the result
save_dirpath = "data_labeler_saved"
epochs = 2
data_labeler.fit(
    x=value_label_df[0],
    y=value_label_df[1],
    labels=labels,
    epochs=epochs)
if save_dirpath:
    data_labeler.save_to_disk(save_dirpath)

The trained Data Labeler is then used by the Data Profiler to make predictions on the held-out test set.

In [None]:
# predict with the data labeler object
# every option named here gets "<name>.is_enabled" set to False
disabled_options = ["text", "int", "float", "order", "category", "datetime"]
profile_options = dp.ProfilerOptions()
profile_options.set({name + ".is_enabled": False
                     for name in disabled_options})
# hand the trained data labeler to the profiler
profile_options.set({'data_labeler.data_labeler_object': data_labeler})
profile = dp.Profiler(data_test, profiler_options=profile_options)

# get the prediction from the data profiler
def get_structured_results(results):
    """
    Tabulate the predicted data label of every column in a profile report.

    :param results: report dict whose 'data_stats' entry maps each column
        name to a dict holding a 'data_label' key
    :return: DataFrame with one row per column and the columns 'Column'
        and 'Prediction'
    """
    stats = results['data_stats']
    return pd.DataFrame({
        'Column': list(stats),
        'Prediction': [stats[name]['data_label'] for name in stats],
    })

# generate the profile report and show the predicted label per column
results = profile.report()
structured_results = get_structured_results(results)
print(structured_results)