# 4.1b Compare CLD with other works

In [1]:
import os
import sys
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import report
import block_sampler
import batch_encoder
import callbacks
from dataset import Dataset

from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, LSTM, Dense, Activation, TimeDistributed, Flatten, Dot, Softmax, Lambda, RepeatVector, Multiply, Permute, Reshape, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from trainer import TrainResults

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Display the TensorFlow version and the version of its bundled Keras API.
tf.__version__, tf.keras.__version__

('1.14.0', '2.2.4-tf')

In [3]:
# Display whether TensorFlow can see a GPU and whether this build has CUDA support.
tf.test.is_gpu_available(), tf.test.is_built_with_cuda()

(False, False)

In [4]:
# Folder handed to Dataset.new_from_folders() when the sample set is loaded.
raw_dataset_folder = 'govdocs1/sample200'

# Bounds handed to Dataset.filter_min_max(); both are 200 for this notebook.
minimum = maximum = 200

# Output folder for this notebook (TensorBoard logs are written below it).
# Created up front; an already existing folder is not an error.
result_dir = 'results/4.1b-compare-other-works'
os.makedirs(result_dir, exist_ok=True)

In [5]:
# Load the dataset from the sample folder and apply the minimum/maximum filter.
# NOTE(review): what filter_min_max() counts (files per category?) is defined
# in dataset.py — confirm there.
rawset = Dataset.new_from_folders(raw_dataset_folder).filter_min_max(minimum, maximum)
# Keep only entries whose category is neither 'text' nor 'unk'.
# The lambda looks up the global name `rawset` each time it is called, and
# that name is rebound by this very statement; which Dataset object the
# predicate sees therefore depends on whether filter() evaluates eagerly.
rawset = rawset.filter(lambda x: rawset.category_from(x) not in ['text', 'unk'])
# NOTE(review): presumably refreshes the category list after filtering —
# confirm in dataset.py.
rawset.rebuild_categories()

In [6]:
# Split the dataset into a training set and a validation set.
# NOTE(review): presumably a random per-category split with fraction 0.5 —
# confirm in dataset.py.
tset, vset = rawset.rnd_split_fraction_by_category(0.5)

In [7]:
def CLD(classes, len_byte_vector):
    """Build and compile the CLD (Conv1D -> LSTM -> Dense) classifier.

    Args:
        classes: number of units in the final Dense layer, i.e. the number
            of softmax outputs.
        len_byte_vector: size of the last input axis; the model input has
            shape ``(512, len_byte_vector)``.

    Returns:
        A compiled ``tf.keras.Model`` named after this function, using
        categorical cross-entropy loss, the Adam optimizer with default
        settings, and the ``binary_accuracy`` and ``categorical_accuracy``
        metrics.
    """
    inputs = Input(shape=(512, len_byte_vector))

    # Kernel size equals the stride (16), so the convolution windows do not
    # overlap.
    conv_out = Conv1D(256, (16,), strides=16)(inputs)
    lstm_out = LSTM(128)(conv_out)
    dense_out = Dense(classes)(lstm_out)
    probabilities = Activation('softmax')(dense_out)

    # The model name is taken from the running function's own name.
    model = tf.keras.Model(
        [inputs],
        probabilities,
        name=sys._getframe().f_code.co_name,
    )
    model.compile(
        loss=tf.keras.losses.categorical_crossentropy,
        optimizer=tf.keras.optimizers.Adam(),
        metrics=['binary_accuracy', 'categorical_accuracy'],
    )
    return model

In [8]:
class MyTrainer:
    """Fit a compiled Keras model from block batches with early stopping.

    The constructor only stores its arguments; ``train`` does the work.

    Args:
        model: compiled Keras model to fit.
        group_by: forwarded to the block sampler as ``group_by``.
        xs_encoder: forwarded to the batch encoder as ``xs_encoder``.
        validation_steps: forwarded to ``fit_generator``.
        steps_per_epoch: forwarded to ``fit_generator``.
        epochs: upper bound on epochs; early stopping normally ends
            training well before this.
        max_seconds: stored but not used by ``train`` (the time-limit
            callback is disabled).
        batch_size: second positional argument of the batch encoder.
        min_delta: ``min_delta`` of the EarlyStopping callback.
        patience: ``patience`` of the EarlyStopping callback.
        blockSampler: callable building a sampler from a dataset.
        batchEncoder: callable building a batch source from a sampler.
    """

    def __init__(self,
                 model,
                 group_by='by_file',
                 xs_encoder='8bits_11',
                 validation_steps=10,
                 steps_per_epoch=28,
                 epochs=10000000,
                 max_seconds=None,
                 batch_size=100,
                 min_delta=1e-03,
                 patience=10,
                 blockSampler=block_sampler.BlockSampler,
                 batchEncoder=batch_encoder.BatchEncoder):
        self.model = model

        # Batch construction.
        self.blockSampler = blockSampler
        self.batchEncoder = batchEncoder
        self.group_by = group_by
        self.xs_encoder = xs_encoder
        self.batch_size = batch_size

        # Training schedule.
        self.steps_per_epoch = steps_per_epoch
        self.validation_steps = validation_steps
        self.epochs = epochs
        self.max_seconds = max_seconds

        # Early stopping.
        self.min_delta = min_delta
        self.patience = patience

    def _batch_source(self, dataset):
        """Wrap ``dataset`` in a block sampler and a batch encoder."""
        sampler = self.blockSampler(dataset, group_by=self.group_by)
        return self.batchEncoder(sampler, self.batch_size,
                                 xs_encoder=self.xs_encoder)

    def train(self, tset, vset):
        """Fit the model on ``tset`` and validate on ``vset``.

        Training stops once ``val_categorical_accuracy`` has not improved
        by at least ``min_delta`` for ``patience`` epochs. TensorBoard logs
        go to ``<result_dir>/<model.name>``, where ``result_dir`` is a
        module-level variable.

        Returns:
            A ``TrainResults`` holding the model, the Keras history, the
            names of the validation metrics, and the elapsed time taken
            from the ``TimeIt`` callback.
        """
        train_encoder = self._batch_source(tset)
        validation_encoder = self._batch_source(vset)

        net = self.model
        timer = callbacks.TimeIt()

        train_batches = iter(train_encoder)
        validation_batches = iter(validation_encoder)

        # Disabled callbacks, kept for reference:
        #   callbacks.SaveModel(os.path.join(result_dir, model.name + '.h5'))
        #   callbacks.TimeLimit(self.max_seconds)
        stop_early = EarlyStopping(
            monitor='val_categorical_accuracy',
            min_delta=self.min_delta,
            patience=self.patience,
        )
        board = TensorBoard(log_dir=os.path.join(result_dir, net.name))

        history = net.fit_generator(
            train_batches,
            validation_data=validation_batches,
            validation_steps=self.validation_steps,
            steps_per_epoch=self.steps_per_epoch,
            epochs=self.epochs,
            verbose=0,
            callbacks=[timer, stop_early, board],
        )

        return TrainResults(
            model=net,
            history=history,
            metrics=['val_binary_accuracy', 'val_categorical_accuracy'],
            elapsed=timer.elapsed,
        )

# Chen

In [14]:
# File types used for the comparison with Chen's work.
# NOTE(review): this list spells the plain-text type "text", while the other
# category lists in this notebook use 'txt' — confirm which label the dataset
# folders actually use.
categories = ["csv", "doc", "docx", "gif", "gz", "html", "java", "jpg", "log", "pdf", "png", "ppt", "rtf", "text", "xls", "xml"]

In [15]:
# Reload the dataset from scratch; note the lower bound passed to
# filter_min_max() is 0 here rather than `minimum`.
rawset = Dataset.new_from_folders(raw_dataset_folder).filter_min_max(0, maximum)
# Keep only entries whose category is listed in `categories`. The lambda looks
# up the global `rawset` when it is called, and that name is rebound by this
# same statement.
rawset = rawset.filter(lambda x: rawset.category_from(x) in categories)
rawset.rebuild_categories()

# NOTE(review): presumably a random per-category split with fraction 0.5 —
# confirm in dataset.py.
tset, vset = rawset.rnd_split_fraction_by_category(0.5)

# One output unit per entry of `categories`; 256 is the size of the last input
# axis (presumably one slot per byte value for the 'one_hot' encoder — confirm).
# NOTE(review): if a listed category has no samples after filtering, the model
# would have more outputs than the dataset has categories — confirm the label
# encoding still lines up.
model = CLD(len(categories),256)

result = MyTrainer(
        model,
        xs_encoder='one_hot',
        batch_size=100,
        steps_per_epoch=16*2,
        validation_steps=16*2,
        patience=10,
    ).train(tset, vset)

# Print the elapsed training time and the validation categorical accuracy of
# the LAST epoch. MyTrainer's EarlyStopping does not restore the best weights,
# so this is not necessarily the best epoch.
print(result.elapsed, result.history.history['val_categorical_accuracy'][-1])

1379.652292728424 0.6328125


# Hiester
csv, xml, jpg and gif.

In [16]:
# File types used for the comparison with Hiester's work.
categories = ['csv', 'xml', 'jpg', 'gif']

In [17]:
# Reload the dataset from scratch; note the lower bound passed to
# filter_min_max() is 0 here rather than `minimum`.
rawset = Dataset.new_from_folders(raw_dataset_folder).filter_min_max(0, maximum)
# Keep only entries whose category is listed in `categories`. The lambda looks
# up the global `rawset` when it is called, and that name is rebound by this
# same statement.
rawset = rawset.filter(lambda x: rawset.category_from(x) in categories)
rawset.rebuild_categories()

# NOTE(review): presumably a random per-category split with fraction 0.5 —
# confirm in dataset.py.
tset, vset = rawset.rnd_split_fraction_by_category(0.5)

# One output unit per entry of `categories`; 256 is the size of the last input
# axis (presumably one slot per byte value for the 'one_hot' encoder — confirm).
model = CLD(len(categories),256)

result = MyTrainer(
        model,
        xs_encoder='one_hot',
        batch_size=100,
        steps_per_epoch=16*2,
        validation_steps=16*2,
        patience=10,
    ).train(tset, vset)

# Print the elapsed training time and the validation categorical accuracy of
# the LAST epoch. MyTrainer's EarlyStopping does not restore the best weights,
# so this is not necessarily the best epoch.
print(result.elapsed, result.history.history['val_categorical_accuracy'][-1])

349.5016326904297 0.905


# Wang wang_sparse_2018
csv, doc, docx, gif, gz, html, jpg, pdf, png, ppt, pptx, ps, rtf, swf, txt, xls, xlsx, and xml.

In [18]:
# File types used for the comparison with Wang (wang_sparse_2018).
# NOTE(review): an earlier cell of this notebook filters on a category named
# 'text', while this list uses 'txt' — confirm the label matches the dataset
# folder names, otherwise no plain-text samples are selected.
categories = ['csv', 'doc', 'docx', 'gif', 'gz', 'html', 'jpg', 'pdf', 'png', 'ppt', 'pptx', 'ps', 'rtf', 'swf', 'txt', 'xls', 'xlsx', 'xml']

In [19]:
# Reload the dataset from scratch; note the lower bound passed to
# filter_min_max() is 0 here rather than `minimum`.
rawset = Dataset.new_from_folders(raw_dataset_folder).filter_min_max(0, maximum)
# Keep only entries whose category is listed in `categories`. The lambda looks
# up the global `rawset` when it is called, and that name is rebound by this
# same statement.
rawset = rawset.filter(lambda x: rawset.category_from(x) in categories)
rawset.rebuild_categories()

# NOTE(review): presumably a random per-category split with fraction 0.5 —
# confirm in dataset.py.
tset, vset = rawset.rnd_split_fraction_by_category(0.5)

# One output unit per entry of `categories`; 256 is the size of the last input
# axis (presumably one slot per byte value for the 'one_hot' encoder — confirm).
# NOTE(review): if a listed category has no samples after filtering, the model
# would have more outputs than the dataset has categories — confirm the label
# encoding still lines up.
model = CLD(len(categories),256)

result = MyTrainer(
        model,
        xs_encoder='one_hot',
        batch_size=100,
        steps_per_epoch=16*2,
        validation_steps=16*2,
        patience=10,
    ).train(tset, vset)

# Print the elapsed training time and the validation categorical accuracy of
# the LAST epoch. MyTrainer's EarlyStopping does not restore the best weights,
# so this is not necessarily the best epoch.
print(result.elapsed, result.history.history['val_categorical_accuracy'][-1])

1267.5370030403137 0.5940625


# Wang wang_file_2018
csv, doc, html, pdf, gif, jpg, dbase3, f, txt, swf, ps, java, log, xml, xls, ppt, gz, unk, rtf, and png.


In [20]:
# File types used for the comparison with Wang (wang_file_2018).
# NOTE(review): an earlier cell of this notebook filters on a category named
# 'text', while this list uses 'txt' — confirm the label matches the dataset
# folder names, otherwise no plain-text samples are selected.
categories = ['csv', 'doc', 'html', 'pdf', 'gif', 'jpg', 'dbase3', 'f', 'txt', 'swf', 'ps', 'java', 'log', 'xml', 'xls', 'ppt', 'gz', 'unk', 'rtf', 'png']

In [21]:
# Reload the dataset from scratch; note the lower bound passed to
# filter_min_max() is 0 here rather than `minimum`.
rawset = Dataset.new_from_folders(raw_dataset_folder).filter_min_max(0, maximum)
# Keep only entries whose category is listed in `categories`. The lambda looks
# up the global `rawset` when it is called, and that name is rebound by this
# same statement.
rawset = rawset.filter(lambda x: rawset.category_from(x) in categories)
rawset.rebuild_categories()

# NOTE(review): presumably a random per-category split with fraction 0.5 —
# confirm in dataset.py.
tset, vset = rawset.rnd_split_fraction_by_category(0.5)

# One output unit per entry of `categories`; 256 is the size of the last input
# axis (presumably one slot per byte value for the 'one_hot' encoder — confirm).
# NOTE(review): if a listed category has no samples after filtering, the model
# would have more outputs than the dataset has categories — confirm the label
# encoding still lines up.
model = CLD(len(categories),256)

result = MyTrainer(
        model,
        xs_encoder='one_hot',
        batch_size=100,
        steps_per_epoch=16*2,
        validation_steps=16*2,
        patience=10,
    ).train(tset, vset)

# Print the elapsed training time and the validation categorical accuracy of
# the LAST epoch. MyTrainer's EarlyStopping does not restore the best weights,
# so this is not necessarily the best epoch.
print(result.elapsed, result.history.history['val_categorical_accuracy'][-1])

862.1381969451904 0.64125


# Vulinović
csv, doc, docx, gif, gz, html, jpg, pdf, png, ppt, pptx, ps, rtf, swf, txt, xls, xlsx, and xml.

In [22]:
# File types used for the comparison with Vulinović's work.
# NOTE(review): this list is identical to the one used for the
# wang_sparse_2018 comparison earlier in this notebook — confirm it against
# the Vulinović paper. It also uses 'txt' where an earlier cell filters on
# 'text'.
categories = ['csv', 'doc', 'docx', 'gif', 'gz', 'html', 'jpg', 'pdf', 'png', 'ppt', 'pptx', 'ps', 'rtf', 'swf', 'txt', 'xls', 'xlsx', 'xml']

In [23]:
# Reload the dataset from scratch; note the lower bound passed to
# filter_min_max() is 0 here rather than `minimum`.
rawset = Dataset.new_from_folders(raw_dataset_folder).filter_min_max(0, maximum)
# Keep only entries whose category is listed in `categories`. The lambda looks
# up the global `rawset` when it is called, and that name is rebound by this
# same statement.
rawset = rawset.filter(lambda x: rawset.category_from(x) in categories)
rawset.rebuild_categories()

# NOTE(review): presumably a random per-category split with fraction 0.5 —
# confirm in dataset.py.
tset, vset = rawset.rnd_split_fraction_by_category(0.5)

# One output unit per entry of `categories`; 256 is the size of the last input
# axis (presumably one slot per byte value for the 'one_hot' encoder — confirm).
# NOTE(review): if a listed category has no samples after filtering, the model
# would have more outputs than the dataset has categories — confirm the label
# encoding still lines up.
model = CLD(len(categories),256)

result = MyTrainer(
        model,
        xs_encoder='one_hot',
        batch_size=100,
        steps_per_epoch=16*2,
        validation_steps=16*2,
        patience=10,
    ).train(tset, vset)

# Print the elapsed training time and the validation categorical accuracy of
# the LAST epoch. MyTrainer's EarlyStopping does not restore the best weights,
# so this is not necessarily the best epoch.
print(result.elapsed, result.history.history['val_categorical_accuracy'][-1])

843.657393693924 0.581875
