In [None]:
# https://machinelearningmastery.com/how-to-normalize-center-and-standardize-images-with-the-imagedatagenerator-in-keras/
# https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator
# https://theailearner.com/2019/07/06/data-augmentation-with-keras-imagedatagenerator/

In [15]:
# load all necessary packages
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
import keras
import keras.utils
from keras.models import Sequential
import pandas as pd
import os

In [16]:
# Define tensorflow model.
def create_model_AlexNet(
    verbose: bool = False, dropout_param: float = 0.0, regularization_param: float = 0.0, cnn_layer_filters_value: int = 256
) -> Sequential:
    """Receives:
        -
    Returns:
        Compiled AlexNet model with input layer of 227x227x3 and output layer with 16 nodes.
    """

    # Create the model
    model = keras.models.Sequential(
        [
            keras.layers.Conv2D(
                filters=96,
                kernel_size=(11, 11),
                strides=(4, 4),
                activation="relu",
                input_shape=(227, 227, 3),
                kernel_regularizer=tf.keras.regularizers.l2(l=regularization_param),
            ),  # NOQA E501
            keras.layers.BatchNormalization(),
            keras.layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2)),
            tf.keras.layers.SpatialDropout2D(rate=dropout_param),
            keras.layers.Conv2D(
                filters=256,
                kernel_size=(5, 5),
                strides=(1, 1),
                activation="relu",
                padding="same",
                kernel_regularizer=tf.keras.regularizers.l2(l=regularization_param),
            ),
            keras.layers.BatchNormalization(),
            keras.layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2)),
            tf.keras.layers.SpatialDropout2D(rate=dropout_param),
            keras.layers.Conv2D(
                filters=384,
                kernel_size=(3, 3),
                strides=(1, 1),
                activation="relu",
                padding="same",
                kernel_regularizer=tf.keras.regularizers.l2(l=regularization_param),
            ),
            keras.layers.BatchNormalization(),
            keras.layers.Conv2D(
                filters=384,
                kernel_size=(3, 3),
                strides=(1, 1),
                activation="relu",
                padding="same",
                kernel_regularizer=tf.keras.regularizers.l2(l=regularization_param),
            ),
            keras.layers.BatchNormalization(),
            keras.layers.Conv2D(
                filters=cnn_layer_filters_value,
                kernel_size=(3, 3),
                strides=(1, 1),
                activation="relu",
                padding="same",
                kernel_regularizer=tf.keras.regularizers.l2(l=regularization_param),
            ),
            keras.layers.BatchNormalization(),
            keras.layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2)),
            keras.layers.Flatten(),
            keras.layers.Dense(4096, activation="relu"),
            keras.layers.Dropout(0.5),
            keras.layers.Dense(4096, activation="relu"),
            keras.layers.Dropout(0.5),
            keras.layers.Dense(10, activation="softmax"),
        ]
    )

    # Compile the model
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.optimizers.SGD(learning_rate=0.001, momentum=0.9),
        metrics=["accuracy"],
    )

    if verbose:
        print(model.summary())

    return model

In [17]:
# create model
model = create_model_AlexNet()

In [18]:
# create image generators

# Training ImagaDataGenerator with Augmentation transf.
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    validation_split=0.2,
    samplewise_center=True,
    samplewise_std_normalization=True,
    rotation_range=15,
    shear_range=10,
    zoom_range=0.1,
    fill_mode="nearest",
    channel_shift_range=0.9,
    height_shift_range=0.05,
    width_shift_range=0.1,
    brightness_range=[0.2, 1.0],
)
# Validation ImageDataGenerator with rescaling.
valid_datagen = ImageDataGenerator(
    rescale=1.0 / 255, validation_split=0.2, samplewise_center=True, samplewise_std_normalization=True
)
# Testing ImageDataGenerator with rescaling.
test_datagen = ImageDataGenerator(rescale=1.0 / 255, samplewise_center=True, samplewise_std_normalization=True)

In [19]:
# set classes
classes = [
    "bank-statements",
    "contracts",
    "company-registry",
    "court-documents",
    "gazettes",
    "invoices",
    "middle-page",
    "passport-scan",
    "receipts",
    "shipping-receipts",
    "transcripts",
]

In [20]:
# Create a flow from the directory for validation data - seed=42
# Create a flow from the directory using same seed and 'training' subset.
train_gen = train_datagen.flow_from_directory(
    "/data/dssg/occrp/data/processed_clean",
    subset="training",
    shuffle=True,
    seed=42,
    target_size=(227, 227),
    batch_size=32,
    class_mode="sparse",
    color_mode="rgb",
    classes=classes,
)
# Choose subset = 'validation'
valid_gen = valid_datagen.flow_from_directory(
    "/data/dssg/occrp/data/processed_clean",
    subset="validation",
    shuffle=True,
    seed=42,
    target_size=(227, 227),
    batch_size=32,
    class_mode="sparse",
    color_mode="rgb",
    classes=classes,
)

# Create flow from test directory
test_generator = test_datagen.flow_from_directory(
    "/data/dssg/occrp/data/testing_data/web",
    target_size=(227, 227),
    batch_size=32,
    color_mode="rgb",
    class_mode="sparse",
    seed=42,
    classes=classes,
)

Found 10737 images belonging to 11 classes.
Found 2679 images belonging to 11 classes.
Found 687 images belonging to 11 classes.


In [14]:
# Create flow from dataframe
train_gen_d = train_datagen.flow_from_dataframe(
    "/data/dssg/occrp/data/input/rvl-cdip/labels/train.txt",
    subset="training",
    x_col="",
    y_col="",
    shuffle=True,
    seed=42,
    target_size=(227, 227),
    batch_size=32,
    class_mode="sparse",
    color_mode="rgb",
    classes=classes,
)
# Choose subset = 'validation'
valid_gen_d = valid_datagen.flow_from_dataframe(
    "/data/dssg/occrp/data/input/rvl-cdip/labels/val.txt",
    subset="validation",
    shuffle=True,
    seed=42,
    target_size=(227, 227),
    batch_size=32,
    class_mode="sparse",
    color_mode="rgb",
    classes=classes,
)

# Create flow from test directory
test_generator_d = test_datagen.flow_from_dataframe(
    "/data/dssg/occrp/data/input/rvl-cdip/labels/test.txt",
    target_size=(227, 227),
    batch_size=32,
    color_mode="rgb",
    class_mode="sparse",
    seed=42,
    classes=classes,
)

KeyError: ''

In [35]:
# Create flow from dataframe

traindf = pd.read_csv("/data/dssg/occrp/data/input/rvl-cdip/labels/train_TH.txt", dtype=str, sep=" ")
traindf

Unnamed: 0,filename,class,Unnamed: 2
0,imagesq/q/o/c/qoc54c00/80035521.tif,15,
1,imagese/e/w/c/ewc23d00/513280028.tif,1,
2,imagesw/w/b/t/wbt26e00/2053453161.tif,7,
3,imagesm/m/k/m/mkm05e00/2040792992_2040792994.tif,10,
4,imageso/o/e/x/oex80d00/522787731+-7732.tif,3,
...,...,...,...
319995,imagesu/u/p/p/upp04f00/0000282789.tif,9,
319996,imagesa/a/c/z/acz60f00/0011972032.tif,15,
319997,imagesu/u/j/m/ujm20a00/10155388.tif,6,
319998,imagesd/d/r/r/drr93f00/0000343578.tif,9,


In [36]:
traindf.drop(["Unnamed: 2"], axis=1)
traindf

Unnamed: 0,filename,class,Unnamed: 2
0,imagesq/q/o/c/qoc54c00/80035521.tif,15,
1,imagese/e/w/c/ewc23d00/513280028.tif,1,
2,imagesw/w/b/t/wbt26e00/2053453161.tif,7,
3,imagesm/m/k/m/mkm05e00/2040792992_2040792994.tif,10,
4,imageso/o/e/x/oex80d00/522787731+-7732.tif,3,
...,...,...,...
319995,imagesu/u/p/p/upp04f00/0000282789.tif,9,
319996,imagesa/a/c/z/acz60f00/0011972032.tif,15,
319997,imagesu/u/j/m/ujm20a00/10155388.tif,6,
319998,imagesd/d/r/r/drr93f00/0000343578.tif,9,


In [37]:
traindf["filename"] = "/data/dssg/occrp/data/input/rvl-cdip/images/" + traindf["filename"].astype(str)

In [44]:
traindf.filename

0         /data/dssg/occrp/data/input/rvl-cdip/images/im...
1         /data/dssg/occrp/data/input/rvl-cdip/images/im...
2         /data/dssg/occrp/data/input/rvl-cdip/images/im...
3         /data/dssg/occrp/data/input/rvl-cdip/images/im...
4         /data/dssg/occrp/data/input/rvl-cdip/images/im...
                                ...                        
319995    /data/dssg/occrp/data/input/rvl-cdip/images/im...
319996    /data/dssg/occrp/data/input/rvl-cdip/images/im...
319997    /data/dssg/occrp/data/input/rvl-cdip/images/im...
319998    /data/dssg/occrp/data/input/rvl-cdip/images/im...
319999    /data/dssg/occrp/data/input/rvl-cdip/images/im...
Name: filename, Length: 320000, dtype: object

In [50]:
train_gen_d = train_datagen.flow_from_dataframe(
    dataframe=traindf,
    subset="training",
    x_col="filename",
    y_col="class",
    shuffle=True,
    seed=42,
    target_size=(227, 227),
    batch_size=32,
    class_mode="sparse",
    color_mode="rgb",
)

Found 320000 validated image filenames belonging to 16 classes.


In [50]:
# load data
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

In [51]:
# transform labels to vectors
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)

In [52]:
# create datagen object
datagen = ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=0.2,
)

In [53]:
# compute quantities required for featurewise normalization
# (std, mean, and principal components if ZCA whitening is applied)
datagen.fit(x_train)

In [54]:
# fits the model on batches with real-time data augmentation:
model.fit(
    datagen.flow(x_train, y_train, batch_size=32, subset="training"),
    validation_data=datagen.flow(x_train, y_train, batch_size=8, subset="validation"),
    steps_per_epoch=len(x_train) / 32,
    epochs=1,
)

InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
    File "/usr/lib64/python3.8/runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib64/python3.8/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/traitlets/config/application.py", line 976, in launch_instance
      app.start()
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib64/python3.8/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib64/python3.8/asyncio/base_events.py", line 570, in run_forever
      self._run_once()
    File "/usr/lib64/python3.8/asyncio/base_events.py", line 1859, in _run_once
      handle._run()
    File "/usr/lib64/python3.8/asyncio/events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2881, in run_cell
      result = self._run_cell(
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2936, in _run_cell
      return runner(coro)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3135, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3338, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_47413/340413222.py", line 2, in <cell line: 2>
      model.fit(datagen.flow(x_train, y_train, batch_size=32,
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/engine/training.py", line 1409, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/engine/training.py", line 1051, in train_function
      return step_function(self, iterator)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/engine/training.py", line 1040, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/engine/training.py", line 1030, in run_step
      outputs = model.train_step(data)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/engine/training.py", line 890, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/engine/training.py", line 948, in compute_loss
      return self.compiled_loss(
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/engine/compile_utils.py", line 201, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/losses.py", line 139, in __call__
      losses = call_fn(y_true, y_pred)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/losses.py", line 243, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/losses.py", line 1860, in sparse_categorical_crossentropy
      return backend.sparse_categorical_crossentropy(
    File "/home/thenn/.local/share/virtualenvs/dssgxdfki2022-occrp-CY4k_1kg/lib/python3.8/site-packages/keras/backend.py", line 5238, in sparse_categorical_crossentropy
      res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
logits and labels must have the same first dimension, got logits shape [0,10] and labels shape [320]
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_10535]