In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf

from pathlib import Path
import os
import glob

import json

import matplotlib.pyplot as plt
# Use a white background for every matplotlib figure created in this notebook.
plt.rcParams["figure.facecolor"] = "w"

In [None]:
# Root directory of the dataset; the image-set lists and the JPEG images are
# resolved relative to this path.
root_data = Path("../data/VOC2012")

In [None]:
# Train / Val
# Directory holding the split lists (`<split>.txt`) and the per-label
# `<label>_<split>.txt` files read further down.
main_path = root_data / "ImageSets/Main"

def get_dataset_ids(split):
  """Read the image identifiers listed for one dataset split.

  Args:
    split: Name of the split; identifiers are read from `<split>.txt`
      inside `main_path`.

  Returns:
    A DataFrame with a single `imageName` column holding one row per line
    of the file, with trailing whitespace stripped.
  """
  split_file = main_path / f"{split}.txt"
  with open(split_file) as handle:
    image_ids = [raw_line.rstrip() for raw_line in handle]
  return pd.DataFrame(np.array(image_ids), columns=["imageName"])

# Load the image ids of each split and report how many were found.
df_train = get_dataset_ids("train")
print(f"len train: {df_train.shape[0]}")

df_val = get_dataset_ids("val")
print(f"len val: {df_val.shape[0]}")

In [None]:
# Inspect labels
# Labels are discovered from the `<label>_train.txt` / `<label>_val.txt`
# files found in `main_path`.
labels_by_split = {"train": [], "val": []}

for txt_file in main_path.glob("*.txt"):
  name_parts = txt_file.stem.split("_")
  # keep only `<label>_train` / `<label>_val`; this skips train.txt, val.txt
  # and every `<label>_trainval` file
  if len(name_parts) == 2 and name_parts[1] in labels_by_split:
    labels_by_split[name_parts[1]].append(name_parts[0])

labels_train = labels_by_split["train"]
labels_val = labels_by_split["val"]

# both splits must expose exactly the same labels
assert sorted(labels_train) == sorted(labels_val)

# build label map: label -> index (alphabetical order) and its inverse
sorted_labels = sorted(labels_train)
label_map = dict(zip(sorted_labels, range(len(sorted_labels))))
label_map_inverse = dict(enumerate(sorted_labels))
n_outputs = len(label_map)
label_map

In [None]:
# Build y_train & y_val
# When True, build_y_dict also marks a label as present for entries whose
# ground-truth field is "0" (not only for those where it is "1").
include_difficult = True

def build_y_dict(df, split):
    """Map every image name to a multi-hot label vector.

    Each image listed in `df` starts with an all-zero vector of length
    `len(label_map)`. For every label, `<label>_<split>.txt` in `main_path`
    is read; on each line the first space-separated field is the image name
    and the last one is the ground-truth field. Position `label_map[label]`
    is set to 1 when that field is "1", and also when it is "0" while the
    module-level `include_difficult` flag is truthy.

    TODO add mask for difficult cases

    Args:
        df: DataFrame with an `imageName` column.
        split: Split name used to select the per-label files.

    Returns:
        dict mapping image name -> numpy array of length `len(label_map)`.
    """
    n_labels = len(label_map)
    y_dict = {row.imageName: np.zeros(n_labels) for row in df.itertuples()}

    # one annotation file per label
    for label, idx in label_map.items():
        with open(main_path / f"{label}_{split}.txt") as handle:
            entries = [raw_line.rstrip().split(" ") for raw_line in handle]

        for fields in entries:
            image_name, flag = fields[0], fields[-1]
            if flag == "1" or (flag == "0" and include_difficult):
                y_dict[image_name][idx] = 1

    return y_dict

# Label dictionaries for both splits, plus the number of images in each.
y_train = build_y_dict(df_train, "train")
n_train = len(y_train)

y_val = build_y_dict(df_val, "val")
n_val = len(y_val)

In [None]:
# Build tf datasets
# Directory containing one `<image_name>.jpg` file per image.
jpeg_path = root_data / "JPEGImages"

def build_tf_datasets(y_dict):
    """Create a tf.data.Dataset of (absolute JPEG path, label vector) pairs.

    Args:
        y_dict: Mapping of image name -> label vector, as returned by
            build_y_dict.

    Returns:
        A tf.data.Dataset yielding one (path string, label vector) pair per
        entry of `y_dict`, in the dictionary's iteration order.
    """
    # keys and values of a dict iterate in the same order, so paths and
    # labels stay aligned
    jpeg_list = [str((jpeg_path / f"{image_name}.jpg").resolve()) for image_name in y_dict]
    labels = list(y_dict.values())
    return tf.data.Dataset.from_tensor_slices((jpeg_list, labels))

# Backward-compatible alias for the original, misspelled function name.
buid_tf_datasets = build_tf_datasets

ds_train = build_tf_datasets(y_train)
ds_val = build_tf_datasets(y_val)

In [None]:
# Load and preprocess images
# Target (height, width) that images are resized to; also used to declare the
# spatial dimensions of the model input.
INPUT_SIZE = (224, 224)

def rescale(image):
    """Divide every pixel value of `image` by 255."""
    scaled = tf.math.divide(image, 255)
    return scaled

def resize(image):
    """Resize `image` to `INPUT_SIZE` using antialiased bilinear interpolation.

    The image is stretched to the target size; the aspect ratio is not kept.
    """
    # NOTE(review): a padded variant (tf.image.resize_with_pad) was tried
    # earlier and is not used; an older note reported a "images having no
    # shape" error from tf.image.resize - TODO confirm whether it still applies.
    target_size = (INPUT_SIZE[0], INPUT_SIZE[1])
    return tf.image.resize(
        image,
        target_size,
        method=tf.image.ResizeMethod.BILINEAR,
        antialias=True,
    )

def clip_values(image):
    """Clamp every value of `image` to the closed interval [0, 1]."""
    return tf.clip_by_value(image, clip_value_min=0, clip_value_max=1)

def preprocess_image(image):
    """Apply the preprocessing chain to a decoded image: rescale, resize, clip."""
    image = rescale(image)
    image = resize(image)
    # clipping comes last, after the resize step
    return clip_values(image)

def load_and_preprocess_image(image_path, labels):
    """Read a JPEG file from disk and turn it into a preprocessed image.

    Args:
        image_path: Scalar string tensor with the path of the JPEG file.
        labels: Label vector associated with the image; returned untouched.

    Returns:
        A `(image, labels)` tuple where `image` is the output of
        `preprocess_image` applied to the decoded file.
    """
    img_content = tf.io.read_file(image_path)
    # Force 3 output channels: without it the channel dimension is unknown
    # and a grayscale JPEG would decode to a single channel, which can neither
    # be batched with RGB images nor fed to the 3-channel model input.
    image = tf.io.decode_jpeg(img_content, channels=3)
    return (preprocess_image(image), labels)

# Replace each (path, labels) element by (preprocessed image, labels).
# NOTE: ds_train / ds_val are rebound here, so this cell must run exactly once
# after the datasets are built; re-running it would map already-decoded images.
ds_train = ds_train.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)

ds_val = ds_val.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)

In [None]:
# Sanity check: display the first preprocessed training image.
for sample in ds_train.take(1).as_numpy_iterator():
  fig, ax = plt.subplots()
  ax.imshow(sample[0])

In [None]:
BATCH_SIZE = 128

# Training pipeline: cache the individual preprocessed images, shuffle them
# with a buffer covering the whole training set, then batch and prefetch.
ds_train = ds_train.cache().shuffle(n_train).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Validation pipeline: there is no shuffling, so batches can be the same
# between epochs and caching is done after batching.
ds_val = ds_val.batch(BATCH_SIZE).cache().prefetch(tf.data.AUTOTUNE)

In [None]:
# Convolutional multi-label classifier: four conv layers with growing kernel
# sizes, global max pooling and one sigmoid unit per label.
inputs = tf.keras.Input(shape=(INPUT_SIZE[0], INPUT_SIZE[1], 3))
x = tf.keras.layers.Conv2D(16, 3, padding="same", activation="relu")(inputs)
x = tf.keras.layers.Conv2D(32, 5, padding="same", activation="relu")(x)
x = tf.keras.layers.MaxPooling2D(3, 1)(x)
# Fix: this layer used to be applied to `inputs`, which silently disconnected
# the three layers above from the model graph. It must consume `x`.
x = tf.keras.layers.Conv2D(64, 9, padding="same", activation="relu")(x)
x = tf.keras.layers.Conv2D(128, 17, padding="same", activation="relu")(x)
x = tf.keras.layers.GlobalMaxPooling2D()(x)
# independent sigmoid per label because several labels can be present at once
outputs = tf.keras.layers.Dense(n_outputs, activation="sigmoid")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs, name="object-classification")

model.summary()

In [None]:
# The metric is given as an explicit BinaryAccuracy instead of the string
# "accuracy": Keras resolves that string heuristically and, depending on the
# Keras version, may select categorical (argmax) accuracy, which is meaningless
# for multi-label targets. The name "accuracy" keeps the log keys unchanged.
# NOTE: per-label accuracy is dominated by absent labels, so read it with care.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy")]
)

In [None]:
# Train for 5 epochs on the training pipeline, with ds_val as validation data.
model.fit(x=ds_train, validation_data=ds_val, epochs=5)

In [None]:
N_PREVIEW = 10   # maximum number of validation images to display
THRESHOLD = 0.5  # a label counts as present when its score exceeds this value


def decode_labels(scores):
  """Join with "-" the names of all labels whose score exceeds THRESHOLD.

  Args:
    scores: 1-D array with one value per label, indexed like `label_map_inverse`.

  Returns:
    A string such as "cat-dog"; empty when no score exceeds the threshold.
  """
  return "-".join(label_map_inverse[idx] for idx in np.where(scores > THRESHOLD)[0])


# Show ground truth vs. predictions for the first images of the first
# validation batch.
for batch_img, batch_labels in ds_val.take(1).as_numpy_iterator():
  print(batch_labels.shape)
  # never index past the end of a batch smaller than N_PREVIEW
  n_shown = min(N_PREVIEW, len(batch_img))
  # a single predict call for all displayed images instead of one per image
  batch_pred = model.predict(batch_img[:n_shown])

  for i in range(n_shown):
    ground_truth = decode_labels(batch_labels[i])
    predictions = decode_labels(batch_pred[i])

    fig, ax = plt.subplots()
    ax.set_title(f"Ground truth: {ground_truth}\nPredictions: {predictions}")

    ax.imshow(batch_img[i])