### loading coco annotations and creating labels

coco images dont come with labels in the folder
all the labels live inside a big json file
so first we extract a small subset of classes
and turn it into a simple csv

In [None]:
import json
import pandas as pd

ANNOTATIONS = "../instances_val2017.json"

# coco category ids we care about
# mapping id = readable label
TARGET_CLASSES = {
    1: "person",
    3: "car",
    18: "dog",
    2: "bicycle",
    62: "chair"
}

# load the coco json file
with open(ANNOTATIONS, "r") as f:
    coco = json.load(f)

id_to_filename = {img["id"]: img["file_name"] for img in coco["images"]}

rows = []

# loop through all annotations
# keep only the classes we want
for ann in coco["annotations"]:
    if ann["category_id"] in TARGET_CLASSES:
        rows.append([
            id_to_filename[ann["image_id"]],
            TARGET_CLASSES[ann["category_id"]]
        ])

# build a dataframe with filename and label
df = pd.DataFrame(rows, columns=["filename", "label"])
# remove duplicates since an image can have many annotations

df = df.drop_duplicates()
# save labels to csv so we dont need the json again
df.to_csv("coco_labels_subset.csv", index=False)

print(df["label"].value_counts())

label
person     2693
chair       580
car         535
dog         177
bicycle     149
Name: count, dtype: int64


### loading images and building the dataset

now we switch to tensorflow
we load images from disk
map labels to numbers
and build a tf.data pipeline

In [None]:
import tensorflow as tf
import pandas as pd
import os

# basic config
IMG_SIZE = 224
BATCH_SIZE = 32
EPOCHS = 5

DATA_DIR = "../val2017"
CSV_PATH = "coco_labels_subset.csv"

label_map = {
    "person": 0,
    "car": 1,
    "dog": 2,
    "bicycle": 3,
    "chair": 4
}

NUM_CLASSES = len(label_map)

In [None]:
df = pd.read_csv(CSV_PATH)
df["label"] = df["label"].map(label_map)

paths = df["filename"].apply(lambda x: os.path.join(DATA_DIR, x)).values
labels = df["label"].values

# create tensorflow dataset
ds = tf.data.Dataset.from_tensor_slices((paths, labels))

# function to load and preprocess images
def load_image(path, label):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
    img = img / 255.0
    return img, label

# apply preprocessing
ds = ds.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
# shuffle and batch
ds = ds.shuffle(1000).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

### train validation split

simple split
not perfect but good enough for benchmarking
we only care about relative performance

In [11]:
total = len(df)
train_size = int(0.8 * total)

train_ds = ds.take(train_size // BATCH_SIZE)
val_ds = ds.skip(train_size // BATCH_SIZE)

### cnn model definition

basic cnn
nothing fancy
goal is understanding not max accuracy

In [None]:
model = tf.keras.Sequential([

    # first conv block
    tf.keras.layers.Conv2D(16, 3, activation="relu", input_shape=(IMG_SIZE, IMG_SIZE, 3)),
    tf.keras.layers.MaxPooling2D(),

    # second conv block
    tf.keras.layers.Conv2D(32, 3, activation="relu"),
    tf.keras.layers.MaxPooling2D(),

    # third conv block
    tf.keras.layers.Conv2D(64, 3, activation="relu"),
    tf.keras.layers.MaxPooling2D(),

    # classifier head
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")
])


# compile the model
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

MODEL_DIR = os.getenv("AIP_MODEL_DIR", "saved_model")
model.save("coco_model.keras")

Epoch 1/5
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 471ms/step - accuracy: 0.6423 - loss: 0.9166 - val_accuracy: 0.7112 - val_loss: 0.8013
Epoch 2/5
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 490ms/step - accuracy: 0.6456 - loss: 0.8482 - val_accuracy: 0.7232 - val_loss: 0.6658
Epoch 3/5
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 637ms/step - accuracy: 0.6547 - loss: 0.7652 - val_accuracy: 0.7124 - val_loss: 0.7232
Epoch 4/5
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 564ms/step - accuracy: 0.6726 - loss: 0.7135 - val_accuracy: 0.7387 - val_loss: 0.6150
Epoch 5/5
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 496ms/step - accuracy: 0.6811 - loss: 0.6847 - val_accuracy: 0.7411 - val_loss: 0.6078


### Sampling the same number of images as AWS
for fair comparison
aws rekognition was run on 996 images
so we sample the exact same amount here

In [16]:
# sample the same number of images aws saw
# random_state just makes this reproducible

N = 996
df_sample = df.sample(n=N, random_state=42)

### Building an inference dataset

this part is different from training
we dont need labels anymore
only images
we also avoid shuffling because order doesnt matter for timing

In [17]:
# turn filenames into full image paths
paths = df_sample["filename"].apply(
    lambda x: os.path.join("../val2017", x)
).values

# load and preprocess images only
def load_image_only(path):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
    img = img / 255.0
    return img

# build dataset for inference
pred_ds = tf.data.Dataset.from_tensor_slices(paths)
pred_ds = pred_ds.map(load_image_only).batch(BATCH_SIZE)

### Running inference and measuring latency

this is what we compare to aws latency
we time the full prediction run
then divide by number of images

In [18]:
import time

start = time.time()
preds = model.predict(pred_ds)
end = time.time()

avg_latency = (end - start) / N
print("avg latency per image:", avg_latency)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 110ms/step
avg latency per image: 0.00474756549161122


### Converting model outputs to labels

the model outputs probabilities
argmax gives the predicted class index
we map it back to label names

In [21]:
# reverse the label map
inv_label_map = {v: k for k, v in label_map.items()}

pred_labels = [inv_label_map[i] for i in preds.argmax(axis=1)]

# quick sanity check
pred_labels[:10]

['chair',
 'car',
 'person',
 'person',
 'person',
 'person',
 'person',
 'person',
 'person',
 'person']