In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models
import cv2
import numpy as np
import os


import xml.etree.ElementTree as ET


In [3]:
# --- Configuration ---------------------------------------------------------
# Side length, in pixels, of the square images fed to the network.
IMG_SIZE = 128

# Folders that hold the .jpg images together with their .xml annotations.
TRAIN_PATH = "dataset/train_zip/train/"
TEST_PATH  = "dataset/test_zip/test/"

# Annotation class name -> integer label index.
class_map = {
    "apple": 0,
    "banana": 1,
    "orange": 2
}

# Derived from class_map so the class count can never drift out of sync with it.
NUM_CLASSES = len(class_map)


In [4]:
def load_dataset(folder_path):
    """Load images with their first annotated bounding box and class label.

    Every ``.jpg`` file in ``folder_path`` is paired with the ``.xml`` file
    that shares its base name. Only the first ``<object>`` element of each
    annotation is used. Images are resized to ``IMG_SIZE x IMG_SIZE`` and
    scaled to ``[0, 1]``; box corners are divided by the *original* image
    width/height, so the fractions remain valid after the resize.

    Samples that cannot be used (unreadable image, missing annotation file,
    annotation without an ``<object>``, class not in ``class_map``) are
    skipped with a warning instead of aborting the whole load.

    Args:
        folder_path: Directory containing the images and annotation files.

    Returns:
        Tuple ``(images, boxes, labels)``:
            images: float32 array of shape ``(N, IMG_SIZE, IMG_SIZE, 3)``,
                in the channel order produced by ``cv2.imread``.
            boxes: float32 array of shape ``(N, 4)`` holding
                ``[xmin, ymin, xmax, ymax]`` as fractions of the image size.
            labels: one-hot array of shape ``(N, NUM_CLASSES)``.

    Raises:
        FileNotFoundError: If ``folder_path`` does not exist.
        xml.etree.ElementTree.ParseError: If an annotation file is not
            well-formed XML.
    """
    import warnings  # local import: only needed to report skipped samples

    images = []
    boxes = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and therefore everything downstream) reproducible.
    for file in sorted(os.listdir(folder_path)):
        stem, ext = os.path.splitext(file)
        if ext.lower() != ".jpg":
            continue

        img_path = os.path.join(folder_path, file)
        # Swap only the extension. str.replace(".jpg", ".xml") would also
        # rewrite a ".jpg" that happens to appear earlier in the path.
        xml_path = os.path.join(folder_path, stem + ".xml")

        # cv2.imread signals failure by returning None rather than raising.
        img = cv2.imread(img_path)
        if img is None:
            warnings.warn(f"Skipping unreadable image: {img_path}")
            continue
        if not os.path.isfile(xml_path):
            warnings.warn(f"Skipping image without annotation file: {img_path}")
            continue

        # Original size is needed to normalise the box before resizing.
        h, w = img.shape[:2]

        # Parse XML; only the first annotated object is used.
        obj = ET.parse(xml_path).getroot().find("object")
        if obj is None:
            warnings.warn(f"Skipping annotation without <object>: {xml_path}")
            continue

        class_name = obj.find("name").text
        if class_name not in class_map:
            warnings.warn(f"Skipping unknown class {class_name!r} in {xml_path}")
            continue

        bbox = obj.find("bndbox")
        # float() accepts both "12" and "12.0"; int() rejects the latter.
        xmin, ymin, xmax, ymax = (
            float(bbox.find(tag).text) for tag in ("xmin", "ymin", "xmax", "ymax")
        )

        images.append(cv2.resize(img, (IMG_SIZE, IMG_SIZE)) / 255.0)
        boxes.append([xmin / w, ymin / h, xmax / w, ymax / h])
        labels.append(class_map[class_name])

    # The reshapes are no-ops for non-empty data and give correctly shaped
    # (0, ...) arrays when no usable sample was found.
    images = np.array(images, dtype=np.float32).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
    boxes = np.array(boxes, dtype=np.float32).reshape(-1, 4)
    labels = tf.keras.utils.to_categorical(labels, NUM_CLASSES)

    return images, boxes, labels


In [5]:
# Read both splits from disk; each call yields (images, boxes, one-hot labels).
(X_train, bbox_train, y_train) = load_dataset(folder_path=TRAIN_PATH)
(X_test, bbox_test, y_test) = load_dataset(folder_path=TEST_PATH)


In [6]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation is repeatable across "Restart & Run All".
# NOTE(review): bit-exact reproducibility on GPU may additionally require
# deterministic ops to be enabled — confirm if that matters here.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Network input: IMG_SIZE x IMG_SIZE images with 3 channels.
input_img = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3))

# Shared backbone, stage 1: 16 filters, 3x3 kernel, then max-pooling.
x = layers.Conv2D(16, 3, activation="relu")(input_img)
x = layers.MaxPooling2D()(x)

# Shared backbone, stage 2: 32 filters, 3x3 kernel, then max-pooling.
x = layers.Conv2D(32, 3, activation="relu")(x)
x = layers.MaxPooling2D()(x)

# Flatten the feature maps into one vector feeding a shared dense layer.
x = layers.Flatten()(x)
x = layers.Dense(64, activation="relu")(x)

# Two heads on the shared features. The layer names "bbox" and "class" are the
# keys used for the per-output losses, metrics and training targets.
# Sigmoid keeps the 4 box values in (0, 1); softmax yields class probabilities.
bbox_output = layers.Dense(4, activation="sigmoid", name="bbox")(x)
class_output = layers.Dense(NUM_CLASSES, activation="softmax", name="class")(x)

model = models.Model(inputs=input_img, outputs=[bbox_output, class_output])


In [7]:
# Configure training: one loss per output head, keyed by output-layer name.
#   "bbox"  -> mean squared error on the 4 box values
#   "class" -> categorical cross-entropy on the one-hot labels
# Accuracy is tracked for the "class" head only.
model.compile(
    loss={"bbox": "mse", "class": "categorical_crossentropy"},
    metrics={"class": "accuracy"},
    optimizer="adam",
)


In [8]:
# Print a summary of the model built above as a quick sanity check before training.
model.summary()


In [9]:
# Train both heads jointly; targets are passed by output-layer name.
# NOTE(review): the test split doubles as validation data here, so the val_*
# figures are not an untouched hold-out estimate if they are used to tune the
# model — consider carving a validation split out of the training data.
# The History object is kept so the per-epoch curves can be inspected or
# plotted later; assigning it also stops its bare repr being echoed as output.
history = model.fit(
    X_train,
    {"bbox": bbox_train, "class": y_train},
    validation_data=(X_test, {"bbox": bbox_test, "class": y_test}),
    epochs=10,
    batch_size=4
)


Epoch 1/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - class_accuracy: 0.6148 - loss: 1.2750 - val_class_accuracy: 0.8667 - val_loss: 0.4910
Epoch 2/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - class_accuracy: 0.9019 - loss: 0.2885 - val_class_accuracy: 0.8333 - val_loss: 0.5829
Epoch 3/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - class_accuracy: 0.9354 - loss: 0.1722 - val_class_accuracy: 0.8833 - val_loss: 0.5005
Epoch 4/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - class_accuracy: 0.9932 - loss: 0.0934 - val_class_accuracy: 0.8667 - val_loss: 0.5633
Epoch 5/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - class_accuracy: 0.9794 - loss: 0.0849 - val_class_accuracy: 0.8667 - val_loss: 0.6193
Epoch 6/10
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - class_accuracy: 1.0000 - loss: 0.0291 - val_clas

<keras.src.callbacks.history.History at 0x19d0afe6390>