## Single Shot Detector

### Download the dataset

In [1]:
import os
import xml.etree.ElementTree as ET

import cv2
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from google.colab import auth
from google.colab import userdata
from googleapiclient.discovery import build
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import Flatten, Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.utils import to_categorical


GOOGLE_DRIVE_FOLDER_ID = "14LvaZfifLwoGsnIbwINaEZMSLEpP4E4-"
KAGGLE_DATASET = "mbkinaci/fruit-images-for-object-detection"
BASE_DIR = "/content/fruit_data"

# Request permissions to access (read/write) the Google Drive Folder ID
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Configurar Kaggle
os.environ['KAGGLE_USERNAME'] = userdata.get('KAGGLE_USERNAME')
os.environ['KAGGLE_KEY'] = userdata.get('KAGGLE_KEY')

# Descargar y descomprimir
if not os.path.exists(BASE_DIR):
    print("Download dataset...")
    !kaggle datasets download -d {KAGGLE_DATASET}
    !unzip -q fruit-images-for-object-detection.zip -d {BASE_DIR}
    print(f"Dataset download in {BASE_DIR}")
else:
    print("The dataset has been already downloaded.")

# Verificar estructura
print("Dataset Folders:", os.listdir(BASE_DIR))

Download dataset...
Dataset URL: https://www.kaggle.com/datasets/mbkinaci/fruit-images-for-object-detection
License(s): CC0-1.0
Downloading fruit-images-for-object-detection.zip to /content
  0% 0.00/28.4M [00:00<?, ?B/s]
100% 28.4M/28.4M [00:00<00:00, 1.40GB/s]
Dataset download in /content/fruit_data
Dataset Folders: ['test_zip', 'train_zip']


## Pre-processing


In [6]:
# Parameters
IMG_HEIGHT = 224
IMG_WIDTH = 224  # VGG16 prefers 224x224 inputs
BATCH_SIZE = 32

# Class name -> integer id, plus the reverse lookup derived from it so the
# two tables cannot drift apart.
CLASS_MAP = dict(apple=0, banana=1, orange=2)
INV_CLASS_MAP = {class_id: class_name for class_name, class_id in CLASS_MAP.items()}

def load_data_from_xml(directory, class_map=None, target_size=None):
    """Load images and their XML object annotations found under ``directory``.

    The directory tree is walked recursively. For every ``.xml`` file, the
    image named by its ``<filename>`` element (looked up in the same folder)
    is loaded once and resized; one sample is emitted per ``<object>`` whose
    ``<name>`` is a key of ``class_map``, so an image with several objects
    appears several times in the output.

    Annotations that are unparsable, lack ``<filename>``/``<size>``/``<bndbox>``
    data, report a non-positive width or height, or point to a missing or
    unreadable image are skipped instead of aborting the whole load.

    Args:
        directory: Root folder to search for ``.xml`` annotation files.
        class_map: Mapping of class name to integer id. Defaults to the
            notebook-level ``CLASS_MAP``.
        target_size: ``(height, width)`` the images are resized to. Defaults
            to ``(IMG_HEIGHT, IMG_WIDTH)``.

    Returns:
        Tuple ``(data, labels, bboxes, image_paths)``:
        - data: float32 array, shape ``(N, height, width, 3)``, pixels / 255.
        - labels: integer array, shape ``(N,)``, class ids from ``class_map``.
        - bboxes: float32 array, shape ``(N, 4)``, rows are
          ``(xmin, ymin, xmax, ymax)`` divided by the width/height declared
          in the XML.
        - image_paths: list of ``N`` image paths, one per sample.
        With no valid samples the arrays are empty but keep these ranks.
    """
    if class_map is None:
        class_map = CLASS_MAP
    if target_size is None:
        # Keras' load_img expects (height, width); the previous
        # (IMG_WIDTH, IMG_HEIGHT) order only worked because both are equal.
        target_size = (IMG_HEIGHT, IMG_WIDTH)

    data = []
    labels = []
    bboxes = []
    image_paths = []

    # The dataset usually nests train_zip/train and test_zip/test folders,
    # so search for XML files recursively.
    for root_dir, dirs, files in os.walk(directory):
        # os.walk order depends on the filesystem; sort in place so the
        # sample order is reproducible across runs and machines.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".xml"):
                continue
            xml_path = os.path.join(root_dir, file)

            # 1. Read the XML
            try:
                root = ET.parse(xml_path).getroot()
            except ET.ParseError as e:
                print(f"Error parsing annotation {xml_path}: {e}")
                continue

            # Locate the image that this XML describes
            filename = root.findtext("filename")
            if not filename:
                continue
            img_path = os.path.join(root_dir, filename)
            if not os.path.exists(img_path):
                continue  # image not found next to the XML: skip

            # 2. Read the original dimensions declared in the annotation
            size = root.find("size")
            if size is None:
                continue
            try:
                w = int(size.findtext("width"))
                h = int(size.findtext("height"))
            except (TypeError, ValueError):
                continue
            if w <= 0 or h <= 0:
                continue  # cannot normalise boxes without a valid size

            # 3. Collect ALL objects, keeping only the known classes
            # (findtext returns None for a missing <name>, which is never a key)
            valid_objects = [
                obj for obj in root.findall("object")
                if obj.findtext("name") in class_map
            ]
            if not valid_objects:
                continue

            # 4. Load and resize the image (once per annotation file)
            try:
                image = load_img(img_path, target_size=target_size)
                image = img_to_array(image)
            except Exception as e:
                print(f"Error cargando imagen {img_path}: {e}")
                continue

            # Emit one sample per object found in the image
            for obj in valid_objects:
                name = obj.findtext("name")

                bndbox = obj.find("bndbox")
                if bndbox is None:
                    continue
                try:
                    # float() accepts both "12" and "12.0" style coordinates
                    xmin, ymin, xmax, ymax = (
                        float(bndbox.findtext(tag))
                        for tag in ("xmin", "ymin", "xmax", "ymax")
                    )
                except (TypeError, ValueError):
                    continue

                # 5. Normalise coordinates by the declared image size so the
                # regression targets are on a 0-1 scale.
                # Note: the same image array is repeated for each object,
                # which is acceptable for this simplified approach.
                data.append(image)
                labels.append(class_map[name])
                bboxes.append((xmin / w, ymin / h, xmax / w, ymax / h))
                image_paths.append(img_path)

    # Convert to numpy and scale pixels; keep full rank even when empty so
    # downstream shape checks fail with a clear message rather than oddly.
    if data:
        data = np.array(data, dtype="float32") / 255.0
    else:
        data = np.zeros((0, target_size[0], target_size[1], 3), dtype="float32")
    labels = np.array(labels, dtype=int)
    bboxes = np.array(bboxes, dtype="float32").reshape(-1, 4)

    return data, labels, bboxes, image_paths

print("Cargando datos de entrenamiento (Train)...")
# Adjust the path to match the folder layout produced by the unzip step
train_dir = os.path.join(BASE_DIR, "train_zip", "train")
X_train, y_train_class, y_train_box, _ = load_data_from_xml(train_dir)

print("Cargando datos de prueba (Test)...")
test_dir = os.path.join(BASE_DIR, "test_zip", "test")
X_test, y_test_class, y_test_box, test_paths = load_data_from_xml(test_dir)

# A wrong path makes the loader return an empty dataset without raising;
# fail here with a clear message instead of later during training.
assert len(X_train) > 0, f"No training samples found under {train_dir}"
assert len(X_test) > 0, f"No test samples found under {test_dir}"

# One-hot encode the class ids; the width comes from CLASS_MAP so it stays
# correct if classes are added or removed.
num_classes = len(CLASS_MAP)
y_train_class = to_categorical(y_train_class, num_classes=num_classes)
y_test_class = to_categorical(y_test_class, num_classes=num_classes)

print(f"Train samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

Cargando datos de entrenamiento (Train)...
1200
1200
1200
652
652
652
1910
2850
694
1300
1280
1280
1280
2091
440
440
220
800
640
640
640
800
800
800
800
800
350
718
1024
1024
1024
1320
1200
1200
1200
1200
634
600
425
425
425
640
3160
800
800
800
800
800
800
800
800
1600
728
280
280
450
271
250
1940
1024
1024
1024
620
700
700
700
780
780
780
620
620
620
620
450
250
250
1300
1132
1132
1132
800
800
800
1024
1292
1292
640
300
600
600
600
600
600
1200
722
450
500
1000
250
250
250
250
250
800
800
468
468
468
468
468
468
468
468
1200
1200
1200
1200
780
500
800
800
800
800
800
800
800
800
196
1000
1000
1000
1000
1000
514
514
1023
1023
1023
800
800
800
437
437
437
437
250
1300
444
444
444
800
630
630
630
630
630
630
630
630
630
350
350
350
333
1200
600
1024
1024
1024
1024
1024
1024
1024
460
350
720
720
720
720
1280
1920
1920
1920
1920
1400
351
351
640
640
640
960
960
640
640
640
640
700
620
1824
2048
1010
1010
300
300
300
600
600
500
500
500
693
1024
1024
1024
1024
800
800
800
800
640
250
640
3