Notebook by Zara

#### ``You Only Look Once``

The YOLO model is another convolutional neural network used for object recognition, offering real-time evaluation unlike other algorithms such as SDD and R-CNN. It divided the input image into grids and assigns each grid the responsibility of detecting objects whose midpoint falls within it. 

The model predicts target frames for each grid, with each frame represented by five parameters: the coordinates of the center point (x, y), width (w), height (h), and a confidence score (si). The confidence score measures the accuracy of the predicted frame compared to the real frame. 

In [1]:
# import libraries 
import pandas as pd
import numpy as np
import os
import cv2
import tensorflow as tf
from tensorflow.keras import layers


#### **``Data preparation``**

Read the CSV file

In [2]:
data = pd.read_csv('/Users/zaravanthoff/Desktop/MasterProject/Datasets/PublicDataset/flickr_logos_27_dataset/flickr_logos_27_dataset_training_set_annotation.txt', sep='\s+',header=None)


In [3]:
data

Unnamed: 0,0,1,2,3,4,5,6
0,144503924.jpg,Adidas,1,38,12,234,142
1,2451569770.jpg,Adidas,1,242,208,413,331
2,390321909.jpg,Adidas,1,13,5,89,60
3,4761260517.jpg,Adidas,1,43,122,358,354
4,4763210295.jpg,Adidas,1,83,63,130,93
...,...,...,...,...,...,...,...
4531,2126991906.jpg,Yahoo,6,15,6,253,54
4532,217288720.jpg,Yahoo,6,136,161,304,222
4533,2472817996.jpg,Yahoo,6,2,4,499,106
4534,2514220918.jpg,Yahoo,6,1,69,342,157


In [4]:
print(data.dtypes)

0    object
1    object
2     int64
3     int64
4     int64
5     int64
6     int64
dtype: object


In [5]:
# # # Convert column 0 to float
# # data[0] = pd.to_numeric(data[0], errors='coerce')

# # Convert column 1 to float
# data[1] = pd.to_numeric(data[1], errors='coerce')

# # Convert labels column to int
# data[2] = data[2].astype(int)

# # Check the data types after conversion
# print(data.dtypes)

In [6]:
# Extract classes from the second column
classes = data[2]

# Count the number of unique classes
num_classes = len(classes.unique())

print("Number of classes:", num_classes)

Number of classes: 6


In [7]:
data

Unnamed: 0,0,1,2,3,4,5,6
0,144503924.jpg,Adidas,1,38,12,234,142
1,2451569770.jpg,Adidas,1,242,208,413,331
2,390321909.jpg,Adidas,1,13,5,89,60
3,4761260517.jpg,Adidas,1,43,122,358,354
4,4763210295.jpg,Adidas,1,83,63,130,93
...,...,...,...,...,...,...,...
4531,2126991906.jpg,Yahoo,6,15,6,253,54
4532,217288720.jpg,Yahoo,6,136,161,304,222
4533,2472817996.jpg,Yahoo,6,2,4,499,106
4534,2514220918.jpg,Yahoo,6,1,69,342,157


Parse the annotations to extract the file names, class labels, and bounding box coordinates

In [8]:
# Iterate through each row in the dataset
for index, row in data.iterrows():
    # Extract file name
    image_filename = row[0]  # Assuming the file name is in the first column
    
    # Extract class label
    class_label = row[1]  # Assuming the class label is in the second column
    
    # Extract bounding box coordinates
    x_min, y_min, x_max, y_max = row[2], row[3], row[4], row[5]  # Assuming bounding box coordinates are in columns 3 to 6
    
    # Print extracted information
    print(f"Image file name: {image_filename}")
    print(f"Class label: {class_label}")
    print(f"Bounding box coordinates: ({x_min}, {y_min}), ({x_max}, {y_max})")


Image file name: 144503924.jpg
Class label: Adidas
Bounding box coordinates: (1, 38), (12, 234)
Image file name: 2451569770.jpg
Class label: Adidas
Bounding box coordinates: (1, 242), (208, 413)
Image file name: 390321909.jpg
Class label: Adidas
Bounding box coordinates: (1, 13), (5, 89)
Image file name: 4761260517.jpg
Class label: Adidas
Bounding box coordinates: (1, 43), (122, 358)
Image file name: 4763210295.jpg
Class label: Adidas
Bounding box coordinates: (1, 83), (63, 130)
Image file name: 4763210295.jpg
Class label: Adidas
Bounding box coordinates: (1, 91), (288, 125)
Image file name: 4763210295.jpg
Class label: Adidas
Bounding box coordinates: (1, 182), (63, 229)
Image file name: 4763210295.jpg
Class label: Adidas
Bounding box coordinates: (1, 192), (291, 225)
Image file name: 4763210295.jpg
Class label: Adidas
Bounding box coordinates: (1, 285), (61, 317)
Image file name: 4763210295.jpg
Class label: Adidas
Bounding box coordinates: (1, 285), (298, 324)
Image file name: 4763210

In [9]:
data

Unnamed: 0,0,1,2,3,4,5,6
0,144503924.jpg,Adidas,1,38,12,234,142
1,2451569770.jpg,Adidas,1,242,208,413,331
2,390321909.jpg,Adidas,1,13,5,89,60
3,4761260517.jpg,Adidas,1,43,122,358,354
4,4763210295.jpg,Adidas,1,83,63,130,93
...,...,...,...,...,...,...,...
4531,2126991906.jpg,Yahoo,6,15,6,253,54
4532,217288720.jpg,Yahoo,6,136,161,304,222
4533,2472817996.jpg,Yahoo,6,2,4,499,106
4534,2514220918.jpg,Yahoo,6,1,69,342,157


#### **``Data preprocessing``**

Load each image and resize it to a fixed size suitable for YOLO input

1. Load each image and resize it to a fixed size suitable for YOLO input (416x416 since this is typically used for YOLO)
2. Normalize the pixel values of the resized images to the range [0, 1] by dividing by 255.
3. Convert the bounding box coordinates from absolute pixel values to relative values with respect to the image dimensions. This whill scale the coordinates to the range [0, 1], where (0,0) represents the top-left corner of the image and (1,1) represents the bottom-right corner.

In [10]:
folder_path = "/Users/zaravanthoff/Desktop/MasterProject/Datasets/PublicDataset/flickr_logos_27_dataset/flickr_logos_27_dataset_images/"

In [11]:
# Iterate through each row in the dataset
for index, row in data.iterrows():
    # Extract image filename from the dataset
    image_filename = row[0]  # Accessing by integer index 0
    
    # Construct the full path to the image
    image_path = os.path.join(folder_path, image_filename)
    
    # Read the image using OpenCV
    image = cv2.imread(image_path)

In [12]:
data

Unnamed: 0,0,1,2,3,4,5,6
0,144503924.jpg,Adidas,1,38,12,234,142
1,2451569770.jpg,Adidas,1,242,208,413,331
2,390321909.jpg,Adidas,1,13,5,89,60
3,4761260517.jpg,Adidas,1,43,122,358,354
4,4763210295.jpg,Adidas,1,83,63,130,93
...,...,...,...,...,...,...,...
4531,2126991906.jpg,Yahoo,6,15,6,253,54
4532,217288720.jpg,Yahoo,6,136,161,304,222
4533,2472817996.jpg,Yahoo,6,2,4,499,106
4534,2514220918.jpg,Yahoo,6,1,69,342,157


In [13]:
# Define the target size for resizing
target_height = 416
target_width = 416

In [14]:
# List to store resized images and corresponding labels
resized_images = []
labels = []

In [15]:
# Iterate through each row in the dataset
for index, row in data.iterrows():
    # Extract image filename from the dataset
    image_filename = row[0]  # Accessing by integer index 0
    
    # Construct the full path to the image
    image_path = os.path.join(folder_path, image_filename)
    
    # Read the image using OpenCV
    image = cv2.imread(image_path)
    
    # Resize the image to the target size
    resized_image = cv2.resize(image, (target_width, target_height))
    
    # Append resized image to the list
    resized_images.append(resized_image)
    
    # Append label to the list (assuming label is in the second column of the dataframe)
    label = row[1]
    labels.append(label)

In [16]:
data

Unnamed: 0,0,1,2,3,4,5,6
0,144503924.jpg,Adidas,1,38,12,234,142
1,2451569770.jpg,Adidas,1,242,208,413,331
2,390321909.jpg,Adidas,1,13,5,89,60
3,4761260517.jpg,Adidas,1,43,122,358,354
4,4763210295.jpg,Adidas,1,83,63,130,93
...,...,...,...,...,...,...,...
4531,2126991906.jpg,Yahoo,6,15,6,253,54
4532,217288720.jpg,Yahoo,6,136,161,304,222
4533,2472817996.jpg,Yahoo,6,2,4,499,106
4534,2514220918.jpg,Yahoo,6,1,69,342,157


In [17]:
# Convert lists to numpy arrays
resized_images = np.array(resized_images)
labels = np.array(labels)

In [18]:
# Check the shape of resized images and labels
print("Resized Images Shape:", resized_images.shape)
print("Labels Shape:", labels.shape)

Resized Images Shape: (4536, 416, 416, 3)
Labels Shape: (4536,)


In [19]:
# Normalize pixel values
resized_images = resized_images.astype('float32') / 255.0

# Check the range of pixel values after normalization
print("Minimum Pixel Value After Normalization:", np.min(resized_images))
print("Maximum Pixel Value After Normalization:", np.max(resized_images))

Minimum Pixel Value After Normalization: 0.0
Maximum Pixel Value After Normalization: 1.0


In [20]:
data

Unnamed: 0,0,1,2,3,4,5,6
0,144503924.jpg,Adidas,1,38,12,234,142
1,2451569770.jpg,Adidas,1,242,208,413,331
2,390321909.jpg,Adidas,1,13,5,89,60
3,4761260517.jpg,Adidas,1,43,122,358,354
4,4763210295.jpg,Adidas,1,83,63,130,93
...,...,...,...,...,...,...,...
4531,2126991906.jpg,Yahoo,6,15,6,253,54
4532,217288720.jpg,Yahoo,6,136,161,304,222
4533,2472817996.jpg,Yahoo,6,2,4,499,106
4534,2514220918.jpg,Yahoo,6,1,69,342,157


In [21]:
from sklearn.model_selection import train_test_split

# Define the ratio for splitting the data
train_ratio = 0.8  # 80% of the data for training
val_ratio = 0.1    # 10% of the data for validation
test_ratio = 0.1   # 10% of the data for testing

# Split the data into training and remaining sets
train_data, remaining_data = train_test_split(data, test_size=(1 - train_ratio), random_state=42)

# Split the remaining data into validation and testing sets
val_data, test_data = train_test_split(remaining_data, test_size=test_ratio/(test_ratio + val_ratio), random_state=42)

# Check the size of each split
print("Training Data Size:", len(train_data))
print("Validation Data Size:", len(val_data))
print("Testing Data Size:", len(test_data))


Training Data Size: 3628
Validation Data Size: 454
Testing Data Size: 454


In [22]:
import tensorflow as tf
from tensorflow.keras import layers

def create_yolo_model(input_shape, num_classes):
    # Define the YOLO model architecture
    inputs = tf.keras.Input(shape=input_shape)

    # Backbone layers (e.g., convolutional layers)
    x = layers.Conv2D(64, (3, 3), strides=(1, 1), padding='same', activation='relu')(inputs)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    x = layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    x = layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', activation='relu')(x)
    x = layers.MaxPooling2D((2, 2), padding='same')(x)
    x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', activation='relu')(x)
    x = layers.GlobalAveragePooling2D()(x)

    # Detection head (output layer)
    outputs = layers.Dense(num_classes + 5, activation='sigmoid')(x)

    # Create the model
    model = tf.keras.Model(inputs, outputs)

    return model

# Define input shape and number of classes
input_shape = (416, 416, 3)  # Height, Width, Channels
num_classes = 6  # Assuming there are 10 classes (adjust according to your dataset)

# Create the YOLO model
yolo_model = create_yolo_model(input_shape, num_classes)

# Compile the model
yolo_model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

# Print model summary
yolo_model.summary()


2024-04-05 16:47:21.359978: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-04-05 16:47:21.360167: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-04-05 16:47:21.360176: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-04-05 16:47:21.360219: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-04-05 16:47:21.360615: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [23]:
data

Unnamed: 0,0,1,2,3,4,5,6
0,144503924.jpg,Adidas,1,38,12,234,142
1,2451569770.jpg,Adidas,1,242,208,413,331
2,390321909.jpg,Adidas,1,13,5,89,60
3,4761260517.jpg,Adidas,1,43,122,358,354
4,4763210295.jpg,Adidas,1,83,63,130,93
...,...,...,...,...,...,...,...
4531,2126991906.jpg,Yahoo,6,15,6,253,54
4532,217288720.jpg,Yahoo,6,136,161,304,222
4533,2472817996.jpg,Yahoo,6,2,4,499,106
4534,2514220918.jpg,Yahoo,6,1,69,342,157


In [30]:
# Define maximum number of objects that can be detected in an image
max_objects = 10

# Initialize label array with zeros
labels = np.zeros((len(data), max_objects, 5 + num_classes))  # Adjusted to match the number of rows in your dataset

# Create a dictionary to map class labels to indices
class_label_to_index = {}
for idx, label in enumerate(data[1].unique()):  # Assuming class labels are in column 1 of your dataframe
    class_label_to_index[label] = idx

# Iterate over each object in the dataset
for i, row in data.iterrows():
    # Extract bounding box coordinates and class labels from the dataset
    bbox_coords_list = [(row[2], row[3], row[4], row[5])]  # Assuming bbox coordinates are in columns 2 to 5
    class_labels_list = [row[1]]  # Assuming class labels are in column 1

    # Iterate over each object in the image
    for j in range(len(bbox_coords_list)):
        bbox = bbox_coords_list[j]
        class_labels = class_labels_list[j]

        # Encode bounding box coordinates
        x_min, y_min, x_max, y_max = bbox
        labels[i, j, :4] = [x_min, y_min, x_max, y_max]

        # Encode class labels using one-hot encoding
        class_index = class_label_to_index[class_labels]
        labels[i, j, 5 + class_index] = 1

IndexError: index 11 is out of bounds for axis 2 with size 11

In [24]:
# Assuming you have the training and validation data ready (resized images and bounding box annotations)
# Train the model
history = yolo_model.fit(
    x=resized_images, y=labels, batch_size=32, epochs=10, validation_split=0.1
)


ValueError: Invalid dtype: str288

In [None]:
# # Define training parameters
# batch_size = 32
# epochs = 10
# validation_split = 0.2  # Percentage of training data to use for validation

# # Train the model
# history = yolo_model.fit(
#     resized_images,  # Training data (resized images)
#     labels,  # Training labels
#     batch_size=batch_size,
#     epochs=epochs,
#     validation_split=validation_split
# )

ValueError: Invalid dtype: str288

--------------------

--------------------------