In [1]:
from os import listdir
from PIL import Image
from torchvision import transforms
import torch
import json
import random

import cv2
import numpy

In [5]:
"""
Transforms the training images and labels into tensors and loads them into the train_data list as 
tuples.
"""
def DataLoader(max_samples=1):
    """Load randomly chosen image/label pairs into the train_data list.

    The image listing and the parsed label file are obtained from
    FilesLoader(). Every sample is produced by extract_image_and_label(),
    which appends an (image tensor, target tensor) tuple to the module-level
    train_data list.

    Parameters:
        max_samples (int or None): Upper bound on the number of samples to
            load. Defaults to 1, which matches the previous hard-coded
            behaviour of stopping after the first image. Pass None to load
            every listed image.
    """
    train_files, target_files = FilesLoader()

    # Fix the iteration count up front: extract_image() removes each chosen
    # name from train_files, so the list shrinks while the loop is running.
    total = len(train_files)
    if max_samples is not None:
        total = min(total, max(max_samples, 0))

    for _ in range(total):
        # Extracts a single random image and the corresponding label, and
        # transforms them into tensors. Both are appended to the train_data
        # list in form of a tuple.
        extract_image_and_label(train_files, target_files)

"""
Loads the images and the label file using the respective system path.
Returns:
    train_files (list): List containing all image names from the folder.
    target_files (list): List containg the labels in the form of a json datastructure
"""
def FilesLoader():
    """Load the image file names and the label file from their system paths.

    Reads the module-level settings train_files_path (image folder) and
    target_files_path (json label file).

    Returns:
        train_files (list): List containing all image names from the folder.
        target_files: The parsed content of the json label file. The rest of
            this file iterates it as a sequence of per-image entries.
    """
    # The listing is published as a module-level name as well as returned.
    global train_files
    train_files = listdir(train_files_path)

    # Use a context manager so the label file handle is closed as soon as the
    # json content has been parsed (it was previously left open).
    with open(target_files_path, encoding="utf-8") as label_file:
        target_files = json.load(label_file)
    return train_files, target_files


"""
Chooses a random image which is then being transformed into a tensor and stored.
Finds the corresponding label inside the json file which is then being transformed into a tensor
and stored. Stores both tensors in a tuple inside the train_data list.
Parameters:
    train_files (list): A list containing the names of all images from the image folder
    target_files (list): A list containing the json object with all the labels.
"""
def extract_image_and_label(train_files, target_files):
    """Build one training sample and append it to the train_data list.

    A random image is picked and converted by extract_image(); the label that
    belongs to that image is looked up and converted by extract_json_label().

    Parameters:
        train_files (list): Names of all images from the image folder.
        target_files (list): The json object holding all the labels.
    """
    tensor, file_name = extract_image(train_files)
    sample = (tensor, extract_json_label(target_files, file_name))
    train_data.append(sample)

    
"""
Finds a random image from the folder and applies the transform to it. 
Parameters:
    train_files (list): A list containing the names of all images from the image folder
Returns:
    img_tensor (tensor): The tensor which contains the image values
    f (string): The string name of the image file
"""    
def extract_image(train_files):
    """Pick a random image from the folder and apply the transform to it.

    The chosen name is removed from train_files in place, and the opened
    image is stored in the module-level name img.

    Parameters:
        train_files (list): Names of all images from the image folder.
    Returns:
        img_tensor (tensor): The tensor which contains the image values.
        file_name (string): The name of the chosen image file.
    """
    global img

    file_name = random.choice(train_files)
    # Drop the name from the caller's list so it cannot be drawn a second time.
    train_files.remove(file_name)

    # transform_label_to_tensor() reads img.size to rescale the bounding boxes.
    img = Image.open(train_files_path + file_name)
    return transform(img), file_name


"""
Uses the name of the image to find the corresponding json element. Then it extracts the data and
transforms it into a tensor which is stored inside the train_data list.
Parameters:
    target_files (list): A list containing the json object with all the labels.
    chosen_image (string): The name of the image for which the label is needed.
Returns:
    target_tensor (tensor): The tensor which contains the image labels
"""
def extract_json_label(target_files, chosen_image):
    """Find the json element of an image and convert it into a target tensor.

    Parameters:
        target_files (list): The json object holding all the labels; every
            element is looked up by its 'name' key.
        chosen_image (string): The name of the image whose label is needed.
    Returns:
        target_tensor (tensor): The tensor which contains the image labels,
            as produced by transform_label_to_tensor().
    Raises:
        ValueError: If no element of target_files is named chosen_image.
    """
    # The loop variable is called `entry` (not `json`) so that the imported
    # json module is not shadowed inside this function.
    img_label = next(
        (entry for entry in target_files if entry['name'] == chosen_image),
        None,
    )
    if img_label is None:
        # Previously this case surfaced as an UnboundLocalError further down.
        raise ValueError(
            "No label entry found for image {!r}".format(chosen_image)
        )

    return transform_label_to_tensor(img_label)
    
    
"""
Extracts the useful information from the json object and transforms them into a tensor.
Parameters:
    img_label (): A specific json element
Returns:
    target_tensor (tensor): A tensor of size (5+num_classes,cells,cells) which is used as the target of 
    the image.
"""
def transform_label_to_tensor(img_label):
    """Encode the labels of one json element as a grid-shaped target tensor.

    Reads the module-level settings num_classes, cells and category_list, and
    the module-level img (its size is used to rescale the box coordinates to
    the 448x448 input). Every accepted label is also appended to the
    module-level debug_labels list as [ctg_idx, x1, y1, x2, y2].

    Tensor layout per grid cell (first axis):
        0            1 if a box centre falls into the cell
        1, 2         x / y offset of the centre inside the cell, in cell units
        3, 4         box width / height divided by 448
        5 + ctg_idx  1 for the category of the box

    NOTE: if several boxes have their centre in the same cell, the box values
    of the later one overwrite the earlier one, while the category flags of
    all of them stay set.

    Parameters:
        img_label (dict): A specific json element. Its "labels" entry is a
            list of objects with "category" and "box2d" (x1, y1, x2, y2).
    Returns:
        target_tensor (tensor): A tensor of size (5+num_classes,cells,cells)
            which is used as the target of the image.
    """
    target_tensor = torch.zeros(5 + num_classes, cells, cells)

    # Loop invariants: resize factors and the size of a single cell.
    scale_x = 448 / img.size[0]
    scale_y = 448 / img.size[1]
    cell_dim = int(448 / cells)
    last_cell = cells - 1

    # A missing or null "labels" entry is treated as "no objects".
    for label in img_label.get("labels") or []:

        # Store the category index if it is contained within category_list.
        category = label["category"]
        if category not in category_list:
            continue
        ctg_idx = category_list.index(category)

        # Store the bounding box information and rescale it by the resize
        # factor.
        box = label["box2d"]
        x1 = box["x1"] * scale_x
        y1 = box["y1"] * scale_y
        x2 = box["x2"] * scale_x
        y2 = box["y2"] * scale_y
        debug_labels.append([ctg_idx, x1, y1, x2, y2])  # ToDo: delete this

        # Transform the corner representation into centre / size. The centre
        # is measured from the smaller corner so that it stays correct even if
        # the two corners are given in swapped order.
        width = abs(x2 - x1)
        height = abs(y2 - y1)
        x_mid = min(x1, x2) + width / 2
        y_mid = min(y1, y2) + height / 2

        # Determine the cell of the centre. The index is clamped to the grid:
        # a centre on the far border (or outside the image) would otherwise
        # raise an IndexError or silently wrap around via a negative index.
        cell_pos_x = min(max(int(x_mid // cell_dim), 0), last_cell)
        cell_pos_y = min(max(int(y_mid // cell_dim), 0), last_cell)

        # Store the information inside the target_tensor. The offsets are
        # relative to the assigned cell; for centres inside the grid this is
        # the same value as (mid % cell_dim) / cell_dim.
        target_tensor[0, cell_pos_y, cell_pos_x] = 1
        target_tensor[1, cell_pos_y, cell_pos_x] = (x_mid - cell_pos_x * cell_dim) / cell_dim
        target_tensor[2, cell_pos_y, cell_pos_x] = (y_mid - cell_pos_y * cell_dim) / cell_dim
        target_tensor[3, cell_pos_y, cell_pos_x] = width / 448
        target_tensor[4, cell_pos_y, cell_pos_x] = height / 448
        target_tensor[ctg_idx + 5, cell_pos_y, cell_pos_x] = 1

    return target_tensor

# Transform applied to every single image: resize it to 448x448 (using the
# Image.NEAREST resampling filter) and convert the result into a tensor.
transform = transforms.Compose([
    transforms.Resize((448,448), Image.NEAREST),
    transforms.ToTensor(),
    ])

# System paths to the train images and the label file.
# train_files_path has to end with a path separator, because extract_image()
# builds every image path by plain string concatenation.
# NOTE(review): both paths point at the 'val' split - confirm this is intended.
# ToDo: Adapt this to only use the respective folder location
train_files_path = 'C:/Users/alens/Desktop/Real-time-Object-Detection-for-Autonomous-Driving-using-Deep-Learning/YOLO v1/bdd100k/images/100k/val/'
target_files_path = 'C:/Users/alens/Desktop/Real-time-Object-Detection-for-Autonomous-Driving-using-Deep-Learning/YOLO v1/bdd100k_labels_release/bdd100k/labels/det_v2_val_release.json'

# Reference list to all the label categories for object detection. The position
# of a category in this list is used as its class index in the target tensor.
category_list = ["other vehicle", "pedestrian", "traffic light", "traffic sign", "truck", "train", "other person", "bus", "car", "rider", "motorcycle", "bicycle", "trailer"]

# Number of classes from the dataset
num_classes = len(category_list)

# Determines how many cells the YOLO grid contains along each axis
cells = 14

# Will contain all training image tensors and the corresponding label tensors in the form of tuples
train_data = []
# Collects [category index, x1, y1, x2, y2] for every accepted label; only used
# by the debug drawing code.
debug_labels = [] # ToDo: delete this

# Fills the train_data list with the tuples. This runs as soon as the cell /
# module is executed and needs all settings above to be defined first.
DataLoader()













# Draws the bounding boxes of the first sample directly from the rescaled
# dataset coordinates collected in debug_labels and shows the result.
color = (0, 255, 0)
thickness = 1
# Reorder the axes of the image tensor so that the channel axis comes last.
image = train_data[0][0].numpy().transpose(1, 2, 0)
# Swaps the first and third channel for display with OpenCV.
img_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

for ctg_idx, x1, y1, x2, y2 in debug_labels:
    start_point = (int(x1), int(y1))
    end_point = (int(x2), int(y2))
    cv2.rectangle(img_rgb, start_point, end_point, color, thickness)

    # The category name is written 10 pixels above the top-left corner.
    start_point = (int(x1), int(y1) - 10)
    cv2.putText(img_rgb, str(category_list[ctg_idx]), start_point, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

cv2.imshow("image", img_rgb)
cv2.waitKey()

# Used to print the bounding boxes decoded from the target tensor of the first
# sample. Relies on `color` and `thickness` defined by the drawing code above.
image2 = train_data[0][0].numpy().transpose(1, 2, 0)
# Convert image2 itself (the previous code passed `image` here by mistake).
img_rgb2 = cv2.cvtColor(image2, cv2.COLOR_BGR2RGB)
cell_dim = int(448 / cells)
target = train_data[0][1]

for h in range(cells):
    for w in range(cells):
        # Channel 0 marks the cells that contain a box centre.
        if target[0, h, w] == 0:
            continue

        # Undo the encoding: offsets are in cell units, sizes relative to 448.
        centre_x = target[1, h, w] * cell_dim + cell_dim * w
        centre_y = target[2, h, w] * cell_dim + cell_dim * h
        width = target[3, h, w] * 448
        height = target[4, h, w] * 448

        start_point = (int(centre_x - width/2), int(centre_y - height/2))
        end_point = (int(centre_x + width/2), int(centre_y + height/2))
        cv2.rectangle(img_rgb2, start_point, end_point, color, thickness)

        # First category flag that is set for this cell. The number of classes
        # comes from num_classes instead of a hard-coded 13.
        category_idx = next(
            (i for i in range(num_classes) if target[i + 5, h, w] == 1),
            None,
        )
        if category_idx is None:
            # No category stored for this cell: draw the box without a caption
            # instead of reusing a stale (or undefined) index.
            continue

        start_point = (int(centre_x - width/2), int(centre_y - height/2) - 10)
        cv2.putText(img_rgb2, str(category_list[category_idx]), start_point, cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
cv2.imshow("image", img_rgb2)
cv2.waitKey()

-1