# MNIST Object detection using YOLOv4

In [9]:
import os
import requests

In [10]:
# Clone all repositories
!git submodule update --init --recursive

## Downloading weights

In [None]:
required_files = {
    "yolov4.weights": "https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4.weights", # Real yolo v4 weights
    "yolov4.conv.137": "https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.conv.137" # Weights for transfer learning and faster training
}

for file_name, url in required_files.items():
    if os.path.exists(file_name):
        continue
    print(f"Downloading {file_name}...")
    r = requests.get(url, allow_redirects=True)
    with open(file_name, 'wb') as f:
        f.write(r.content)
    print(f"Downloaded {file_name}.")



## Running the model on a video using OpenCV DNN and yolov4 weights

In [20]:
# Load darknet dll
import ctypes
import os
import sys
import cv2
import numpy as np
import darknet.darknet as darknet

In [21]:
print("OpenCV version: ", cv2.__version__)
print("OpenCV CUDA enabled: ", cv2.cuda.getCudaEnabledDeviceCount())

OpenCV version:  4.5.5
OpenCV CUDA enabled:  1


In [99]:
# Load the network using opencv
net = cv2.dnn.readNet("yolov4.weights", "darknet/cfg/yolov4.cfg")
classes = open("darknet/data/coco.names").read().strip().split("\n")
colors = np.random.uniform(0, 255, size=(200, 3))
width = 608
height = 608

if cv2.cuda.getCudaEnabledDeviceCount() > 0:
    net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
    net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)

font=cv2.FONT_HERSHEY_SIMPLEX

In [106]:
def draw_bounding_boxes(image, classes, class_ids, boxes, confidences, colors, width, height):
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
    if len(indexes) > 0:
        for i in indexes.flatten():
            x, y, w, h = boxes[i]
            # Convert coordinates to integers to image size
            # x = int(x * (image.shape[1] / width))
            # w = int(w * (image.shape[1] / width))
            # y = int(y * (image.shape[0] / height))
            # h = int(h * (image.shape[0] / height))
            
            # Draw bounding box
            cv2.rectangle(image, (x, y), (x + w, y + h), colors[class_ids[i]], 2)
            # Draw label
            cv2.putText(image, f"{classes[class_ids[i]].upper()} {confidences[i]:.2f}", (x, y - 5), font, 0.5, colors[class_ids[i]], 2)
    return image

In [107]:
# Detect objects in the video (slow, use darknet executable for faster detection)

def detect_objects_in_video(video_path, net, classes, colors, width, height):
    cap = cv2.VideoCapture(video_path)
    while True:
        _, img = cap.read()
        if img is None:
            break
        # img = cv2.resize(img, (width, height))
        blob = cv2.dnn.blobFromImage(img, 1 / 255, (width, height), [0, 0, 0], 1, crop=False)
        net.setInput(blob)
        output_layers_names = net.getUnconnectedOutLayersNames()
        layerOutputs = net.forward(output_layers_names)
        boxes = []
        confidences = []
        class_ids = []
        for output in layerOutputs:
            for detection in output:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.5:
                    center_x = int(detection[0] * img.shape[1])
                    center_y = int(detection[1] * img.shape[0])
                    w = int(detection[2] * img.shape[1])
                    h = int(detection[3] * img.shape[0])
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)
                    boxes.append([x, y, w, h])
                    confidences.append((float(confidence)))
                    class_ids.append(class_id)
        img = draw_bounding_boxes(img, classes, class_ids, boxes, confidences, colors, width, height)
        cv2.imshow("Image", img)
        key = cv2.waitKey(1)
        if key == 27: # ESC
            break
    cap.release()
    cv2.destroyAllWindows()

In [109]:
video_path = "C:/Users/aurel/OneDrive - EPITA/Ing3/DNN - Deep Neural Network/IMG_1688.MOV"
if not os.path.exists(video_path):
    print(f"Video not found at {video_path}")
    raise FileNotFoundError

detect_objects_in_video(video_path, net, classes, colors, width, height)

## Generating the dataset

In [41]:
%cd ./MNIST-ObjectDetection
!python ./generate_data.py --num-train-images 30000 --num-test-images 3000
%cd ..

Downloading train-images-idx3-ubyte.gz...
Downloading t10k-images-idx3-ubyte.gz...
Downloading train-labels-idx1-ubyte.gz...
Downloading t10k-labels-idx1-ubyte.gz...
(47040000,)
(7840000,)



Generating dataset, saving to: data\mnist_detection\train:   0%|          | 0/30000 [00:00<?, ?it/s]
Generating dataset, saving to: data\mnist_detection\train:   0%|          | 53/30000 [00:00<00:57, 519.63it/s]
Generating dataset, saving to: data\mnist_detection\train:   0%|          | 111/30000 [00:00<00:54, 545.80it/s]
Generating dataset, saving to: data\mnist_detection\train:   1%|          | 166/30000 [00:00<00:55, 541.93it/s]
Generating dataset, saving to: data\mnist_detection\train:   1%|          | 221/30000 [00:00<00:56, 531.06it/s]
Generating dataset, saving to: data\mnist_detection\train:   1%|          | 276/30000 [00:00<00:55, 536.78it/s]
Generating dataset, saving to: data\mnist_detection\train:   1%|          | 330/30000 [00:00<00:56, 527.43it/s]
Generating dataset, saving to: data\mnist_detection\train:   1%|▏         | 385/30000 [00:00<00:55, 532.76it/s]
Generating dataset, saving to: data\mnist_detection\train:   1%|▏         | 439/30000 [00:00<00:56, 527.06it/s]
Gen

## Creating configuration files for training

In [44]:
data_path = os.path.join(os.getcwd(), 'MNIST-ObjectDetection/data/mnist_detection')
train_path = os.path.join(data_path, 'train')
test_path = os.path.join(data_path, 'test')

cfg_path = os.path.join(os.getcwd(), 'cfg')
os.makedirs(cfg_path, exist_ok=True)
os.makedirs("darknet/backup", exist_ok=True)

# Train YOLOv4 on MNIST dataset
# create data file
with open(f'{cfg_path}/mnist.data', 'w') as f:
    f.write(f'classes = 10\n')
    f.write(f'train = {cfg_path}/mnist_train.txt\n')
    f.write(f'valid = {cfg_path}/mnist_test.txt\n')
    f.write(f'names = {cfg_path}/mnist.names\n')
    f.write(f'backup = backup\n')

# create names file
with open(f'{cfg_path}/mnist.names', 'w') as f:
    for i in range(10):
        f.write(str(i) + '\n')

# create train.txt file
with open(f'{cfg_path}/mnist_train.txt', 'w') as f:
    dirs = os.listdir(os.path.join(train_path, 'images'))
    for d in dirs:
        if d.endswith('.png'):
            f.write(os.path.join(train_path, 'images', d) + '\n')

# create test.txt file
with open(f'{cfg_path}/mnist_test.txt', 'w') as f:
    dirs = os.listdir(os.path.join(test_path, 'images'))
    for d in dirs:
        if d.endswith('.png'):
            f.write(os.path.join(test_path, 'images', d) + '\n')

## Converting the dataset to darknet format

In [46]:
# Default structure is train -> images -> 0,1,2,3,4,5,6,7,8,9 -> 0.jpg, 1.jpg, 2.jpg, ...
#                      train -> labels -> 0,1,2,3,4,5,6,7,8,9 -> 0.txt, 1.txt, 2.txt, ...
# Same for test
# We want the structure to be train -> images -> 0.jpg, 0.txt, 1.jpg, 1.txt, 2.jpg, 2.txt, ...

# YOLOv4 labels are in the format: class_id x_center y_center width height (normalized so between 0 and 1)
# MNIST labels are in the format: class_id x_min y_min x_max y_max (not normalized) a header at the top of the file

def process_file(file, file_out, img_width=300, img_height=300):
    with open(file, 'r') as f:
        lines = f.readlines()[1:]
        res_lines = []
        for line in lines:
            label, xmin, ymin, xmax, ymax = line.strip().split(',')
            label = int(label)
            xmin, ymin, xmax, ymax = int(xmin), int(ymin), int(xmax), int(ymax)
            x_center = (xmin + xmax) / (2 * img_width)
            y_center = (ymin + ymax) / (2 * img_height)
            width = (xmax - xmin) / img_width
            height = (ymax - ymin) / img_height
            res_lines.append(f'{label} {x_center} {y_center} {width} {height}\n')
    with open(file_out, 'w') as f:
        f.writelines(res_lines)

to_process = [
    train_path,
    test_path
]

for path in to_process:
    print(f'Processing {path}')
    for file in os.listdir(os.path.join(path, 'labels')):
        if file.endswith('.txt'):
            process_file(os.path.join(path, 'labels', file), os.path.join(path, 'images', file))


Processing c:\Users\aurel\ING3-DNN-Projet\MNIST-ObjectDetection/data/mnist_detection\train
Processing c:\Users\aurel\ING3-DNN-Projet\MNIST-ObjectDetection/data/mnist_detection\test


> The model is now ready to be trained !
> 
> Compile darknet with GPU and CUDNN enabled for faster training and run the following command to start training the model :
> 
> ```bash
> cd darknet
> ./darknet detector train ../cfg/mnist.data ../yolov4_custom.cfg ../yolov4.conv.137 -map
> ```
> If in windows, use `darknet.exe` instead of `./darknet`
>
> Compilation instructions can be found [here](https://github.com/AlexeyAB/darknet#how-to-compile-on-linuxmacos-using-cmake) for Linux and [here](https://github.com/AlexeyAB/darknet#how-to-compile-on-linuxmacos-using-cmake) for Windows.


## Show the results

In [45]:
def draw_image(img, net, colors, font):
    blob = cv2.dnn.blobFromImage(img, 1 / 255, (width, height), [0, 0, 0], 1, crop=False)
    net.setInput(blob)
    output_layers_names = net.getUnconnectedOutLayersNames()
    layerOutputs = net.forward(output_layers_names)
    boxes = []
    confidences = []
    class_ids = []
    for output in layerOutputs:
        for detection in output:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5:
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append((float(confidence)))
                class_ids.append(class_id)
    img = draw_bounding_boxes(img, classes, class_ids, boxes, confidences, colors, width, height)
    return img

### Using the trained weights

In [110]:
# Open an image and detect objects
config_path = os.path.join(os.getcwd(), './yolov4_custom.cfg')
weights_path = "./darknet/backup/yolov4_custom_best.weights" 
classes = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
colors = np.random.uniform(0, 255, size=(10, 3))
font = cv2.FONT_HERSHEY_PLAIN

net = cv2.dnn.readNetFromDarknet(config_path, weights_path)

# Get the output layer names of the model
layer_names = net.getLayerNames()
output_layers_names = net.getUnconnectedOutLayersNames() 

# Load the image
height = 416
width = 416

In [111]:
img = cv2.imread("C:/Users/aurel/ING3-DNN-Projet/MNIST-ObjectDetection/data/mnist_detection/test/images/2989.png")
img = draw_image(img, net, colors, font)
cv2.imshow("Image", img)
cv2.waitKey(10000)
cv2.destroyAllWindows()

### Using the labeled bounding boxes

In [114]:
# Show the image with its real bounding boxes
img_path = "./MNIST-ObjectDetection/data/mnist_detection/test/images/23.png"
img = cv2.imread(img_path)
img_width, img_height = img.shape[1], img.shape[0]
colors = np.random.uniform(0, 255, size=(10, 3))

In [116]:
# Get the real bounding boxes
with open(img_path.replace(".png", ".txt"), "r") as f:
    for line in f.readlines()[1:]:
        line = line.strip().split(" ")
        print(line)
        # Draw yolov4 labels
        x_center, y_center, width, height = float(line[1]), float(line[2]), float(line[3]), float(line[4])
        w = int(width * img_width)
        h = int(height * img_height)
        x = int(x_center * img_width - w / 2)
        y = int(y_center * img_height - h / 2)
        color = colors[int(line[0])]
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, line[0], (x, y + 30), font, 2, color, 2)

cv2.imshow("Image", img)
cv2.waitKey(10000)
cv2.destroyAllWindows()

['0', '0.5316666666666666', '0.6316666666666667', '0.15666666666666668', '0.17']
['7', '0.77', '0.23666666666666666', '0.16', '0.16']
['1', '0.6416666666666667', '0.835', '0.09', '0.21']
['6', '0.5916666666666667', '0.515', '0.05', '0.07']
['8', '0.7183333333333334', '0.5533333333333333', '0.09', '0.1']
['3', '0.8833333333333333', '0.7666666666666667', '0.06666666666666667', '0.06666666666666667']
['2', '0.8116666666666666', '0.38666666666666666', '0.13', '0.19333333333333333']
