# MNIST Object detection using YOLOv4

In [9]:
import os

In [10]:
# Fetch the git submodules referenced by this repository.
!git submodule update --init --recursive
# Install the generator's requirements and build the dataset only when the
# data directory does not exist yet, so re-running this cell is cheap.
if not os.path.exists('MNIST-ObjectDetection/data'):
    %pip install -r MNIST-ObjectDetection/requirements.txt
    # generate_data.py is run from inside the submodule directory; the
    # working directory is restored afterwards.
    %cd MNIST-ObjectDetection
    !python generate_data.py
    %cd ..

In [11]:
# GPU_NAME =  str(os.popen('nvidia-smi --query-gpu=name --format=csv,noheader').read()).strip()
# GPU_COMPUTE_CAPABILITY = str(os.popen('nvidia-smi --query-gpu=compute_cap --format=csv,noheader').read()).strip()
# # Strip the dot from the compute capability, e.g. '7.5' -> '75'
# GPU_COMPUTE_CAPABILITY = GPU_COMPUTE_CAPABILITY.replace('.', '')

# print(f'GPU_NAME: {GPU_NAME}')
# print(f'GPU_COMPUTE_CAPABILITY: {GPU_COMPUTE_CAPABILITY}')
# # Build using make, enable OpenCV and CUDNN
# import re
# with open('darknet/Makefile', 'r') as f:
#     makefile = f.read()
# makefile = re.sub(r'GPU=0', 'GPU=1', makefile)
# makefile = re.sub(r'CUDNN=0', 'CUDNN=1', makefile)
# makefile = re.sub(r'CUDNN_HALF=0', 'CUDNN_HALF=1', makefile)
# makefile = re.sub(r'OPENCV=0', 'OPENCV=1', makefile)

# # Remove everything in arch= and remove lines after it if it ends with a \. Replace with GPU_COMPUTE_CAPABILITY
# makefile = re.sub(r'ARCH=(.*\\\n)*.*', f'ARCH={GPU_COMPUTE_CAPABILITY}', makefile)


# # Enable CUDNN_HALF if GPU compute capability >= 75
# if int(GPU_COMPUTE_CAPABILITY) >= 75:
#     makefile = re.sub(r'CUDNN_HALF=0', 'CUDNN_HALF=1', makefile)

# with open('darknet/Makefile', 'w') as f:
#     f.write(makefile)

# # Build darknet
# !(cd darknet && make -j)

In [12]:
# # Upload video to colab
# from google.colab import files
# uploaded = files.upload()
# video_path = list(uploaded.keys())[0]

# Input video used by the detection cell; the path is relative, so it is
# resolved against the notebook's current working directory.
video_path = "../Videos/Img_1688.mp4"

In [2]:
# if not os.path.exists("yolov4.weights"):
#   !curl https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4.weights -o yolov4.weights
import requests

# Files needed by the notebook, mapped to the URL they are downloaded from.
required_files = {
    "yolov4.weights": "https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4.weights",
    "yolov4.conv.137": "https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v3_optimal/yolov4.conv.137"
}

for file_name, url in required_files.items():
    if os.path.exists(file_name):
        continue
    print(f"Downloading {file_name}...")
    # Stream to a temporary file and rename at the end: an interrupted or
    # failed download then never leaves a truncated file that the
    # os.path.exists() check above would mistake for a complete one.
    partial_name = file_name + ".part"
    with requests.get(url, allow_redirects=True, stream=True, timeout=60) as r:
        # Fail loudly instead of saving an HTTP error page as the weights file.
        r.raise_for_status()
        with open(partial_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_name, file_name)
    print(f"Downloaded {file_name}.")


# Load darknet dll
import ctypes
import os
import sys
import cv2
import numpy as np

print(cv2.__version__)
import darknet.darknet as darknet


# Load the network using opencv
net = cv2.dnn.readNet("yolov4.weights", "darknet/cfg/yolov4.cfg")
# One class name per line; the context manager closes the file handle.
with open("darknet/data/coco.names") as names_file:
    classes = names_file.read().strip().split("\n")
# 1-D array of random values: indexing it yields a single float, not a
# three-component colour.
colors = np.random.uniform(0, 255, 1000000)


# Set the backend to opencv
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
# Run inference on the OpenCL target with FP16 precision
net.setPreferableTarget(cv2.dnn.DNN_TARGET_OPENCL_FP16)
font=cv2.FONT_HERSHEY_SIMPLEX


Downloading yolov4.conv.137...
4.5.5


In [14]:
# Sanity check of the OpenCV build. The notebook only displays the value of
# the last expression of a cell, so the version string below is not shown.
cv2.__version__
# Count of CUDA-enabled devices reported by OpenCV.
cv2.cuda.getCudaEnabledDeviceCount()

1

In [15]:
# Detect objects in the video, frame by frame, and show the annotated frames
# in an OpenCV window until the video ends or Esc is pressed.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise FileNotFoundError(f"Could not open video: {video_path}")

# YoloV4 input size
width = 416
height = 416

# The output layers do not change between frames, so look them up once.
output_layers_names = net.getUnconnectedOutLayersNames()
# One colour triple per class, so a given class is always drawn the same way.
class_colors = np.random.uniform(0, 255, size=(len(classes), 3))

try:
    while True:
        _, img = cap.read()
        if img is None:
            break
        # img = cv2.resize(img, (width, height))
        blob = cv2.dnn.blobFromImage(img, 1 / 255, (width, height), [0, 0, 0], 1, crop=False)
        net.setInput(blob)
        layerOutputs = net.forward(output_layers_names)
        boxes = []
        confidences = []
        class_ids = []
        for output in layerOutputs:
            for detection in output:
                # Entries 0-3 are used as the box (centre x/y, width, height);
                # entries from index 5 on are the per-class scores.
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.5:
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)
                    boxes.append([x, y, w, h])
                    confidences.append((float(confidence)))
                    class_ids.append(class_id)
        indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
        if len(indexes) > 0:
            for i in np.array(indexes).flatten():
                x, y, w, h = boxes[i]
                # Rescale the box from network-input size to the frame size
                x = int(x * (img.shape[1] / width))
                w = int(w * (img.shape[1] / width))
                y = int(y * (img.shape[0] / height))
                h = int(h * (img.shape[0] / height))

                label = str(classes[class_ids[i]])
                confidence = str(round(confidences[i], 2))
                # Colour is chosen by class id (not by box index) and passed
                # as a plain tuple of floats.
                color = tuple(float(c) for c in class_colors[class_ids[i]])
                cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
                cv2.putText(img, label + " " + confidence, (x, y + 20), font, 1, color, 2)
        cv2.imshow("Image", img)
        key = cv2.waitKey(1)
        if key == 27:  # Esc
            break
finally:
    # Always free the capture and close the window, including when the cell
    # is interrupted from the notebook.
    cap.release()
    cv2.destroyAllWindows()


KeyboardInterrupt: 

In [None]:
# Generate a dataset with 100000 training and 10000 test images.
# NOTE(review): the path is relative to the notebook's working directory,
# whereas the setup cell runs generate_data.py from inside
# MNIST-ObjectDetection — confirm this cell is run from the intended directory.
!python ./generate_data.py --num-train-images 100000 --num-test-images 10000

In [None]:
# Convert data to darknet format

In [25]:
# Use YOLOv4 to detect MNIST digits from images

# Dataset layout used by this notebook, for <split> in {train, test}:
#   <split>/images/*.png   images
#   <split>/labels/*.txt   annotation files

data_path = os.path.join(os.getcwd(), 'MNIST-ObjectDetection/data/mnist_detection')
train_path = os.path.join(data_path, 'train')
test_path = os.path.join(data_path, 'test')

cfg_path = os.path.join(os.getcwd(), 'cfg')
backup_path = os.path.join(os.getcwd(), 'darknet', 'backup')
os.makedirs(cfg_path, exist_ok=True)
os.makedirs(backup_path, exist_ok=True)


def write_image_list(list_path, images_dir):
    """Write the path of every ``.png`` file in ``images_dir`` to ``list_path``.

    One path per line, each built by joining ``images_dir`` with the file
    name. Names are sorted so the generated list is identical from run to run.

    Returns:
        The number of image paths written.
    """
    image_names = sorted(name for name in os.listdir(images_dir) if name.endswith('.png'))
    with open(list_path, 'w') as f:
        f.writelines(os.path.join(images_dir, name) + '\n' for name in image_names)
    return len(image_names)


# Train YOLOv4 on MNIST dataset
# create data file
with open(f'{cfg_path}/mnist.data', 'w') as f:
    f.write('classes = 10\n')
    f.write(f'train = {cfg_path}/mnist_train.txt\n')
    f.write(f'valid = {cfg_path}/mnist_test.txt\n')
    f.write(f'names = {cfg_path}/mnist.names\n')
    # Absolute, like the other entries, and pointing at the directory created
    # above, so the entry does not depend on the directory darknet is run from.
    f.write(f'backup = {backup_path}\n')

# create names file: one class name (the digit) per line
with open(f'{cfg_path}/mnist.names', 'w') as f:
    f.writelines(f'{digit}\n' for digit in range(10))

# create train.txt file
write_image_list(f'{cfg_path}/mnist_train.txt', os.path.join(train_path, 'images'))

# create test.txt file
write_image_list(f'{cfg_path}/mnist_test.txt', os.path.join(test_path, 'images'))

# Moved all labels to the images folder
# !mv MNIST-ObjectDetection/data/mnist_detection/train/labels/* MNIST-ObjectDetection/data/mnist_detection/train/images/
# !mv MNIST-ObjectDetection/data/mnist_detection/test/labels/* MNIST-ObjectDetection/data/mnist_detection/test/images/


# Train YOLOv4 on MNIST dataset
# !./darknet/darknet.exe detector train MNIST-ObjectDetection/data/mnist.data MNIST-ObjectDetection/cfg/yolov4-mnist.cfg yolov4.conv.137 -dont_show -map

In [39]:
# Convert data to darknet format
# Original format is label,xmin,ymin,xmax,ymax and has a header
# Darknet format is class x_center y_center width height and has no header

import os
import struct


def _read_png_size(png_path):
    """Return ``(width, height)`` in pixels, read from the IHDR header of a PNG file."""
    with open(png_path, 'rb') as f:
        header = f.read(24)
    # A PNG starts with an 8-byte signature followed by the IHDR chunk:
    # 4-byte length, 4-byte type, then big-endian 32-bit width and height.
    if len(header) < 24 or header[:8] != b'\x89PNG\r\n\x1a\n' or header[12:16] != b'IHDR':
        raise ValueError(f'{png_path} is not a valid PNG file')
    return struct.unpack('>II', header[16:24])


def process_file(file, file_out, image_size=None):
    """Convert one annotation file to darknet label format.

    The input has a header line followed by ``label,xmin,ymin,xmax,ymax`` rows
    in pixels. The output has no header and one
    ``class x_center y_center width height`` row per box, with all four box
    values divided by the image width/height as darknet expects.

    Args:
        file: Path of the annotation file to read.
        file_out: Path of the darknet label file to write.
        image_size: Optional ``(width, height)`` of the image in pixels. When
            omitted, the size is read from the PNG that has the same path as
            ``file_out`` with a ``.png`` extension.

    Raises:
        FileNotFoundError: If ``file`` (or the PNG, when ``image_size`` is
            omitted) does not exist.
        ValueError: If a row is malformed or the PNG header is invalid.
    """
    if image_size is None:
        image_size = _read_png_size(os.path.splitext(file_out)[0] + '.png')
    img_width, img_height = image_size

    with open(file, 'r') as f:
        rows = f.readlines()[1:]  # drop the header line

    res_lines = []
    for row in rows:
        row = row.strip()
        if not row:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        label, xmin, ymin, xmax, ymax = row.split(',')
        xmin, ymin, xmax, ymax = int(xmin), int(ymin), int(xmax), int(ymax)
        x_center = (xmin + xmax) / 2 / img_width
        y_center = (ymin + ymax) / 2 / img_height
        width = (xmax - xmin) / img_width
        height = (ymax - ymin) / img_height
        res_lines.append(f'{label} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n')

    with open(file_out, 'w') as f:
        f.writelines(res_lines)

# Dataset splits whose annotation files are converted.
to_process = [train_path, test_path]

# For each split, convert every .txt file found in labels/ and write the
# result under the same file name into the split's images/ folder.
for path in to_process:
    print(f'Processing {path}')
    labels_dir = os.path.join(path, 'labels')
    images_dir = os.path.join(path, 'images')
    for file in os.listdir(labels_dir):
        if not file.endswith('.txt'):
            continue
        process_file(os.path.join(labels_dir, file), os.path.join(images_dir, file))


In [12]:
import cv2
import numpy as np

# Open an image and detect objects
# NOTE(review): the weights and image paths below are absolute and specific
# to one machine; replace them with paths relative to the project before
# sharing the notebook.
config_path = os.path.join(os.getcwd(), './yolov4_custom.cfg')
weights_path = "C:/Users/aurel/ING3-DNN-Projet/yolov4.weights"
image_path = "C:/Users/aurel/ING3-DNN-Projet/23.png"
classes = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
# One colour triple per class.
colors = np.random.uniform(0, 255, size=(len(classes), 3))
font = cv2.FONT_HERSHEY_PLAIN

net = cv2.dnn.readNetFromDarknet(config_path, weights_path)

# Get the output layer names of the model
output_layers_names = net.getUnconnectedOutLayersNames()

# Load the image
img = cv2.imread(image_path)
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError(f"Could not read image: {image_path}")
height = 416
width = 416

# Run the network on the image
blob = cv2.dnn.blobFromImage(img, 1 / 255, (width, height), [0, 0, 0], 1, crop=False)
net.setInput(blob)
layerOutputs = net.forward(output_layers_names)
boxes = []
confidences = []
class_ids = []
for output in layerOutputs:
    for detection in output:
        # Entries 0-3 are used as the box (centre x/y, width, height);
        # entries from index 5 on are the per-class scores.
        scores = detection[5:]
        class_id = np.argmax(scores)
        confidence = scores[class_id]
        if confidence > 0.5:
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)
            boxes.append([x, y, w, h])
            confidences.append((float(confidence)))
            class_ids.append(class_id)
indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
print(f"Number of objects detected: {len(indexes)}")
if len(indexes) > 0:
    for i in np.array(indexes).flatten():
        x, y, w, h = boxes[i]
        # Rescale the box from network-input size to the image size
        x = int(x * (img.shape[1] / width))
        w = int(w * (img.shape[1] / width))
        y = int(y * (img.shape[0] / height))
        h = int(h * (img.shape[0] / height))

        label = str(classes[class_ids[i]])
        confidence = str(round(confidences[i], 2))
        # Index the palette by class id: `i` is a position in `boxes` and can
        # exceed the number of palette rows.
        color = tuple(float(c) for c in colors[class_ids[i]])
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, label + " " + confidence, (x, y + 20), font, 1, color, 2)
cv2.imshow("Image", img)
cv2.waitKey(10000)
cv2.destroyAllWindows()

Number of objects detected: 1


In [23]:
# Show the image with its real bounding boxes
img_path = "./MNIST-ObjectDetection/data/mnist_detection/test/images/23.png"
img = cv2.imread(img_path)
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError(f"Could not read image: {img_path}")

# Get the real bounding boxes. The annotations in header + comma-separated
# form are read from the labels/ folder; the .txt next to the image may have
# been overwritten with darknet-format labels by the conversion cell. If no
# file exists in labels/, fall back to the .txt next to the image.
label_path = img_path.replace("/images/", "/labels/").replace(".png", ".txt")
if not os.path.exists(label_path):
    label_path = img_path.replace(".png", ".txt")

with open(label_path, "r") as f:
    for line in f.readlines()[1:]:  # skip the header line
        if not line.strip():
            continue
        line = line.strip().split(",")
        print(line)
        # Columns: label, xmin, ymin, xmax, ymax (pixels)
        x1 = int(line[1])
        y1 = int(line[2])
        x2 = int(line[3])
        y2 = int(line[4])
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(img, line[0], (x1, y1 + 20), font, 1, (0, 255, 0), 2)
cv2.imshow("Image", img)
cv2.waitKey(10000)
cv2.destroyAllWindows()

['9', '81', '128', '127', '202']
['0', '104', '83', '118', '99']
['7', '30', '79', '69', '133']
['0', '113', '129', '144', '168']
['4', '51', '30', '87', '70']


KeyboardInterrupt: 