# Coursework file containing code for the model.

In [None]:
# This code takes a sample of the original dataset.
# The rationale is to manipulate the labels first and then select the matching images
# Original images are in /INM705/705 Coursework/datasets/data_tsinghua/images
# Original labels are in /INM705/705 Coursework/datasets/data_tsinghua/labels
#
# File operations learnt from Ceder, N. (2018) The Quick Python Book, 3rd Ed. Shelter Island: Manning

# Setting up the data for the model and setting up the model adapted from https://towardsdatascience.com/the-practical-guide-for-object-detection-with-yolov5-algorithm-74c04aac4843

In [1]:
import os
import pathlib
import random
import glob
from yolov5 import utils
import torch
from IPython import display
from IPython.display import clear_output
from pathlib import Path
import yaml
import matplotlib.pyplot as plt
import matplotlib.image as mpimg


# NB. This code is based on a directory structure that depends on our usernames. An appropriate substitution for another user needs to be made.

# The relative dataset paths used in the data-preparation cells below are resolved against this working directory.
os.chdir("/users/addj212/INM705/705 Coursework")   # on Hyperion to get to correct directory for CC
# os.chdir("/users/adcy372/INM705/705 Coursework") # on Hyperion to get to correct directory for AC
print(os.getcwd())

# Root of the original (unsampled) dataset; its 'images' and 'labels' subdirectories are read by the cells below.
original_dataset_pathname = "datasets/data_tsinghua"

# Make new directories to hold the sample of data if they don't exist.
# os.makedirs(..., exist_ok=True) also creates the 'datasets/sample_data' parent when it is
# missing (os.mkdir would raise FileNotFoundError) and does nothing when the directory is already there.
for sample_subdir in ('labels_zero', 'labels_sample', 'images_sample'):
    os.makedirs(f'datasets/sample_data/{sample_subdir}', exist_ok=True)

/users/addj212/INM705/705 Coursework


In [2]:
# Proportion of the dataset to be sampled:
#
# The original dataset contains 13,672 images including 1,621 empty images for background, i.e. approximately 12% (explain why in the report).
# https://github.com/ultralytics/yolov5/issues/5851 recommends ≥ 1500 images per class and 0-10% background images to reduce false positives.
# Using 15% of the original dataset gives approximately 2,000 images; 10% of the empty images will be chosen as background (equivalent to
# roughly 8% of the total number of images sampled).

sample_prop = 0.15  # proportion of the label files left in the original labels directory to sample
background_prop = 0.10  # proportion of the empty (background) label files to add to the sample

def get_random_files(source_dir, target_dir, ext, prop):
    """Move a random selection of files with the given extension from source_dir to target_dir.

    Args:
        source_dir: directory to sample from.
        target_dir: existing directory that the chosen files are moved into.
        ext: file extension to match, without the leading dot (e.g. "txt").
        prop: proportion (0-1) of the entries in source_dir to move.

    The number of files moved is round(number of entries in source_dir * prop), capped at the
    number of files that actually carry the requested extension.
    """
    entries = os.listdir(source_dir)  # list the directory once rather than on every iteration
    sample_number = round(len(entries) * prop)  # convert proportion of files to sample to a number

    # Only files with the requested extension are eligible. os.path.splitext copes with names that
    # have no dot or several dots, where indexing the result of split('.') would either raise
    # IndexError or compare the wrong part of the name.
    suffix = f'.{ext}'
    candidates = [name for name in entries if os.path.splitext(name)[1] == suffix]

    # random.sample draws distinct, eligible files, so the requested number is always reached
    # (as long as enough matching files exist) instead of losing a draw to a non-matching file.
    how_many = max(0, min(sample_number, len(candidates)))
    for name in random.sample(candidates, how_many):
        # move the file by renaming it into the target directory
        os.rename(f'{source_dir}/{name}', f'{target_dir}/{name}')


In [3]:
# Search the original labels directory and move any labels for images with no cycles to a new directory

with os.scandir(f'{original_dataset_pathname}/labels') as label_entries:
    for label_entry in label_entries:
        # a label file with content has annotations, so it stays in the original directory
        if os.path.getsize(label_entry) != 0:
            continue
        empty_label_source = f'{original_dataset_pathname}/labels/{label_entry.name}'
        empty_label_target = f'datasets/sample_data/labels_zero/{label_entry.name}'
        os.rename(empty_label_source, empty_label_target)

zero_label_count = len(os.listdir("datasets/sample_data/labels_zero"))
print(f'{zero_label_count} files selected')

1623 files selected


In [4]:
# Randomly select the specified proportion of labels and move to 'labels_sample'

get_random_files(
    f'{original_dataset_pathname}/labels',
    'datasets/sample_data/labels_sample',
    "txt",
    sample_prop,
)

# Keep the count of labels picked at this stage: the background-label cell subtracts it from
# the later total to report how many empty labels were added.
labels_chosen = len(os.listdir("datasets/sample_data/labels_sample"))

print(f'{labels_chosen} label files selected')

1809 label files selected


In [5]:
# Randomly select the specified proportion of zero labels and move to the sample labels directory

get_random_files(
    'datasets/sample_data/labels_zero',
    'datasets/sample_data/labels_sample',
    "txt",
    background_prop,
)

# Everything now in labels_sample, minus the labels chosen earlier, is the background share.
total_labels_selected = len(os.listdir("datasets/sample_data/labels_sample"))
print(f'{total_labels_selected - labels_chosen} zero label files selected')
print(f'{total_labels_selected} total label files selected')

162 zero label files selected
1971 total label files selected


In [6]:
# select a sample of the images by matching against the sampled label filenames

labels_without_image = []  # sampled labels for which no image could be found

with os.scandir('datasets/sample_data/labels_sample') as my_dir:
    for file in my_dir:
        # match selected label to image by swapping the label's .txt extension for .jpg
        stem, label_ext = os.path.splitext(file.name)
        if label_ext != '.txt':
            continue  # not a label file, so there is no image to look for
        image_source = f'{original_dataset_pathname}/images/{stem}.jpg'
        image_target = f'datasets/sample_data/images_sample/{stem}.jpg'
        # Plain existence test: glob.glob would interpret characters such as '[', '*' or '?'
        # in a filename as wildcards.
        if os.path.isfile(image_source):
            os.rename(image_source, image_target)
        elif not os.path.isfile(image_target):
            # neither in the original images nor already moved on an earlier run
            labels_without_image.append(file.name)

if labels_without_image:
    # a label without an image would leave the label and image samples out of step
    print(f'WARNING: {len(labels_without_image)} label files have no matching image')

print(f'{len(os.listdir("datasets/sample_data/images_sample"))} total image files selected')

1971 total image files selected


In [7]:
# Create data directories in the structure required by the YOLO model

def create_data_directories(data_name):
    """Create the images/ and labels/ folders, each with train, valid and test subfolders.

    The folders are created under "../705 Coursework/datasets/<data_name>". Missing parent
    directories are created as needed and directories that already exist are left alone.
    """
    dataset_root = f"../705 Coursework/datasets/{data_name}"
    for content_kind in ("images", "labels"):
        for split_name in ("train", "valid", "test"):
            split_dir = Path(f"{dataset_root}/{content_kind}/{split_name}")
            split_dir.mkdir(parents=True, exist_ok=True)

    
# Build the folder tree for the 'cyclist_data' dataset that the split cell below fills.
create_data_directories('cyclist_data')

In [8]:
# Populate the train, validation and test folders from the sampled data
# Split as follows:
# 60% train = 1133
# 20% validation = 388
# 20% test = 388 files
# NOTE(review): these counts sum to 1909, not the 1971 files reported by the sampling cells above;
# a 60/20/20 split of 1971 would be about 1183/394/394 - confirm which run the figures refer to.

# Show the working directory against which the relative dataset paths below are resolved.
print(os.getcwd())

def move_files(source_dir, target_dir, ext, prop, seed=0):
    """Move a proportion of the files with the given extension from source_dir to target_dir.

    Args:
        source_dir: directory to take files from.
        target_dir: existing directory that the chosen files are moved into.
        ext: file extension to match, without the leading dot (e.g. "txt").
        prop: proportion (0-1) of the matching files to move.
        seed: seed for the shuffle that decides which files are moved.

    The matching files are sorted by name and then shuffled with a generator seeded by `seed`,
    so the choice depends only on the file stems and the seed. Calling the function on a labels
    directory and on an images directory that hold the same stems therefore moves the same
    stems from both, keeping every image in the same split as its label. (os.listdir returns
    entries in arbitrary order, so relying on its order gives no such guarantee.)
    """
    suffix = f'.{ext}'
    # os.path.splitext handles names with no dot or with several dots
    candidates = sorted(name for name in os.listdir(source_dir)
                        if os.path.splitext(name)[1] == suffix)
    random.Random(seed).shuffle(candidates)

    # convert the proportion to a number of files, never more than are available
    sample_number = max(0, min(len(candidates), round(len(candidates) * prop)))
    for name in candidates[:sample_number]:
        # move the file by renaming it into the target directory
        os.rename(f'{source_dir}/{name}', f'{target_dir}/{name}')


# Moving the files. The logic of the proportions is as follows. For the training data, the proportion is 0.6 because 60% of the data is used for training.
# 40% is left, and therefore the proportion for validation is set to 0.5 (to get 20% for validation), which leaves 20% for testing, and the proportion moved is 1.0
# because all of the remaining data is moved into the test directory.
                
# Labels are split first, then images. Within each kind the order train -> valid -> test matters,
# because every proportion applies to whatever is still left in the sample directory.
for data_kind, file_ext in (('labels', "txt"), ('images', "jpg")):
    for split_name, split_prop in (('train', 0.6), ('valid', 0.5), ('test', 1)):
        move_files(f'datasets/sample_data/{data_kind}_sample',
                   f'datasets/cyclist_data/{data_kind}/{split_name}',
                   file_ext,
                   split_prop)



/users/addj212/INM705/705 Coursework


In [None]:
# install YOLO v5

!pip install -U git+https://github.com/ultralytics/yolov5.git
%cd /users/addj212/INM705/INM705 Coursework/yolov5
!pip install -r requirements.txt
os.getcwd()

In [9]:
# Switch into the yolov5 directory so that train.py / val.py and the relative data/ and runs/ paths
# used by the commands below resolve; the working directory is printed before and after the change.
print(os.getcwd())
os.chdir("/users/addj212/INM705/705 Coursework/yolov5")
print(os.getcwd())

/users/addj212/INM705/705 Coursework
/users/addj212/INM705/705 Coursework/yolov5


In [10]:
# train the model

# Output saved to the 'runs' folder (the command's log reports project=runs/train).
# Starts from the yolov5s6.pt weights with --freeze 12, image size 1280, batch size 8, 100 epochs.

!python train.py --data data/cyclists.yaml --img 1280 --batch 8 --epochs 100 --weights yolov5s6.pt --freeze 12


[34m[1mtrain: [0mweights=yolov5s6.pt, cfg=, data=data/cyclists.yaml, hyp=data/hyps/hyp.scratch-low.yaml, epochs=100, batch_size=8, imgsz=1280, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=exp, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[12], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mskipping check (offline), for updates see https://github.com/ultralytics/yolov5
Unknown option: -C
usage: git [--version] [--help] [-c name=value]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p|--paginate|--no-pager] [--no-replace-objects] [--bare]
           [--git-dir=<path>] [--work-tree=<path>] [--namespace=<name>]
     

In [12]:
# Fine-tuning
# Continues from the best weights of runs/train/exp13 with the hyp.VOC.yaml hyperparameters and no --freeze option;
# --project/--name direct the results to 'runs_cyclists' / 'fine-tuning'.
# NOTE(review): the testing cell loads weights from 'runs_cyclists/fine-tuning2' - presumably the run name was
# given a numeric suffix because 'fine-tuning' already existed; confirm which run is the intended one.

!python train.py --hyp 'data/hyps/hyp.VOC.yaml' --img 1280 --batch 8 --epochs 100 --data 'data/cyclists.yaml' --weights 'runs/train/exp13/weights/best.pt' --project 'runs_cyclists' --name 'fine-tuning'

[34m[1mtrain: [0mweights=runs/train/exp13/weights/best.pt, cfg=, data=data/cyclists.yaml, hyp=data/hyps/hyp.VOC.yaml, epochs=100, batch_size=8, imgsz=1280, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs_cyclists, name=fine-tuning, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mskipping check (offline), for updates see https://github.com/ultralytics/yolov5
Unknown option: -C
usage: git [--version] [--help] [-c name=value]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p|--paginate|--no-pager] [--no-replace-objects] [--bare]
           [--git-dir=<path>] [--work-tree=<path>] [--

In [13]:
# Testing the model against unseen data
# Runs val.py with --task test on the fine-tuned weights from 'runs_cyclists/fine-tuning2'; results are written
# under project 'runs_cyclists', name 'Test'. (--augment presumably enables YOLOv5's augmented inference - confirm.)

!python val.py --img 1280 --batch 8 --data 'data/cyclists.yaml' --weights 'runs_cyclists/fine-tuning2/weights/best.pt' --task test --project 'runs_cyclists' --name 'Test' --augment

[34m[1mval: [0mdata=data/cyclists.yaml, weights=['runs_cyclists/fine-tuning2/weights/best.pt'], batch_size=8, imgsz=1280, conf_thres=0.001, iou_thres=0.6, max_det=300, task=test, device=, workers=8, single_cls=False, augment=True, verbose=False, save_txt=False, save_hybrid=False, save_conf=False, save_json=False, project=runs_cyclists, name=Test, exist_ok=False, half=False, dnn=False
Unknown option: -C
usage: git [--version] [--help] [-c name=value]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p|--paginate|--no-pager] [--no-replace-objects] [--bare]
           [--git-dir=<path>] [--work-tree=<path>] [--namespace=<name>]
           <command> [<args>]
YOLOv5 🚀 2023-4-10 Python-3.9.5 torch-1.10.0 CUDA:0 (A100-PCIE-80GB, 81251MiB)

Fusing layers... 
Model summary: 206 layers, 12308200 parameters, 0 gradients, 16.1 GFLOPs
[34m[1mtest: [0mScanning /users/addj212/INM705/705 Coursework/datasets/cyclist_data/labels[0m
[34m[1mtest: [0mNew cach

In [None]:
# Train for 300 epochs with a smaller image size (640 instead of 1280), hopefully to speed the training up.

# Results per epoch were cut off in the browser due to a Hyperion timeout, but the model did run for the full number of epochs. Results saved to runs/train/exp14.

!python train.py --data data/cyclists.yaml --img 640 --batch 8 --epochs 300 --weights yolov5s6.pt --freeze 12

[34m[1mtrain: [0mweights=yolov5s6.pt, cfg=, data=data/cyclists.yaml, hyp=data/hyps/hyp.scratch-low.yaml, epochs=300, batch_size=8, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=exp, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[12], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mskipping check (offline), for updates see https://github.com/ultralytics/yolov5
Unknown option: -C
usage: git [--version] [--help] [-c name=value]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p|--paginate|--no-pager] [--no-replace-objects] [--bare]
           [--git-dir=<path>] [--work-tree=<path>] [--namespace=<name>]
      