# Coursework file containing code for sorting the dataset

In [None]:
# This code takes a sample of the kaggle dataset.
# The rationale is to manipulate the labels first and then select the matching images
# Original images are in /INM705/705 Coursework/datasets/data_tsinghua/images
# Original labels are in /INM705/705 Coursework/datasets/data_tsinghua/labels
#
# File operations learnt from Ceder, N. (2018) The Quick Python Book, 3rd Ed. Shelter Island: Manning

# Setting up the data for the model and setting up the model adapted from https://towardsdatascience.com/the-practical-guide-for-object-detection-with-yolov5-algorithm-74c04aac4843

In [3]:
import os
import pathlib
import random
import shutil
import torch
from IPython import display
from IPython.display import clear_output
from pathlib import Path
import yaml
import matplotlib.pyplot as plt
import matplotlib.image as mpimg


# NB Code assumes download of dataset from Kaggle as zip file (see report)
# Expand zip file inside "/Cyclist Detection YOLOv5/data" 

os.chdir("/Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and image analysis/Cyclist Detection YOLOv5")
# change this to correct pathname on file system

print(os.getcwd())

# All dataset paths below are relative to the working directory set above
original_dataset_pathname = "data/data_tsinghua"

# Make new directories to hold the sample of data if they don't exist.
# exist_ok=True makes this cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first. If one of these paths already exists
# as a regular file, makedirs raises FileExistsError instead of silently continuing.
for sample_subdir in ('labels_zero', 'labels_sample', 'images_sample'):
    os.makedirs(f'data/sample_data/{sample_subdir}', exist_ok=True)

/Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and image analysis/Cyclist Detection YOLOv5


In [4]:
# Proportion of the dataset to be sampled:
#
# The kaggle dataset contains 13,672 images including 1,621 empty images for background, i.e. approximately 12%.
# https://github.com/ultralytics/yolov5/issues/5851 recommends ≥ 1500 images per class and 0-10% background images to reduce false positives.
# After experimenting with different proportions, 10% for overall sample and 20% background was used.
# The main reason for this was getting a small enough sample to train. 

sample_prop = 0.10
background_prop = 0.20

def get_random_files(source_dir, target_dir, ext, prop, seed=None):
    """Move a random sample of the ``ext`` files in ``source_dir`` to ``target_dir``.

    Parameters
    ----------
    source_dir : str
        Directory to sample from.
    target_dir : str
        Existing directory the chosen files are moved into (via ``os.rename``).
    ext : str
        File extension to match, without the leading dot (e.g. ``"txt"``).
    prop : float
        Proportion of the matching files to move. The resulting count is
        clamped to the range 0 .. number of matching files.
    seed : int, optional
        If given, a private random generator seeded with this value is used,
        so the same files are chosen on every run. If omitted, the module-level
        ``random`` generator is used.

    Returns
    -------
    None
    """
    suffix = f'.{ext}'
    # Filter by extension first so that stray files (names without a dot, other
    # file types) neither raise an error nor use up part of the sample.
    # Sorting makes a seeded selection independent of the directory listing order.
    candidates = sorted(name for name in os.listdir(source_dir) if name.endswith(suffix))

    # convert proportion of files to sample to a number
    sample_number = max(0, min(round(len(candidates) * prop), len(candidates)))

    rng = random if seed is None else random.Random(seed)
    # sample() picks distinct files, so exactly sample_number files are moved
    for name in rng.sample(candidates, sample_number):
        os.rename(os.path.join(source_dir, name), os.path.join(target_dir, name))


In [5]:
# Search the original labels directory and move any labels for images with no cyclists (empty label files) to a new directory

# Collect the names of the empty label files BEFORE moving anything: renaming
# entries out of a directory while os.scandir() is still iterating over it makes
# it unspecified whether the remaining entries are all visited.
labels_dir = f'{original_dataset_pathname}/labels'
with os.scandir(labels_dir) as my_dir:
    empty_label_names = [
        entry.name
        for entry in my_dir
        # only regular files count; a sub-directory must never be moved
        if entry.is_file() and entry.stat().st_size == 0
    ]

# An empty label file means the image has no annotated objects (background image)
for label_name in empty_label_names:
    os.rename(f'{labels_dir}/{label_name}',
              f'data/sample_data/labels_zero/{label_name}')

print(f'{len(os.listdir("data/sample_data/labels_zero"))} files selected')

1623 files selected


In [6]:
# Randomly select the specified proportion of labels and move to 'labels_sample'

# Move the requested share of the remaining (non-empty) label files into the sample
get_random_files(
    f'{original_dataset_pathname}/labels',
    'data/sample_data/labels_sample',
    "txt",
    sample_prop,
)

# Remember how many labels were picked here; a later cell subtracts this from
# the running total to report how many background labels were added
labels_chosen = len(os.listdir("data/sample_data/labels_sample"))

print(f'{labels_chosen} label files selected')

1205 label files selected


In [5]:
# Randomly select the specified proportion of zero labels and move to the sample labels directory

# Add the requested share of the empty (background) labels to the sample.
# Relies on `labels_chosen` from the cell that sampled the non-empty labels.
get_random_files(
    'data/sample_data/labels_zero',
    'data/sample_data/labels_sample',
    "txt",
    background_prop,
)

total_labels_in_sample = len(os.listdir("data/sample_data/labels_sample"))

print(f'{total_labels_in_sample - labels_chosen} zero label files selected')

print(f'{total_labels_in_sample} total label files selected')

325 zero label files selected
1530 total label files selected


In [7]:
# select a sample of the images by matching against the sampled label filenames

# For every sampled label <stem>.txt, move <stem>.jpg out of the original image folder
sampled_labels_dir = 'data/sample_data/labels_sample'
for label_name in os.listdir(sampled_labels_dir):
    if not label_name.endswith('.txt'):
        continue
    jpg_name = label_name[:-len('.txt')] + '.jpg'
    jpg_source = os.path.join(original_dataset_pathname, 'images', jpg_name)
    # Labels whose image is not present in the original dataset are skipped silently
    if os.path.exists(jpg_source):
        shutil.move(jpg_source, os.path.join('data/sample_data/images_sample', jpg_name))

print(f'{len(os.listdir("data/sample_data/images_sample"))} total image files selected')

1205 total image files selected


In [10]:
# Sanity check: the cells below create directories relative to the current working directory
print(os.getcwd())

/Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and image analysis/Cyclist Detection YOLOv5


In [12]:
# Create data directories in the structure required by the YOLO model

def create_data_directories(data_name):
    """Create the images/ and labels/ folders, each with train, valid and test
    sub-folders, under ./data/<data_name>.

    Folders that already exist are left untouched, so the call can be repeated.
    """
    dataset_root = Path(f"./data/{data_name}")
    for content_kind in ("images", "labels"):
        for split in ("train", "valid", "test"):
            (dataset_root / content_kind / split).mkdir(parents=True, exist_ok=True)

    
# Build the folder tree for the 'cyclist_data' dataset under ./data (relative to the current working directory)
create_data_directories('cyclist_data')

In [13]:
# Populate the train, validation and test folders from the sampled data
# Split as follows:
# 60% train = 918
# 20% validation = 306
# 20% test = 306 files

# Sanity check: the move_files calls in this cell use paths relative to the current working directory
print(os.getcwd())

def move_files(source_dir, target_dir, ext, prop):
    """Move a leading share of the ``ext`` files in ``source_dir`` to ``target_dir``.

    The files are taken in the order returned by ``os.listdir`` (no shuffling
    is done here).

    Parameters
    ----------
    source_dir : str
        Directory to take files from.
    target_dir : str
        Existing directory the files are moved into (via ``os.rename``).
    ext : str
        File extension to match, without the leading dot (e.g. ``"txt"``).
    prop : float
        Proportion of the matching files currently in ``source_dir`` to move.
        The resulting count is clamped to 0 .. number of matching files, so
        ``prop=1`` moves everything that is left.

    Returns
    -------
    None
    """
    suffix = f'.{ext}'
    # Filter by extension up front so that stray files (names without a dot,
    # other file types) neither raise an error nor take up part of the share
    matching = [name for name in os.listdir(source_dir) if name.endswith(suffix)]

    # convert proportion of files to move to a number
    sample_number = max(0, min(round(len(matching) * prop), len(matching)))

    for name in matching[:sample_number]:
        os.rename(os.path.join(source_dir, name), os.path.join(target_dir, name))


# Moving the files. Logic as follows. For training data, the proportion is 0.6 because 60% is used for training.
# 40% is left. Therefore the proportion for validation is set to 0.5 (half of the remainder, i.e. 20% of the dataset).
# Which leaves 20% for testing: the proportion is 1.0 because all the remaining data is moved to the test directory.

# NOTE (06-08-2023): this put the files in the wrong directory because the directory structure was created in the wrong folder
                
# Each step takes its share of whatever is still left in labels_sample,
# so the order of the splits matters
for split_name, share_of_remaining in (('train', 0.6), ('valid', 0.5), ('test', 1)):
    move_files('data/sample_data/labels_sample',
               f'data/cyclist_data/labels/{split_name}',
               "txt",
               share_of_remaining)



/Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and image analysis/Cyclist Detection YOLOv5


In [14]:
# move the training images by matching to labels

# For every training label <stem>.txt, move <stem>.jpg from the sampled images into images/train
for label_name in os.listdir('data/cyclist_data/labels/train'):
    if not label_name.endswith('.txt'):
        continue
    jpg_name = label_name[:-len('.txt')] + '.jpg'
    jpg_source = os.path.join('data/sample_data/images_sample', jpg_name)
    # Labels without a sampled image are skipped silently
    if os.path.exists(jpg_source):
        shutil.move(jpg_source, os.path.join('data/cyclist_data/images/train', jpg_name))


In [15]:
# move the validation images by matching to labels

# For every validation label <stem>.txt, move <stem>.jpg from the sampled images into images/valid
for label_name in os.listdir('data/cyclist_data/labels/valid'):
    if not label_name.endswith('.txt'):
        continue
    jpg_name = label_name[:-len('.txt')] + '.jpg'
    jpg_source = os.path.join('data/sample_data/images_sample', jpg_name)
    # Labels without a sampled image are skipped silently
    if os.path.exists(jpg_source):
        shutil.move(jpg_source, os.path.join('data/cyclist_data/images/valid', jpg_name))


In [16]:
# move the test images by matching to labels

# For every test label <stem>.txt, move <stem>.jpg from the sampled images into images/test
for label_name in os.listdir('data/cyclist_data/labels/test'):
    if not label_name.endswith('.txt'):
        continue
    jpg_name = label_name[:-len('.txt')] + '.jpg'
    jpg_source = os.path.join('data/sample_data/images_sample', jpg_name)
    # Labels without a sampled image are skipped silently
    if os.path.exists(jpg_source):
        shutil.move(jpg_source, os.path.join('data/cyclist_data/images/test', jpg_name))


**NB: the code below this point is the original code for training the model**

In [None]:
# install YOLO v5

# NOTE(review): the next line is not a valid IPython magic as written. It looks
# like a truncated/garbled command that fetched the YOLOv5 repository (e.g. a
# `git clone` or a `pip install git+https://...`) — TODO confirm and restore it.
%t+https://github.com/ultralytics/yolov5.git
# Change into the YOLOv5 checkout (absolute path on the original system) and install its requirements.
# NOTE(review): other paths in this notebook use '.../INM705/705 Coursework/...' rather than
# '.../INM705/INM705 Coursework/...' — confirm which one is correct.
%cd /users/addj212/INM705/INM705 Coursework/yolov5
%pip install -r requirements.txt
# Show the resulting working directory
os.getcwd()

In [None]:
%git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -r requirements.txt  # install

In [16]:
# Switch into the YOLOv5 checkout so that train.py / val.py and relative paths such as
# data/cyclists.yaml used by the cells below resolve. The working directory is printed
# before and after the change as a check.
print(os.getcwd())
# Alternative location of the checkout on another system (kept for reference):
# os.chdir("/users/addj212/INM705/705 Coursework/yolov5")
os.chdir("/Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and image analysis/yolov5")
print(os.getcwd())

/Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and image analysis
/Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and image analysis/yolov5


In [20]:
# Train the model: 10 epochs at image size 1280 with batch size 8, starting from the
# yolov5n6.pt weights; the dataset definition is read from data/cyclists.yaml.
# --freeze 12 presumably keeps the first 12 layers fixed during training — confirm against the YOLOv5 docs.

# Output is saved under the 'runs' folder (this run logged to runs/train/exp)

!python train.py --data data/cyclists.yaml --img 1280 --batch 8 --epochs 10 --weights yolov5n6.pt --freeze 12


[34m[1mtrain: [0mweights=yolov5n6.pt, cfg=, data=data/cyclists.yaml, hyp=data/hyps/hyp.scratch-low.yaml, epochs=10, batch_size=8, imgsz=1280, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=exp, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[12], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 ✅
fatal: cannot change to '/Users/charlesciumei/Dropbox/!!': No such file or directory
YOLOv5 🚀 2023-8-5 Python-3.9.7 torch-2.0.1 CPU

[34m[1mhyperparameters: [0mlr0=0.01, lrf=0.01, momentum=0.937, weight_decay=0.0005, warmup_epochs=3.0, warmup_momentum=0.8, warmup_bias_lr=0.1, box=0.05, cls=0.5, cls_pw=1.0, obj

[34m[1mtrain: [0mScanning /Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and i[0m
[34m[1mtrain: [0mNew cache created: /Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and image analysis/datasets/cyclist_data/labels/train.cache
[34m[1mval: [0mScanning /Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and ima[0m
[34m[1mval: [0mNew cache created: /Users/charlesciumei/Dropbox/!! python notebooks/INM705 DL and image analysis/datasets/cyclist_data/labels/valid.cache

[34m[1mAutoAnchor: [0m5.07 anchors/target, 0.992 Best Possible Recall (BPR). Current anchors are a good fit to dataset ✅
Plotting labels to runs/train/exp/labels.jpg... 
Image sizes 1280 train, 1280 val
Using 8 dataloader workers
Logging results to [1mruns/train/exp[0m
Starting training for 10 epochs...

      Epoch    GPU_mem   box_loss   obj_loss   cls_loss  Instances       Size
        0/9         0G    0.08129    0.03654          0          7       1280: 1
                 Cl

In [19]:
# Fine-tuning: continue for 100 epochs from the best weights of run runs/train/exp13,
# using the hyperparameters in data/hyps/hyp.VOC.yaml. No --freeze is passed here
# (the run log reports freeze=[0]). Results go to the 'runs_cyclists' project under the name 'fine-tuning'.

!python train.py --hyp 'data/hyps/hyp.VOC.yaml' --img 1280 --batch 8 --epochs 100 --data 'data/cyclists.yaml' --weights 'runs/train/exp13/weights/best.pt' --project 'runs_cyclists' --name 'fine-tuning'

[34m[1mtrain: [0mweights=runs/train/exp13/weights/best.pt, cfg=, data=data/cyclists.yaml, hyp=data/hyps/hyp.VOC.yaml, epochs=100, batch_size=8, imgsz=1280, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs_cyclists, name=fine-tuning, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[0], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mup to date with https://github.com/ultralytics/yolov5 ✅
fatal: cannot change to '/Users/charlesciumei/Dropbox/!!': No such file or directory
YOLOv5 🚀 2023-8-5 Python-3.9.7 torch-2.0.1 CPU

[34m[1mhyperparameters: [0mlr0=0.00334, lrf=0.15135, momentum=0.74832, weight_decay=0.00025, warmup_epochs=3.3835, warmup_momentum=0.59462, warmup_bias_lr

In [13]:
# Testing the model against unseen data: val.py is run with --task test on the dataset
# defined in data/cyclists.yaml, using the best weights from the 'fine-tuning2' run.
# --augment is enabled (presumably test-time augmentation — confirm against the YOLOv5 docs).
# Results go to the 'runs_cyclists' project under the name 'Test'.

!python val.py --img 1280 --batch 8 --data 'data/cyclists.yaml' --weights 'runs_cyclists/fine-tuning2/weights/best.pt' --task test --project 'runs_cyclists' --name 'Test' --augment

[34m[1mval: [0mdata=data/cyclists.yaml, weights=['runs_cyclists/fine-tuning2/weights/best.pt'], batch_size=8, imgsz=1280, conf_thres=0.001, iou_thres=0.6, max_det=300, task=test, device=, workers=8, single_cls=False, augment=True, verbose=False, save_txt=False, save_hybrid=False, save_conf=False, save_json=False, project=runs_cyclists, name=Test, exist_ok=False, half=False, dnn=False
Unknown option: -C
usage: git [--version] [--help] [-c name=value]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p|--paginate|--no-pager] [--no-replace-objects] [--bare]
           [--git-dir=<path>] [--work-tree=<path>] [--namespace=<name>]
           <command> [<args>]
YOLOv5 🚀 2023-4-10 Python-3.9.5 torch-1.10.0 CUDA:0 (A100-PCIE-80GB, 81251MiB)

Fusing layers... 
Model summary: 206 layers, 12308200 parameters, 0 gradients, 16.1 GFLOPs
[34m[1mtest: [0mScanning /users/addj212/INM705/705 Coursework/datasets/cyclist_data/labels[0m
[34m[1mtest: [0mNew cach

In [None]:
# Train for 300 epochs with a smaller image size (640 instead of 1280), in the hope of speeding training up.
# Note this run starts from the yolov5s6.pt weights rather than yolov5n6.pt.

# The per-epoch results were cut off in the browser due to a Hyperion timeout, but the model did run for the full number of epochs. Results saved to runs/train/exp14.

!python train.py --data data/cyclists.yaml --img 640 --batch 8 --epochs 300 --weights yolov5s6.pt --freeze 12

[34m[1mtrain: [0mweights=yolov5s6.pt, cfg=, data=data/cyclists.yaml, hyp=data/hyps/hyp.scratch-low.yaml, epochs=300, batch_size=8, imgsz=640, rect=False, resume=False, nosave=False, noval=False, noautoanchor=False, noplots=False, evolve=None, bucket=, cache=None, image_weights=False, device=, multi_scale=False, single_cls=False, optimizer=SGD, sync_bn=False, workers=8, project=runs/train, name=exp, exist_ok=False, quad=False, cos_lr=False, label_smoothing=0.0, patience=100, freeze=[12], save_period=-1, seed=0, local_rank=-1, entity=None, upload_dataset=False, bbox_interval=-1, artifact_alias=latest
[34m[1mgithub: [0mskipping check (offline), for updates see https://github.com/ultralytics/yolov5
Unknown option: -C
usage: git [--version] [--help] [-c name=value]
           [--exec-path[=<path>]] [--html-path] [--man-path] [--info-path]
           [-p|--paginate|--no-pager] [--no-replace-objects] [--bare]
           [--git-dir=<path>] [--work-tree=<path>] [--namespace=<name>]
      