# Convert data to YOLOv5 PyTorch format

> **1. Set the path to the data you want to prepare.**

> **2. Split the data into train, validation, and test sets.**

> **3. Move the files to their respective folders.**

> **4. Read the .json file and prepare the .yaml file accordingly.**

In [1]:
import os
import json
import yaml
import shutil
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
import os

# Folder of interest, given relative to the current working directory.
relative_path = "../Yolov5/data"

# Expand it to a full path and show it, so it is clear which folder is meant.
absolute_path = os.path.abspath(relative_path)
print("Absolute Path:", absolute_path)

Absolute Path: /home/zkhan1/YOLOv5_FOSOCO_Dataset/Yolov5/data


In [2]:
# Source folders: raw images and the label files that describe them.
IMAGES_PATH, LABELS_PATH = "./data/images", "./data/labels"

In [3]:
# Images
images = os.listdir(IMAGES_PATH)

# Read labels. Sorted because os.listdir returns entries in arbitrary order,
# which would make the split differ between runs even with a fixed seed.
labels = sorted(os.listdir(LABELS_PATH))

# Split data: 85% train, then the 15% hold-out is divided 80/20 into
# validation (12% overall) and test (3% overall). A fixed seed keeps the
# split reproducible across re-runs of this cell.
SPLIT_SEED = 42
train, holdout = train_test_split(
    labels, test_size=0.15, shuffle=True, random_state=SPLIT_SEED
)
valid, test = train_test_split(holdout, test_size=0.2, random_state=SPLIT_SEED)

print(f"train: {len(train)}; valid: {len(valid)}; test: {len(test)}; images: {len(images)}; labels: {len(labels)}")

train: 96055; valid: 13560; test: 3391; images: 113006; labels: 113006


In [None]:
# Create the images/labels folders for every split. exist_ok=True lets this
# cell be re-run without raising FileExistsError once the folders exist.
for split in ("test", "train", "valid"):
    for kind in ("images", "labels"):
        os.makedirs(f"./data/{split}/{kind}", exist_ok=True)

In [None]:
def find_files_by_name(directory_path, target_name):
    """Look up the image file whose base name is ``target_name``.

    Args:
        directory_path: Folder to search (not recursive).
        target_name: File name without its extension.

    Returns:
        A ``(found, info)`` tuple:
        * ``(True, "png")`` or ``(True, "jpg")`` when a matching image exists;
        * ``(False, filename)`` when only a same-named file with another
          extension exists;
        * ``(False, target_name)`` when nothing matches at all.
    """
    # Fast path: probe the two supported extensions directly instead of
    # listing (and stat-ing) the whole folder on every call.
    for extension in ("png", "jpg"):
        filename = f"{target_name}.{extension}"
        if os.path.isfile(os.path.join(directory_path, filename)):
            print(f"File found: {filename}")
            return True, extension

    # Slow path, only reached on a miss: report a same-named file that has
    # an unsupported extension, if there is one.
    for filename in os.listdir(directory_path):
        base_name, _ = os.path.splitext(filename)
        if base_name == target_name and os.path.isfile(
            os.path.join(directory_path, filename)
        ):
            print(f"File found: {filename}")
            return False, filename

    # Always return a 2-tuple so callers can unpack the result safely.
    return False, target_name

In [None]:
def move_files_to_dir(files, dirname):
    """Move each label file and its matching image into ``dirname``.

    For every ``.txt`` name in ``files``, the image with the same base name
    is looked up in ``IMAGES_PATH`` via ``find_files_by_name``. When a png/jpg
    image is found, the image is moved to ``<dirname>/images`` and the label
    to ``<dirname>/labels``; otherwise a message is printed and nothing is
    moved for that entry.

    Args:
        files: Iterable of label file names located in ``LABELS_PATH``.
        dirname: Destination split folder; expected to already contain the
            ``images`` and ``labels`` subfolders.
    """
    label_suffix = ".txt"
    for label_filename in tqdm(files):
        print(label_filename)
        if not label_filename.endswith(label_suffix):
            print("File with wrong extension.")
            continue

        stem = label_filename[: -len(label_suffix)]

        # The lookup can come back as None when nothing matches; treat that
        # as a miss instead of failing on tuple unpacking.
        lookup = find_files_by_name(IMAGES_PATH, stem)
        if lookup is None:
            print(f"No image found for label: {label_filename}")
            continue

        check, extension = lookup
        if not check:
            print(f"Irregular File found: {extension}")
            continue

        image_filename = f"{stem}.{extension}"
        shutil.move(
            os.path.join(IMAGES_PATH, image_filename),
            os.path.join(dirname, "images", image_filename),
        )
        shutil.move(
            os.path.join(LABELS_PATH, label_filename),
            os.path.join(dirname, "labels", label_filename),
        )

# Move splits to folders. All three splits are moved so that the test and
# valid folders are populated as well as train.
for split_files, split_dir in (
    (train, "./data/train"),
    (test, "./data/test"),
    (valid, "./data/valid"),
):
    move_files_to_dir(split_files, split_dir)

In [None]:
# Load the dataset description.
# NOTE(review): NOTES_PATH is not defined in any visible cell; it must be set
# (presumably to the exported notes .json file) before this cell is run.
with open(NOTES_PATH) as notes_file:
    descr_darknet = json.load(notes_file)

train_path = "./data/train/images"
test_path = "./data/test/images"
valid_path = "./data/valid/images"

categories = descr_darknet["categories"]
nc = len(categories)
# An entry that is a dict with a "name" field contributes just that name, so
# the class list holds names rather than whole records; any other entry is
# kept unchanged.
names = [
    category["name"] if isinstance(category, dict) and "name" in category else category
    for category in categories
]

print(
    f"train: {train_path}\n"
    f"test: {test_path}\n"
    f"val: {valid_path}\n\n"
    f"nc: {nc}\n"
    f"names: {names}",
)

In [None]:
# Write the dataset configuration (split paths, class count, class names).
yaml_path = "./data_testing/data.yaml"

# open(..., "w") does not create missing parent folders, so make sure the
# target folder exists first.
os.makedirs(os.path.dirname(yaml_path), exist_ok=True)

with open(yaml_path, "w") as file:
    yaml.dump(
        {
            "train": train_path,
            "test": test_path,
            "val": valid_path,
            "nc": nc,
            "names": [f'{name}' for name in names],
        },
        stream=file,
        default_flow_style=None,
    )