**Загрузка датасета**

In [1]:
from PreapreDataLib import *
from download import download

In [3]:
# Where the raw dataset ends up and where the tarball is unpacked.
RAW_DOWNLOAD_DIR = './DatasetRAW/'
TEMP_DIR = './temp/'
REPO_TARBALL_URL = 'https://github.com/biankatpas/Cracks-and-Potholes-in-Road-Images-Dataset/tarball/master'
# GitHub names the tarball's root folder "<owner>-<repo>-<short sha>". The sha
# suffix changes whenever master moves, so the folder is matched by prefix
# instead of hard-coding one particular commit.
REPO_DIR_PREFIX = 'biankatpas-Cracks-and-Potholes-in-Road-Images-Dataset-'


def find_extracted_repo_dirs(temp_dir, prefix):
    """Return the sorted paths of directories in `temp_dir` whose name starts with `prefix`."""
    candidates = (os.path.join(temp_dir, name)
                  for name in os.listdir(temp_dir)
                  if name.startswith(prefix))
    return sorted(path for path in candidates if os.path.isdir(path))


create_folder_if_not_exists(RAW_DOWNLOAD_DIR)
create_folder_if_not_exists(TEMP_DIR)

# Fetch and unpack only when the raw folder holds at most one entry.
if len(os.listdir(RAW_DOWNLOAD_DIR)) <= 1:
    extracted_dirs = find_extracted_repo_dirs(TEMP_DIR, REPO_DIR_PREFIX)
    if not extracted_dirs:
        # No previously extracted copy is lying around: download and unpack it.
        download(REPO_TARBALL_URL, TEMP_DIR, kind='tar.gz', replace=True)
        extracted_dirs = find_extracted_repo_dirs(TEMP_DIR, REPO_DIR_PREFIX)
    if not extracted_dirs:
        raise FileNotFoundError(
            'No directory starting with ' + repr(REPO_DIR_PREFIX) + ' found in ' + TEMP_DIR)

    # If several extracted copies exist, the first one in sorted order is used.
    repo_dir = extracted_dirs[0]
    shutil.copytree(os.path.join(repo_dir, 'Dataset'), RAW_DOWNLOAD_DIR,
                    dirs_exist_ok=True)
    # The extracted copy is no longer needed once its Dataset folder is copied.
    shutil.rmtree(repo_dir)

len(os.listdir(RAW_DOWNLOAD_DIR))

2235

**Преобразование датасета в формат MS-COCO с разбиением на тренировочную, валидационную и тестовую выборки**

In [8]:
import math
import random
import json

In [10]:
# Absolute locations of the raw dataset and of the COCO-format output tree.
RAW_DIR = os.path.abspath('./DatasetRAW/')
DATASET_DIR = os.path.abspath('./DatasetCocoFormat/')

# One sub-folder per split, plus one that receives the annotation JSON files.
TRAIN_DIR, TEST_DIR, VAL_DIR, ANN_DIR = (
    os.path.join(DATASET_DIR, sub_name)
    for sub_name in ('train', 'test', 'val', 'annotations')
)

# Create the output tree, parent folder first.
for output_dir in (DATASET_DIR, TRAIN_DIR, TEST_DIR, VAL_DIR, ANN_DIR):
    create_folder_if_not_exists(output_dir)

# Only directories inside RAW_DIR are handed to process_image_and_masks by the
# split loop, so only directories are counted here. Counting stray files too
# would make the expected split sizes disagree with what is actually processed.
# Sorting gives a stable starting order regardless of the file system.
paths = sorted(
    entry for entry in os.listdir(RAW_DIR)
    if os.path.isdir(os.path.join(RAW_DIR, entry))
)
img_count = len(paths)

# Target fractions of the dataset for each split.
train_prob = 0.65
val_prob = 0.2
test_prob = 0.15
assert math.isclose(train_prob + val_prob + test_prob, 1.0), 'split fractions must sum to 1'

# Test and validation sizes are truncated; the training split takes the
# remainder, so the three counts always add up to img_count.
test_count = math.trunc(img_count * test_prob)
val_count = math.trunc(img_count * val_prob)
train_count = img_count - test_count - val_count

print('Overall count:', img_count)
print('Train count:', train_count)
print('Validation count:', val_count)
print('Test count:', test_count)

# Class label -> category id; handed to every CocoDataset created below.
categories = dict(lane=1, crack=2, pothole=3)

# View over the (label, id) pairs.
# NOTE(review): `a` is not used in any visible cell - confirm before removing.
a = categories.items()

# Fixed seed so that a fresh run reproduces exactly the same split.
SPLIT_SEED = 42

# Build the split only when all three output folders are still empty.
if len(os.listdir(TRAIN_DIR)) <= 0 and len(os.listdir(TEST_DIR)) <= 0 and len(os.listdir(VAL_DIR)) <= 0:
    train_data_ann = CocoDataset(categories)
    val_data_ann = CocoDataset(categories)
    test_data_ann = CocoDataset(categories)

    # Shuffle a sorted copy: `paths` itself stays untouched, and the order is
    # deterministic because both the starting order and the seed are fixed.
    rand_paths = sorted(paths)
    random.Random(SPLIT_SEED).shuffle(rand_paths)

    # `i` counts processed directories only; non-directory entries are skipped
    # without consuming a slot in any split.
    i = 0
    for pth in rand_paths:
        SRC_PATH = os.path.join(RAW_DIR, pth)
        if not os.path.isdir(SRC_PATH):
            continue
        # The first `test_count` directories go to test, the next `val_count`
        # to validation, and everything after that to train.
        if i < test_count:
            target_dir, target_ann = TEST_DIR, test_data_ann
        elif i < test_count + val_count:
            target_dir, target_ann = VAL_DIR, val_data_ann
        else:
            target_dir, target_ann = TRAIN_DIR, train_data_ann
        process_image_and_masks(SRC_PATH, target_dir, target_ann)
        i = i + 1

    # NOTE(review): the `- 1` presumably discounts a placeholder entry that
    # CocoDataset keeps in `images` - confirm against PreapreDataLib.
    ann_test_count = len(test_data_ann.images) - 1
    ann_val_count = len(val_data_ann.images) - 1
    ann_train_count = len(train_data_ann.images) - 1

    print()
    print('Annotated Overall count:', ann_test_count+ann_val_count+ann_train_count)
    print('Annotated Train count:', ann_train_count)
    print('Annotated Validation count:', ann_val_count)
    print('Annotated Test count:', ann_test_count)

    # The annotation containers must match the planned split sizes.
    assert ann_test_count == test_count
    assert ann_val_count == val_count
    assert ann_train_count == train_count

    # Write one "<split>.json" annotation file per split into ANN_DIR.
    for split_name, split_ann in (("train", train_data_ann),
                                  ("val", val_data_ann),
                                  ("test", test_data_ann)):
        with open(os.path.join(ANN_DIR, split_name + ".json"), "w") as out:
            json.dump(split_ann.get_as_dict(), out, sort_keys=True, indent=4)

# Count the entries actually present in each split folder on disk.
real_test_count, real_val_count, real_train_count = (
    len(os.listdir(split_dir)) for split_dir in (TEST_DIR, VAL_DIR, TRAIN_DIR)
)

print()
print('Real Overall count:', real_test_count + real_val_count + real_train_count)
print('Real Train count:', real_train_count)
print('Real Validation count:', real_val_count)
print('Real Test count:', real_test_count)

# The folders on disk must match the planned split sizes.
assert real_test_count == test_count
assert real_val_count == val_count
assert real_train_count == train_count

Overall count: 2235
Train count: 1453
Validation count: 447
Test count: 335

Real Overall count: 2235
Real Train count: 1453
Real Validation count: 447
Real Test count: 335
