In [1]:
import yaml
import os
from xml2yolo import json_to_yolo
import glob
import random
import itertools


In [2]:
# Load the dataset configuration and derive the locations of the list files.
with open("configDNVdata.yaml", 'r') as config_stream:
    config_file = yaml.safe_load(config_stream)

# Share of each scenario's images used for training and for validation;
# whatever is left after both shares becomes that scenario's test set.
TRAIN_SPLIT = 0.4
VAL_SPLIT = 0.1

# Root directory that contains one sub-directory per simulation scenario.
DATASET_DIR = config_file['dataset_src_dir']

# The 'train' and 'val' config entries are resolved against the dataset root.
TRAIN_PATH, VAL_PATH = (
    os.path.join(DATASET_DIR, config_file[split_key])
    for split_key in ('train', 'val')
)

In [3]:
# Scenario directory names, one per simulated weather / time-of-day condition.
# Each entry is joined onto DATASET_DIR, so it must match the on-disk name exactly.
SIMULATION_SCENARIOS = [
    '2023-05-09_1257_dnv_scenario1_full_00_000_AfternoonClear',
    '2023-05-09_1300_dnv_scenario1_full_00_000_AfternoonCloudy',
    '2023-05-09_1302_dnv_scenario1_full_00_000_AfternoonCloudy_Rain',
    '2023-05-09_1308_dnv_scenario1_full_00_000_EveningClear',
    '2023-05-09_1311_dnv_scenario1_full_00_000_EveningCloudy',
    '2023-05-09_1313_dnv_scenario1_full_00_000_EveningCloudy_Rain',
    '2023-05-09_1315_dnv_scenario1_full_00_000_FoggyClear',
    '2023-05-09_1317_dnv_scenario1_full_00_000_FoggyCloudy',
    '2023-05-09_1319_dnv_scenario1_full_00_000_MorningClear',
    '2023-05-09_1321_dnv_scenario1_full_00_000_MorningCloudy',
    '2023-05-09_1324_dnv_scenario1_full_00_000_NightClear',
    '2023-05-09_1326_dnv_scenario1_full_00_000_NightCloudy',
    '2023-05-09_1328_dnv_scenario1_full_00_000_NoonClear',
    '2023-05-09_1330_dnv_scenario1_full_00_000_NoonCloudy',
    '2023-05-09_1332_dnv_scenario1_full_00_000_NoonCloudy_Rain',
    '2023-05-09_1335_dnv_scenario1_full_00_000_Overcast',
    '2023-05-09_1339_dnv_scenario1_full_00_000_StormClouds',
    '2023-05-09_1341_dnv_scenario1_full_00_000_StormyClouds_Rain',
]

In [4]:
# Build the train / val / test image lists for every simulation scenario.
#
# Per scenario:
#   1. convert the labels with json_to_yolo,
#   2. list the images in a stable (sorted) order,
#   3. shuffle the leading TRAIN_SPLIT + VAL_SPLIT share into train and val,
#   4. write the remaining images, in order, to <scenario>/test.txt.
# The train and val lists of all scenarios are then shuffled and written to
# TRAIN_PATH and VAL_PATH.

# Image size in pixels handed to json_to_yolo; identical for every scenario.
image_height = 2056
image_width = 2464

# A dedicated, seeded generator makes the split reproducible across
# "Restart & Run All" without touching the global `random` state.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)


def _to_dataset_path(image_path):
    """Return `image_path` expressed as a path rooted at DATASET_DIR.

    The part of the path below DATASET_DIR is computed with os.path.relpath,
    so the result does not depend on how many directory levels DATASET_DIR
    has or on whether it ends with a path separator.
    """
    return os.path.join(DATASET_DIR, os.path.relpath(image_path, DATASET_DIR))


numb_train = 0
numb_val = 0
numb_test = 0

train_file_paths = []
val_file_paths = []

for simulation in SIMULATION_SCENARIOS:

    current_condition_dir = os.path.join(DATASET_DIR, simulation)
    image_dir = os.path.join(current_condition_dir, 'images/')
    label_dir = os.path.join(current_condition_dir, 'labels/')

    json_to_yolo(image_dir, label_dir, image_height, image_width)

    # glob returns files in a filesystem-dependent order. Sort them so the
    # positional split below is stable and the test images stay ordered.
    # NOTE(review): assumes the file names sort chronologically — confirm.
    files = sorted(glob.glob(os.path.join(image_dir, '*.jpg')))

    split_point_train = round(len(files) * TRAIN_SPLIT)
    split_point_val = round(len(files) * (TRAIN_SPLIT + VAL_SPLIT))

    # Only the train and val images are shuffled; the test images keep their
    # order so that the time dependence in the test data is preserved.
    test_files = files[split_point_val:]
    shuffled_files = files[:split_point_val]  # slicing already yields a new list
    split_rng.shuffle(shuffled_files)

    train_file_paths += shuffled_files[:split_point_train]
    val_file_paths += shuffled_files[split_point_train:]

    test_path = os.path.join(current_condition_dir, 'test.txt')
    with open(test_path, 'w') as file:
        file.write('\n'.join(_to_dataset_path(path) for path in test_files))

    numb_train += split_point_train
    numb_val += (split_point_val - split_point_train)
    numb_test += len(test_files)

    print('----------------------------------------------')
    print(simulation)
    print('# of train: ', split_point_train)
    print('# of val: ', split_point_val - split_point_train)
    print('# of test: ', len(test_files))
    print('----------------------------------------------')


# Mix the scenarios so that the list files are not grouped by condition.
split_rng.shuffle(train_file_paths)
split_rng.shuffle(val_file_paths)

# Mode 'w' replaces any list left over from a previous run.
with open(TRAIN_PATH, 'w') as file:
    file.write('\n'.join(_to_dataset_path(path) for path in train_file_paths))

with open(VAL_PATH, 'w') as file:
    file.write('\n'.join(_to_dataset_path(path) for path in val_file_paths))

----------------------------------------------
2023-05-09_1257_dnv_scenario1_full_00_000_AfternoonClear
# of train:  270
# of val:  68
# of test:  337
----------------------------------------------
----------------------------------------------
2023-05-09_1300_dnv_scenario1_full_00_000_AfternoonCloudy
# of train:  270
# of val:  68
# of test:  337
----------------------------------------------
----------------------------------------------
2023-05-09_1302_dnv_scenario1_full_00_000_AfternoonCloudy_Rain
# of train:  270
# of val:  68
# of test:  337
----------------------------------------------
----------------------------------------------
2023-05-09_1308_dnv_scenario1_full_00_000_EveningClear
# of train:  270
# of val:  68
# of test:  337
----------------------------------------------
----------------------------------------------
2023-05-09_1311_dnv_scenario1_full_00_000_EveningCloudy
# of train:  270
# of val:  68
# of test:  337
----------------------------------------------
------

In [5]:
# Report the image counts summed over all scenarios.
split_totals = (
    ('Total # of train: ', numb_train),
    ('Total # of val: ', numb_val),
    ('Total # of test: ', numb_test),
)

print('----------------------------------------------')
for total_label, total_count in split_totals:
    print(total_label, total_count)

----------------------------------------------
Total # of train:  4860
Total # of val:  1224
Total # of test:  6066


# Train

In [None]:
#yolo train model=/cluster/work/solveijm/MODSIM/runs/detect/train_1cls_100e_1024imgsz_only_hurtigruta_correct/weights/best.pt data='/cluster/work/solveijm/MODSIM/src/detection/datasetDNV.yaml' epochs=50 imgsz=1024 name=train_50e_1024imgz_mixed_synthetic_and_hurtigruta_correct
