In [1]:
import os
import shutil

In [2]:
import numpy as np
import pandas as pd
import yaml
import csv
from sklearn.model_selection import train_test_split

In [3]:
from tqdm import tqdm
import time

In [23]:
# Debug switch: when True, the driver cell truncates the image/label path
# lists and the key/label arrays to their first GLOBAL_DEBUG_VAL entries.
GLOBAL_DEBUG = False
# Number of leading entries kept when GLOBAL_DEBUG is enabled.
GLOBAL_DEBUG_VAL = 500

In [5]:
def copy_files(file_paths, output_folder):
    """Copy every file in ``file_paths`` into ``output_folder``.

    Each file keeps its base name; a progress bar is shown while copying.

    Args:
        file_paths: Sequence of source file paths.
        output_folder: Destination directory (must already exist).
    """
    for source_path in tqdm(file_paths):
        destination = os.path.join(output_folder, os.path.basename(source_path))
        shutil.copyfile(source_path, destination)

In [34]:
def csv_to_dataframe(file_path, output_path='Z:/CompCarsYOLO/data/balanced_data.csv', random_state=42):
    """Load the annotation CSV and down-sample every class to the rarest class's size.

    The per-class counts and the smallest class size are printed, then each
    ``car_type_id`` group is sampled (without replacement) down to that size.
    The balanced table is written to ``output_path`` and returned.

    Args:
        file_path: Path of the CSV to read; it must contain a ``car_type_id`` column.
        output_path: Where the balanced CSV is written. Defaults to the
            location this notebook has always used.
        random_state: Seed forwarded to ``DataFrame.sample`` so repeated runs
            select the same rows. Pass ``None`` for unseeded sampling.

    Returns:
        pandas.DataFrame: The balanced rows, grouped by class in descending
        order of original class frequency.
    """
    df = pd.read_csv(file_path)

    counts = df['car_type_id'].value_counts(sort=True)
    cat_counts = pd.DataFrame({'car_type_id': counts.index, 'count': counts.values})
    min_samples = counts.min()

    print(cat_counts)
    print(min_samples)

    if counts.empty:
        # Nothing to sample from; keep the columns so the written CSV still has a header.
        df_filtered = df.iloc[0:0]
    else:
        # Collect the per-class samples first and concatenate once, instead of
        # re-copying the growing frame on every iteration.
        per_class_samples = [
            df[df['car_type_id'] == car_type].sample(int(min_samples), random_state=random_state)
            for car_type in counts.index
        ]
        df_filtered = pd.concat(per_class_samples)

    df_filtered.to_csv(output_path, index=False)

    return df_filtered

In [16]:
def resize_path_load(image_paths, label_paths, df_filtered):
    """Keep only the image/label path pairs whose image file name is listed in ``df_filtered``.

    ``image_paths`` and ``label_paths`` are treated as parallel sequences:
    the label kept for an image is the one at the same index.

    Args:
        image_paths: Sequence of image file paths.
        label_paths: Sequence of label file paths, index-aligned with ``image_paths``.
        df_filtered: DataFrame with an ``image_name`` column of wanted file names.

    Returns:
        tuple[list, list]: The retained image paths and their label paths,
        in their original order.
    """
    # Build the lookup once: set membership is O(1), whereas `name in ndarray`
    # rescans the whole column for every single image path.
    wanted_names = set(df_filtered['image_name'].values)

    resized_image_paths = []
    resized_label_paths = []
    for i in tqdm(range(len(image_paths))):
        if os.path.basename(image_paths[i]) in wanted_names:
            resized_image_paths.append(image_paths[i])
            resized_label_paths.append(label_paths[i])
    return resized_image_paths, resized_label_paths
                

In [8]:
def generate_string(values, names):
    """Look up ``names[value]`` for every entry of ``values``.

    Args:
        values: Iterable of indices/keys into ``names``.
        names: Indexable container (e.g. list or dict) of names.

    Returns:
        list: The looked-up names, in the order of ``values``.
    """
    return [names[value] for value in values]

In [9]:
def df_column_combined(dataframe, names):
    """Return image names and their class names, both ordered by image name.

    Every ``car_type_id`` is translated to ``names[car_type_id]``; the rows are
    then sorted by ``image_name``. The input DataFrame is not modified.

    Args:
        dataframe: DataFrame with ``image_name`` and ``car_type_id`` columns.
        names: Indexable container mapping a ``car_type_id`` to its class name.

    Returns:
        tuple: ``(images, labels)`` arrays of equal length, where ``labels[i]``
        is the class name of ``images[i]``.
    """
    # .assign builds a new frame, so the caller's DataFrame does not silently
    # gain a 'car_type_name' column.
    df_named = dataframe.assign(
        car_type_name=[names[type_id] for type_id in dataframe['car_type_id']]
    )
    df_sorted = df_named.sort_values(by='image_name')

    return df_sorted['image_name'].values, df_sorted['car_type_name'].values

In [10]:
def data_selector(images_paths, labels_paths, image_keys):
    """Map each key to its ``(image_path, label_path)`` pair.

    The three inputs are paired positionally: ``image_keys[i]`` maps to
    ``(images_paths[i], labels_paths[i])``.

    Args:
        images_paths: Iterable of image file paths.
        labels_paths: Iterable of label file paths.
        image_keys: Iterable of keys, one per path pair.

    Returns:
        dict: ``{key: (image_path, label_path)}``.

    Raises:
        ValueError: If the three inputs do not have the same length. A plain
            ``zip`` would silently truncate and hide a key/path misalignment.
    """
    images_paths = list(images_paths)
    labels_paths = list(labels_paths)
    image_keys = list(image_keys)

    if not (len(image_keys) == len(images_paths) == len(labels_paths)):
        raise ValueError(
            'data_selector needs equally long inputs, got '
            f'{len(image_keys)} keys, {len(images_paths)} image paths and '
            f'{len(labels_paths)} label paths'
        )

    return {key: (imgp, lbp) for key, imgp, lbp in zip(image_keys, images_paths, labels_paths)}

In [11]:
def data_stratification(x, y, dictionary, output_folder, train_size=0.7, random_seed=42,
                        holdout_test_fraction=0.33333):
    """Split the data into stratified train/val/test sets and copy the files into place.

    ``x``/``y`` are first split into a training part (``train_size``) and a
    hold-out part; the hold-out part is then split into validation and test.
    Both splits are stratified on the labels and use ``random_seed``. For each
    split, the image and label files of its keys are copied to
    ``<output_folder>/<split>/images/`` and ``<output_folder>/<split>/labels/``.

    Files already present in the output folders are not removed.

    Args:
        x: Keys identifying the samples (looked up in ``dictionary``).
        y: Labels aligned with ``x``; used only for stratification.
        dictionary: Mapping ``key -> (image_path, label_path)``. Keys of ``x``
            that are absent from it are skipped.
        output_folder: Root folder that receives the ``train``, ``val`` and
            ``test`` sub-folders.
        train_size: Fraction of the samples assigned to the training split.
        random_seed: Seed passed to both ``train_test_split`` calls.
        holdout_test_fraction: Fraction of the hold-out part (everything that
            is not training data) assigned to the test split; the rest is
            validation.
    """
    split_names = ('train', 'val', 'test')
    images_folders = {split: f'{output_folder}/{split}/images/' for split in split_names}
    labels_folders = {split: f'{output_folder}/{split}/labels/' for split in split_names}

    # Create every folder unconditionally: a split folder left behind without
    # its images/ or labels/ sub-folder would otherwise make the copy fail.
    for split in split_names:
        os.makedirs(images_folders[split], exist_ok=True)
        os.makedirs(labels_folders[split], exist_ok=True)

    # Split the keys; the file paths are resolved through `dictionary` afterwards.
    train_keys, holdout_keys, _, holdout_lbs = train_test_split(
        x, y, test_size=(1 - train_size), random_state=random_seed, stratify=y)
    val_keys, test_keys, _, _ = train_test_split(
        holdout_keys, holdout_lbs, test_size=holdout_test_fraction,
        random_state=random_seed, stratify=holdout_lbs)

    keys_by_split = {'train': train_keys, 'val': val_keys, 'test': test_keys}
    for split in split_names:
        path_pairs = [dictionary[key] for key in keys_by_split[split] if key in dictionary]
        copy_files([image_path for image_path, _ in path_pairs], images_folders[split])
        copy_files([label_path for _, label_path in path_pairs], labels_folders[split])

In [35]:
# Input locations: the dataset folder (images/, labels/, datanew.yaml) and the
# CSV that lists each image name with its car_type_id.
data_folder_path = 'Z:/CompCarsYOLO/data/ROBO_UPLOAD1/'
csv_file_path = 'Z:/CompCarsYOLO/data/yolo_car_type_only_classified_data.csv'
images_folder_path = f'{data_folder_path}/images/'
labels_folder_path = f'{data_folder_path}/labels/'
data_yaml = f'{data_folder_path}/datanew.yaml'

# Root folder that receives the train/val/test sub-folders.
output_folder_path = 'Z:/CompCarsYOLO/data/segmented_only_classified_data/'

with open(data_yaml, 'r') as file:
    yaml_loader = yaml.safe_load(file)

# Class names, indexed by car_type_id when translating ids to names.
classes = yaml_loader.get('names')

# NOTE(review): the two listings are sorted independently and then paired by
# index; this assumes images/ and labels/ hold matching files in the same
# order - TODO confirm.
all_image_paths = sorted([os.path.join(images_folder_path, filename) for filename in os.listdir(images_folder_path)])
all_label_paths = sorted([os.path.join(labels_folder_path, filename) for filename in os.listdir(labels_folder_path)])
csv_df = csv_to_dataframe(csv_file_path)
x, y = df_column_combined(csv_df, classes)

image_paths, label_paths = resize_path_load(all_image_paths, all_label_paths, csv_df)

if GLOBAL_DEBUG:
    image_paths = image_paths[:GLOBAL_DEBUG_VAL]
    label_paths = label_paths[:GLOBAL_DEBUG_VAL]
    x = x[:GLOBAL_DEBUG_VAL]
    y = y[:GLOBAL_DEBUG_VAL]

# Key every (image, label) pair by the image's own file name rather than by
# position in `x`. `x` comes from the CSV while the paths come from the folder
# listing, so a duplicated or missing file name would otherwise shift the
# pairing and attach keys to the wrong files.
image_keys = [os.path.basename(path) for path in image_paths]
diction = data_selector(image_paths, label_paths, image_keys)

# data_stratification skips keys that have no entry; report them instead of
# letting them vanish unnoticed.
missing_keys = [key for key in x if key not in diction]
if missing_keys:
    print(f'Warning: {len(missing_keys)} image names from the CSV have no matching file and will be skipped')

data_stratification(x, y, diction, output_folder_path)

    car_type_id  count
0             2  32274
1             1  21809
2             3  16573
3             0   6149
4             9   6023
5             4   4729
6             5   2473
7             8   2473
8             7   2133
9             6   1215
10           11   1046
11           10    937
937


100%|████████████████████████████████████████████████████████████████████████| 136632/136632 [00:30<00:00, 4433.07it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 7870/7870 [00:31<00:00, 246.92it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 7870/7870 [00:27<00:00, 290.08it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2249/2249 [00:08<00:00, 257.52it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2249/2249 [00:08<00:00, 274.17it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1125/1125 [00:04<00:00, 247.98it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1125/1125 [00:04<00:00, 258.73it/s]
