In [7]:
import os
import glob
import shutil
import numpy as np
import pandas as pd
from PIL import Image

## Images

In [10]:
def png_to_jpg(src, dst):
    """
    Converts all images found under `src` to RGB jpg files stored under `dst`.

    `src` is walked recursively. Every image is written to
    ``dst/<name of the folder containing the image>/<original stem>.jpg``,
    so the output is always one folder level deep regardless of how deeply
    the image was nested in `src`. Files whose extension is not
    .jpg / .jpeg / .png (any letter case) are ignored.

    Parameters
    ----------
    src : str
        Root folder of the original images.
    dst : str
        Root folder for the converted images; created if it does not exist.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')

    for dir_, _, files in os.walk(src):
        # normpath drops a trailing separator, so the folder name is never
        # empty and the lookup does not depend on '/' being the separator.
        save_folder = os.path.join(dst, os.path.basename(os.path.normpath(dir_)))

        for file in files:
            # splitext keeps dots inside the name ("a.b.png" -> "a.b"), which
            # avoids two different images collapsing onto the same output file.
            stem, extension = os.path.splitext(file)
            if extension.lower() not in image_extensions:
                continue

            # makedirs also creates `dst` itself when it is missing.
            os.makedirs(save_folder, exist_ok=True)
            img_save_path = os.path.join(save_folder, stem + '.jpg')

            # The context manager releases the source file handle right away.
            with Image.open(os.path.join(dir_, file)) as img:
                img.convert('RGB').save(img_save_path)
    print('Completed')

In [None]:
# Convert the raw ExDark images into RGB jpg files under ExDark_processed.
png_to_jpg(
    src='/home/user/Documents/CV/Datasets/ExDark',
    dst='/home/user/Documents/CV/ExDark_processed/',
)

## Train/Test Split

In [15]:
# Folder with the converted dataset: one sub-folder of .jpg files per category
data_dir = '/home/user/Documents/CV/ExDark_processed/'
# Training data dir
training_dir = '/home/user/Documents/CV/train/'

# Test data dir
testing_dir = '/home/user/Documents/CV/test/'

# Ratio of training and testing data.
# NOTE: the function below has its own `train_test_ratio` parameter (default 0.8),
# so this value only takes effect when it is passed explicitly at the call site.
train_test_ratio = 0.75


def split_dataset_into_test_and_train_sets(all_data_dir = data_dir, training_data_dir = training_dir, testing_data_dir=testing_dir, train_test_ratio = 0.8, seed=None):
    """
    Randomly split the .jpg files of every category folder into train and test sets.

    Every folder below `all_data_dir` (the root itself is skipped) is treated
    as one category. Its .jpg files are shuffled, the first
    ``round(n * train_test_ratio)`` are copied to `training_data_dir` and all
    remaining ones to `testing_data_dir`, so each file ends up in exactly one
    of the two sets.

    Files are copied directly into the two output folders (no per-category
    sub-folders), so files with the same name in different categories
    overwrite each other.

    Parameters
    ----------
    all_data_dir : str
        Root folder containing one sub-folder per category.
    training_data_dir, testing_data_dir : str
        Output folders; created (including parents) if they do not exist.
    train_test_ratio : float
        Fraction of each category that goes to the training set, in [0, 1].
    seed : int, optional
        Seed for a private random generator, making the split reproducible.
        With the default ``None`` the global numpy random state is used.

    Raises
    ------
    ValueError
        If `train_test_ratio` is outside [0, 1].
    """
    if not 0 <= train_test_ratio <= 1:
        raise ValueError('train_test_ratio must be between 0 and 1')

    # create test and train directories if they don't exist
    os.makedirs(training_data_dir, exist_ok=True)
    os.makedirs(testing_data_dir, exist_ok=True)

    rng = np.random if seed is None else np.random.RandomState(seed)
    root_dir = os.path.normpath(all_data_dir)

    num_training_files = 0
    num_testing_files = 0
    # iterate through the data directory
    for subdir, dirs, files in os.walk(all_data_dir):
        # Compare full paths: comparing only basenames would also skip a
        # nested folder that happens to share the root folder's name.
        if os.path.normpath(subdir) == root_dir:
            continue
        category_name = os.path.basename(subdir)

        # Sorted so that a given seed always produces the same split.
        file_list = sorted(glob.glob(os.path.join(subdir, '*.jpg')))

        print(str(category_name) + ' has ' + str(len(file_list)) + ' images')
        random_set = rng.permutation(file_list)

        # A single split index keeps the two sets disjoint and complete.
        # Slicing with [-k:] would return the whole list when k rounds to 0.
        split_index = round(len(random_set) * train_test_ratio)
        train_list = random_set[:split_index]
        test_list = random_set[split_index:]

        for image_path in train_list:
            shutil.copy(image_path, training_data_dir)
            num_training_files += 1

        for image_path in test_list:
            shutil.copy(image_path, testing_data_dir)
            num_testing_files += 1
    print("Processed " + str(num_training_files) + " training files.")
    print("Processed " + str(num_testing_files) + " testing files.")

In [None]:
# Pass the configured ratio explicitly: without it the function falls back to
# its own default (0.8) and the `train_test_ratio = 0.75` setting is ignored.
split_dataset_into_test_and_train_sets(train_test_ratio=train_test_ratio)

## Annotations

In [None]:
def txt_files_to_csv(img_folder, txt_folder, save_path):
    """
    Merge all text files with BBOX and Class info to a single csv.

    For every file found (recursively) under `img_folder`, the annotation file
    in `txt_folder` named ``<image stem>.txt`` or ``<image stem>.<anything>.txt``
    is read. Its first line is skipped as a header; every following line is
    expected to start with ``class xmin ymin width height``. One csv row is
    written per bounding box, with the box converted to xmin, ymin, xmax, ymax.
    Rows are shuffled and saved to ``<save_path>/annotations.csv``.

    Files without a matching annotation file, and annotation lines with fewer
    than five fields, are skipped.

    Parameters
    ----------
    img_folder : str
        Folder containing the images.
    txt_folder : str
        Folder containing the annotation text files (searched non-recursively).
    save_path : str
        Folder in which annotations.csv is written; created if missing.
    """
    columns = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    records = []
    skipped = 0

    for dir_, _, files in os.walk(img_folder):
        for img_file in files:
            stem = os.path.splitext(img_file)[0]
            pattern = os.path.join(txt_folder, stem + '*.txt')
            # Require a '.' right after the stem so that "img1" does not pick
            # up the annotation of "img10"; sorted for a deterministic choice.
            candidates = sorted(
                path for path in glob.glob(pattern)
                if os.path.basename(path)[len(stem)] == '.'
            )
            if not candidates:
                skipped += 1
                continue

            with open(candidates[0], 'r') as txt_file:
                lines = txt_file.readlines()
            with Image.open(os.path.join(dir_, img_file)) as img:
                width, height = img.width, img.height

            # First line of an annotation file is a header, not a box.
            for line in lines[1:]:
                info = line.split()
                if len(info) < 5:
                    continue
                xmin, ymin, box_width, box_height = (int(value) for value in info[1:5])
                # The text file stores xmin, ymin, width, height;
                # convert to xmin, ymin, xmax, ymax.
                records.append([img_file, width, height, info[0],
                                xmin, ymin, xmin + box_width, ymin + box_height])

    # Building the frame once avoids growing it row by row.
    df = pd.DataFrame(records, columns=columns)

    # Since all images sorted according to classes, we need to shuffle to make train data random
    df = df.sample(frac=1).reset_index(drop=True)

    os.makedirs(save_path, exist_ok=True)
    df.to_csv(os.path.join(save_path, 'annotations.csv'), index=False)
    if skipped:
        print('Skipped ' + str(skipped) + ' file(s) without a matching annotation file')
    print('Completed')

In [None]:
# txt_files_to_csv requires img_folder, txt_folder and save_path; calling it
# without arguments raises TypeError.
# NOTE(review): the locations below are assumptions - TODO confirm where the
# annotation .txt files live and where annotations.csv should be written.
annotations_dir = '/home/user/Documents/CV/Datasets/ExDark_Annno/'
annotations_csv_dir = '/home/user/Documents/CV/'
txt_files_to_csv(
    img_folder=training_dir,
    txt_folder=annotations_dir,
    save_path=annotations_csv_dir,
)