In [10]:
import os
from glob import glob
import shutil
import numpy as np
import random
import matplotlib.pyplot as plt

In [11]:
# import sys
# sys.path.sort()
# sys.path.insert(1, os.path.join(sys.path[0], '../../'))

In [12]:
# Seed every random source this notebook touches.
seed = 42
random.seed(seed)
np.random.seed(seed)
# PYTHONHASHSEED is read when an interpreter starts, so exporting it here only
# reaches processes launched from this kernel, not the running kernel itself.
os.environ['PYTHONHASHSEED'] = '{}'.format(seed)

In [13]:
import utils

In [14]:
# Source dataset roots; both hold one sub-folder per class.
parent_img_dir = './vision'      # Change according to your data path.
parent_audio_dir = './sound'     # Change according to your data path.

# Every sub-folder of the image root is a class label; plain files are ignored.
class_labels = sorted(
    entry
    for entry in os.listdir(parent_img_dir)
    if os.path.isdir(os.path.join(parent_img_dir, entry))
)
print(class_labels)

['airport', 'beach', 'bridge', 'farmland', 'forest', 'grassland', 'harbour', 'lake', 'orchard', 'residential', 'sparse shrub land', 'sports land', 'train station']


In [15]:
# Fraction of each class assigned to the training split; the remainder is the test split.
train_ratio = 0.8

# Destination roots for the training split (set to your output paths).
dest_img_train_dir, dest_audio_train_dir = './train/vision', './train/sound'

# Destination roots for the test split (set to your output paths).
dest_img_test_dir, dest_audio_test_dir = './test/vision', './test/sound'

In [16]:
# Per-class sample counts of each split, filled in by the loop below.
train_data_notes = {}
test_data_notes = {}

# Image extensions that count as samples; compared case-insensitively.
image_extensions = ('.jpg', '.jpeg', '.png')

# Dedicated generator so that re-running this cell reproduces the same split.
# It yields the same sequence as the global generator seeded with `seed`,
# provided nothing drew from the global generator before this cell ran.
split_rng = random.Random(seed)


def _list_images(img_dir):
    """Map each sample id (file name without extension) to its image file name.

    Only entries directly inside ``img_dir`` whose extension is listed in
    ``image_extensions`` are kept. Each id appears once, even when the
    filesystem matches names case-insensitively or when two images share a
    stem; in the latter case the first name in sorted order is used.
    """
    images = {}
    # '*' does not match dot-files, so hidden files are skipped as before.
    for path in sorted(glob(os.path.join(img_dir, '*'))):
        file_name = os.path.basename(path)              # '07090_2.jpg'
        sample_id, extension = os.path.splitext(file_name)  # '07090_2', '.jpg'
        if extension.lower() in image_extensions:
            images.setdefault(sample_id, file_name)
    return images


def _copy_samples(sample_ids, images, class_label, dest_img_root, dest_audio_root):
    """Copy the image and the '.wav' audio of every sample id into per-class folders.

    Raises whatever ``shutil.copy`` raises (e.g. ``FileNotFoundError``) when a
    sample has no matching audio file.
    """
    dest_img_dir = os.path.join(dest_img_root, class_label)
    dest_audio_dir = os.path.join(dest_audio_root, class_label)
    # exist_ok=True lets the cell be re-run without failing on existing folders.
    os.makedirs(dest_img_dir, exist_ok=True)
    os.makedirs(dest_audio_dir, exist_ok=True)
    for sample_id in sample_ids:
        # Use the real file name so images that are not '.jpg' are found too.
        img = os.path.join(parent_img_dir, class_label, images[sample_id])
        audio = os.path.join(parent_audio_dir, class_label, sample_id + '.wav')

        shutil.copy(img, dest_img_dir)
        shutil.copy(audio, dest_audio_dir)


for class_label in class_labels:

    images = _list_images(os.path.join(parent_img_dir, class_label))

    # Sort before shuffling so the result does not depend on directory order.
    id_list = sorted(images)
    split_rng.shuffle(id_list)

    split_idx = int(len(id_list) * train_ratio)
    train_id_list = id_list[:split_idx]
    test_id_list = id_list[split_idx:]

    train_data_notes[class_label] = len(train_id_list)
    test_data_notes[class_label] = len(test_id_list)

    # Train.
    _copy_samples(train_id_list, images, class_label,
                  dest_img_train_dir, dest_audio_train_dir)

    # Test.
    _copy_samples(test_id_list, images, class_label,
                  dest_img_test_dir, dest_audio_test_dir)

In [17]:
# Show the per-class sample counts: training split first, then test split.
print(train_data_notes, test_data_notes, sep='\n')

{'airport': 148, 'beach': 172, 'bridge': 224, 'farmland': 344, 'forest': 689, 'grassland': 120, 'harbour': 406, 'lake': 285, 'orchard': 165, 'residential': 844, 'sparse shrub land': 324, 'sports land': 127, 'train station': 208}
{'airport': 37, 'beach': 44, 'bridge': 56, 'farmland': 86, 'forest': 173, 'grassland': 30, 'harbour': 102, 'lake': 72, 'orchard': 42, 'residential': 212, 'sparse shrub land': 81, 'sports land': 32, 'train station': 52}


In [20]:
# Write the per-class sample counts of both splits to a text file.
note_file = './notes.txt'

# One line per class; labels are padded to 15 characters (longer labels are kept whole).
train_lines = ['{:15} : {}\n'.format(class_label, num_samples)
               for class_label, num_samples in train_data_notes.items()]
test_lines = ['{:15} : {}\n'.format(class_label, num_samples)
              for class_label, num_samples in test_data_notes.items()]

txt = 'Train\n' + ''.join(train_lines) + '\nTest\n' + ''.join(test_lines)
utils.save_notes(note_file, txt)