In [1]:
! pip3 install pandas



In [2]:
import os
import pandas as pd
import glob
import numpy as np
import random

random.seed(42)

In [3]:
base_MIT = "/hadatasets/andreza/SceneDatasets/MIT_Indoors_8/"
base_SUN = "/hadatasets/andreza/SceneDatasets/SUN397_8/"
classes = ["bathroom", "bedroom", "childs_room",
           "classroom", "dressing_room", "living_room",
           "studio", "swimming_pool"]

In [4]:
# MIT_Indoors
MIT_dict = {"file_path": [], "label": [], "path_and_label": []}

for class_name in classes:
    path = os.path.join(base_MIT, class_name)
    print(class_name, path)
    files = glob.glob(path+"/*")
    for file in files:
        MIT_dict["file_path"].append(file)
        MIT_dict["label"].append(class_name)
        MIT_dict["path_and_label"].append([file, class_name])

np.save("MIT_Indoors_samples_paths_complete.npy", np.asarray(MIT_dict["file_path"]))
np.save("MIT_Indoors_samples_labels_complete.npy", np.asarray(MIT_dict["label"]))
np.save("MIT_Indoors_samples_paths_and_labels_complete.npy", np.asarray(MIT_dict["path_and_label"]))


bathroom /hadatasets/andreza/SceneDatasets/MIT_Indoors_8/bathroom
bedroom /hadatasets/andreza/SceneDatasets/MIT_Indoors_8/bedroom
childs_room /hadatasets/andreza/SceneDatasets/MIT_Indoors_8/childs_room
classroom /hadatasets/andreza/SceneDatasets/MIT_Indoors_8/classroom
dressing_room /hadatasets/andreza/SceneDatasets/MIT_Indoors_8/dressing_room
living_room /hadatasets/andreza/SceneDatasets/MIT_Indoors_8/living_room
studio /hadatasets/andreza/SceneDatasets/MIT_Indoors_8/studio
swimming_pool /hadatasets/andreza/SceneDatasets/MIT_Indoors_8/swimming_pool


In [5]:
# SUN_Indoors
SUN_dict = {"file_path": [], "label": [], "path_and_label": []}

for class_name in classes:
    path = os.path.join(base_SUN, class_name)
    print(class_name, path)
    path = path+"/*"
    files = glob.glob(path)
    for file in files:
        SUN_dict["file_path"].append(file)
        SUN_dict["label"].append(class_name)
        SUN_dict["path_and_label"].append([file, class_name])

np.save("SUN397_Indoors_samples_paths_complete.npy", np.asarray(SUN_dict["file_path"]))
np.save("SUN397_Indoors_samples_labels_complete.npy", np.asarray(SUN_dict["label"]))
np.save("SUN397_Indoors_samples_paths_and_labels_complete.npy", np.asarray(SUN_dict["path_and_label"]))


bathroom /hadatasets/andreza/SceneDatasets/SUN397_8/bathroom
bedroom /hadatasets/andreza/SceneDatasets/SUN397_8/bedroom
childs_room /hadatasets/andreza/SceneDatasets/SUN397_8/childs_room
classroom /hadatasets/andreza/SceneDatasets/SUN397_8/classroom
dressing_room /hadatasets/andreza/SceneDatasets/SUN397_8/dressing_room
living_room /hadatasets/andreza/SceneDatasets/SUN397_8/living_room
studio /hadatasets/andreza/SceneDatasets/SUN397_8/studio
swimming_pool /hadatasets/andreza/SceneDatasets/SUN397_8/swimming_pool


# Generate randomized SUN splits

All environments should be the same size for OoD-benchmark execution.
- MIT Indoors has 2535 samples
- SUN397 has 9285 samples

SUN397 dataset will be splitted at random, selecting the amount of MIT samples for each class 

In [7]:
mit_labels = np.load("MIT_Indoors_samples_labels_complete.npy")
mit_img_paths = np.load("MIT_Indoors_samples_paths_complete.npy")

sun_labels = np.load("SUN397_Indoors_samples_labels_complete.npy")
sun_img_paths_labels = np.load("SUN397_Indoors_samples_paths_and_labels_complete.npy")

labels, counts = np.unique(mit_labels, return_counts=True)
mit_dict_count = dict(zip(labels,counts))
print("MIT labels, counts", mit_dict_count )
print()
labels, counts = np.unique(sun_labels, return_counts=True)
print("SUN labels, counts", dict(zip(labels,counts)))

MIT labels, counts {'bathroom': 197, 'bedroom': 662, 'childs_room': 256, 'classroom': 240, 'dressing_room': 135, 'living_room': 706, 'studio': 165, 'swimming_pool': 174}

SUN labels, counts {'bathroom': 951, 'bedroom': 2879, 'childs_room': 1551, 'classroom': 561, 'dressing_room': 300, 'living_room': 2479, 'studio': 197, 'swimming_pool': 367}


In [8]:
mit_dict_count

{'bathroom': 197,
 'bedroom': 662,
 'childs_room': 256,
 'classroom': 240,
 'dressing_room': 135,
 'living_room': 706,
 'studio': 165,
 'swimming_pool': 174}

In [12]:
# select SUN samples to form a split
save_infos = {
    "seeds": [42, 666],
    "save_paths": ["SUN397_Indoors_samples_split_seed42.npy", "SUN397_Indoors_samples_split_seed666.npy"]}

for idx, seed in enumerate(save_infos['seeds']):
    print(idx, seed)
    random.seed(seed)
    selected_indexes = []
    for classe in mit_dict_count.keys():
#         print(classe)
        sun_classe_labels = np.where(sun_labels==classe)
        selected_indexes += random.sample(list(sun_classe_labels[0]), k=mit_dict_count[classe])
#         print(len(selected_indexes))

    selected_indexes.sort()
    split_sun_img_paths = sun_img_paths_labels[selected_indexes]
    np.save(save_infos["save_paths"][idx], np.asarray(split_sun_img_paths))


0 42
1 666


In [3]:
np.load('MIT_Indoors_samples_paths_and_labels_complete.npy')

array([['/hadatasets/andreza/SceneDatasets/MIT_Indoors_8/bathroom/n190011.jpg',
        'bathroom'],
       ['/hadatasets/andreza/SceneDatasets/MIT_Indoors_8/bathroom/dscn2018.jpg',
        'bathroom'],
       [ '/hadatasets/andreza/SceneDatasets/MIT_Indoors_8/bathroom/indoor_0368.jpg',
        'bathroom'],
       ..., 
       [ '/hadatasets/andreza/SceneDatasets/MIT_Indoors_8/swimming_pool/piscina_cubierta_03_03_altavista.jpg',
        'swimming_pool'],
       [ '/hadatasets/andreza/SceneDatasets/MIT_Indoors_8/swimming_pool/apartment_schwimmbad.jpg',
        'swimming_pool'],
       [ '/hadatasets/andreza/SceneDatasets/MIT_Indoors_8/swimming_pool/P1050218.jpg',
        'swimming_pool']],
      dtype='<U140')

In [5]:
x = np.load("SUN397_Indoors_samples_split_seed42.npy")