In [1]:
#
# prepareCUB.ipynb
#
# Jupyter Notebook to prepare CUB dataset for further zero-shot learning tasks 
# CUB  : Caltech-UCSD Birds 200
#
# n_classes   : 200
# n_attributes: 312
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# December, 2019 

In [2]:
import scipy.io as sio
import numpy as np
import os

In [3]:
def read_file(filepath):
    """Read a text file and return its lines as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped from
    every line. Blank lines are kept and appear as empty strings.

    Parameters
    ----------
    filepath : str
        Path of the text file to read.

    Returns
    -------
    list of str
        One stripped string per line, in file order.
    """
    with open(filepath, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

In [4]:
# Input directory: the class-list .txt files and the .mat files are read from here
CUB_PATH = "./CUB"
# Output directory: every .mat file produced by this notebook is written below it
CUBP_PATH = "./CUBP" # Preprocessed CUB directory

In [5]:
# Each class list is read from its own text file (one class name per line) and
# then echoed as "<count> <list>" followed by an empty line.

# Training classes  ---  expected number of classes: 100
train_classes_file = os.path.join(CUB_PATH, "trainclasses1.txt")
train_classes = read_file(train_classes_file)
print(len(train_classes), train_classes, end="\n\n")

# Validation classes  ---  expected number of classes: 50
val_classes_file = os.path.join(CUB_PATH, "valclasses1.txt")
val_classes = read_file(val_classes_file)
print(len(val_classes), val_classes, end="\n\n")

# Training + validation classes  ---  expected number of classes: 150 (100 + 50)
trainval_classes_file = os.path.join(CUB_PATH, "trainvalclasses.txt")
trainval_classes = read_file(trainval_classes_file)
print(len(trainval_classes), trainval_classes, end="\n\n")

# Test classes  ---  expected number of classes: 50
test_classes_file = os.path.join(CUB_PATH, "testclasses.txt")
test_classes = read_file(test_classes_file)
print(len(test_classes), test_classes, end="\n\n")

100 ['108.White_necked_Raven', '167.Hooded_Warbler', '142.Black_Tern', '039.Least_Flycatcher', '002.Laysan_Albatross', '187.American_Three_toed_Woodpecker', '106.Horned_Puffin', '181.Worm_eating_Warbler', '060.Glaucous_winged_Gull', '015.Lazuli_Bunting', '067.Anna_Hummingbird', '107.Common_Raven', '013.Bobolink', '105.Whip_poor_Will', '088.Western_Meadowlark', '147.Least_Tern', '006.Least_Auklet', '160.Black_throated_Blue_Warbler', '110.Geococcyx', '183.Northern_Waterthrush', '024.Red_faced_Cormorant', '152.Blue_headed_Vireo', '022.Chuck_will_Widow', '008.Rhinoceros_Auklet', '019.Gray_Catbird', '154.Red_eyed_Vireo', '185.Bohemian_Waxwing', '068.Ruby_throated_Hummingbird', '196.House_Wren', '122.Harris_Sparrow', '014.Indigo_Bunting', '020.Yellow_breasted_Chat', '054.Blue_Grosbeak', '038.Great_Crested_Flycatcher', '115.Brewer_Sparrow', '079.Belted_Kingfisher', '101.White_Pelican', '027.Shiny_Cowbird', '186.Cedar_Waxwing', '053.Western_Grebe', '099.Ovenbird', '003.Sooty_Albatross', '030.F

In [6]:
# Parse res101.mat a single time; it contains both the features and the labels,
# so there is no need to read and decode the file twice.
res101 = sio.loadmat(os.path.join(CUB_PATH, "res101.mat"))

# Load image features (transposed so that every row is one image)
features = res101['features'].T
print(features.shape)

# Load image labels
labels   = res101['labels']
print(labels.shape)

# Load metadata of dataset (class names, attributes and split locations)
metaData = sio.loadmat(os.path.join(CUB_PATH, "att_splits.mat"))
print(metaData.keys())

(11788, 2048)
(11788, 1)
dict_keys(['__header__', '__version__', '__globals__', 'allclasses_names', 'att', 'original_att', 'test_seen_loc', 'test_unseen_loc', 'trainval_loc'])


In [7]:
# Read all classes: every entry of 'allclasses_names' is a nested array, so the
# class name is unwrapped with two [0] lookups and converted to a plain str.
all_classes = []
for name_cell in metaData['allclasses_names']:
    all_classes.append(str(name_cell[0][0]))
print(all_classes)

['002.Laysan_Albatross', '003.Sooty_Albatross', '005.Crested_Auklet', '007.Parakeet_Auklet', '010.Red_winged_Blackbird', '011.Rusty_Blackbird', '012.Yellow_headed_Blackbird', '013.Bobolink', '015.Lazuli_Bunting', '016.Painted_Bunting', '017.Cardinal', '018.Spotted_Catbird', '019.Gray_Catbird', '020.Yellow_breasted_Chat', '021.Eastern_Towhee', '022.Chuck_will_Widow', '024.Red_faced_Cormorant', '025.Pelagic_Cormorant', '026.Bronzed_Cowbird', '027.Shiny_Cowbird', '028.Brown_Creeper', '030.Fish_Crow', '032.Mangrove_Cuckoo', '039.Least_Flycatcher', '040.Olive_sided_Flycatcher', '041.Scissor_tailed_Flycatcher', '042.Vermilion_Flycatcher', '044.Frigatebird', '045.Northern_Fulmar', '046.Gadwall', '047.American_Goldfinch', '048.European_Goldfinch', '050.Eared_Grebe', '052.Pied_billed_Grebe', '054.Blue_Grosbeak', '055.Evening_Grosbeak', '056.Pine_Grosbeak', '057.Rose_breasted_Grosbeak', '058.Pigeon_Guillemot', '059.California_Gull', '060.Glaucous_winged_Gull', '061.Heermann_Gull', '062.Herring_G

In [8]:
# Positions (within all_classes) of the classes that belong to the train split
train_class_set = set(train_classes)
train_indices = [position for position, name in enumerate(all_classes) if name in train_class_set]
print("|#ofIndices:", len(train_indices), "\t|Train Indices:", train_indices)

|#ofIndices: 100 	|Train Indices: [0, 1, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 24, 25, 26, 27, 29, 30, 32, 34, 37, 38, 40, 42, 45, 46, 47, 50, 53, 56, 58, 59, 60, 62, 64, 65, 66, 69, 72, 73, 74, 75, 77, 80, 84, 85, 88, 89, 92, 93, 95, 96, 98, 102, 105, 106, 108, 111, 113, 117, 118, 122, 125, 126, 127, 130, 131, 133, 135, 143, 146, 148, 150, 152, 153, 155, 161, 163, 164, 168, 171, 176, 177, 179, 180, 182, 187, 189, 193, 195, 196, 197, 198, 199]


In [9]:
# Positions (within all_classes) of the classes that belong to the validation split
val_class_set = set(val_classes)
val_indices = [position for position, name in enumerate(all_classes) if name in val_class_set]
print("|#ofIndices:", len(val_indices), "\t|Val Indices:", val_indices)

|#ofIndices: 50 	|Val Indices: [2, 3, 11, 31, 36, 39, 41, 43, 44, 48, 51, 52, 54, 57, 63, 70, 76, 81, 82, 83, 91, 100, 101, 104, 109, 110, 112, 114, 116, 120, 129, 132, 134, 136, 137, 139, 142, 144, 145, 147, 154, 157, 160, 162, 167, 169, 172, 174, 183, 185]


In [10]:
# Positions (within all_classes) of the classes that belong to the train+val split
trainval_class_set = set(trainval_classes)
trainval_indices = [position for position, name in enumerate(all_classes) if name in trainval_class_set]
print("|#ofIndices:", len(trainval_indices), "\t|TrainVal Indices:", trainval_indices)

|#ofIndices: 150 	|TrainVal Indices: [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 63, 64, 65, 66, 69, 70, 72, 73, 74, 75, 76, 77, 80, 81, 82, 83, 84, 85, 88, 89, 91, 92, 93, 95, 96, 98, 100, 101, 102, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 116, 117, 118, 120, 122, 125, 126, 127, 129, 130, 131, 132, 133, 134, 135, 136, 137, 139, 142, 143, 144, 145, 146, 147, 148, 150, 152, 153, 154, 155, 157, 160, 161, 162, 163, 164, 167, 168, 169, 171, 172, 174, 176, 177, 179, 180, 182, 183, 185, 187, 189, 193, 195, 196, 197, 198, 199]


In [11]:
# Positions (within all_classes) of the classes that belong to the test split
test_class_set = set(test_classes)
test_indices = [position for position, name in enumerate(all_classes) if name in test_class_set]
print("|#ofIndices:", len(test_indices), "\t|Test Indices:", test_indices)

|#ofIndices: 50 	|Test Indices: [6, 18, 20, 28, 33, 35, 49, 55, 61, 67, 68, 71, 78, 79, 86, 87, 90, 94, 97, 99, 103, 107, 115, 119, 121, 123, 124, 128, 138, 140, 141, 149, 151, 156, 158, 159, 165, 166, 170, 173, 175, 178, 181, 184, 186, 188, 190, 191, 192, 194]


In [12]:
# Sample locations of the three splits, as stored in the metadata file
trainval_loc    = metaData['trainval_loc']
test_seen_loc   = metaData['test_seen_loc']
test_unseen_loc = metaData['test_unseen_loc']

# Report how many samples each split contains (length of the first axis)
for _caption, _locations in (
    ("Num TrainVal    : ", trainval_loc),
    ("Num Test Seen   : ", test_seen_loc),
    ("Num Test Unseen : ", test_unseen_loc),
):
    print(_caption, str(_locations.shape[0]))

Num TrainVal    :  7057
Num Test Seen   :  1764
Num Test Unseen :  2967


In [13]:
# Create the two output sub-directories of the preprocessed dataset.
# exist_ok=True keeps the cell safe to re-run and removes the gap between the
# existence check and the creation; it also raises right away if the path
# already exists as a regular file instead of failing later when saving.
for _subdir in ("test", "validation"):
    os.makedirs(os.path.join(CUBP_PATH, _subdir), exist_ok=True)

In [14]:
# LABELS (PART I)

# Both the split locations and the label values are shifted down by one
# (presumably from 1-based MATLAB values to 0-based ones -- confirm against the data).
labels_zero_based = labels - 1

trainval_labels    = labels_zero_based[trainval_loc - 1].reshape(-1, 1)
test_seen_labels   = labels_zero_based[test_seen_loc - 1].reshape(-1, 1)
test_unseen_labels = labels_zero_based[test_unseen_loc - 1].reshape(-1, 1)

# Every array is stored in "<CUBP_PATH>/test/<name>.mat" under the key <name>
_test_dir = os.path.join(CUBP_PATH, "test")
for _name, _array in (
    ("trainval_labels", trainval_labels),
    ("test_seen_labels", test_seen_labels),
    ("test_unseen_labels", test_unseen_labels),
):
    sio.savemat(os.path.join(_test_dir, _name + ".mat"), {_name: _array})

In [15]:
# FEATURES (PART I)

# Take the feature width from the loaded matrix instead of hard-coding 2048:
# with a different feature file a fixed reshape(-1, 2048) would either fail or
# silently produce the wrong number of rows.
feature_dim = features.shape[1]

# Split locations are shifted down by one before indexing; the reshape flattens
# the extra axis introduced by indexing with a 2-D location array.
trainval_features     = features[trainval_loc - 1].reshape(-1, feature_dim)
test_seen_features    = features[test_seen_loc - 1].reshape(-1, feature_dim)
test_unseen_features  = features[test_unseen_loc - 1].reshape(-1, feature_dim)

sio.savemat(os.path.join(CUBP_PATH, "test", "trainval_features.mat"), {'trainval_features':trainval_features})
sio.savemat(os.path.join(CUBP_PATH, "test", "test_seen_features.mat"), {'test_seen_features':test_seen_features})
sio.savemat(os.path.join(CUBP_PATH, "test", "test_unseen_features.mat"), {'test_unseen_features':test_unseen_features})

In [16]:
# Row positions (within trainval_labels) of the samples whose class is a train class
train_loc = [
    row for row, label in enumerate(trainval_labels)
    if label[0] in train_indices
]

# Row positions of the samples whose class is a validation class; a row already
# claimed by the train classes is never counted here. Rows matching neither
# list are ignored.
val_unseen_loc = np.asarray([
    row for row, label in enumerate(trainval_labels)
    if label[0] not in train_indices and label[0] in val_indices
])

print("Num train loc     : %d" % len(train_loc))
print("Num val unseen loc: %d" % len(val_unseen_loc))

Num train loc     : 4702
Num val unseen loc: 2355


In [17]:
from sklearn.model_selection import train_test_split

# Fraction of the train-class samples held out as the "validation seen" set
splitRate = 0.1

# Randomly split the train locations (fixed seed for reproducibility). The
# labels are passed along and split in tandem, but only the two location
# lists (the first two returned items) are kept.
_split_parts = train_test_split(
    train_loc,
    trainval_labels[np.asarray(train_loc)],
    test_size=splitRate,
    random_state=123,
)
x_train_loc    = np.asarray(_split_parts[0])
x_val_seen_loc = np.asarray(_split_parts[1])

print("Num x_train loc    : %d" % len(x_train_loc))
print("Num x_val_seen loc : %d" % len(x_val_seen_loc))

Num x_train loc    : 4231
Num x_val_seen loc : 471


In [18]:
# LABELS (PART II)

# Pick the rows of trainval_labels that belong to each validation-setting subset
train_labels      = trainval_labels[x_train_loc]
val_seen_labels   = trainval_labels[x_val_seen_loc]
val_unseen_labels = trainval_labels[val_unseen_loc]

# Every array is stored in "<CUBP_PATH>/validation/<name>.mat" under the key <name>
_validation_dir = os.path.join(CUBP_PATH, "validation")
for _name, _array in (
    ("train_labels", train_labels),
    ("val_seen_labels", val_seen_labels),
    ("val_unseen_labels", val_unseen_labels),
):
    sio.savemat(os.path.join(_validation_dir, _name + ".mat"), {_name: _array})

In [19]:
# FEATURES (PART II)

# Pick the rows of trainval_features that belong to each validation-setting subset
train_features      = trainval_features[x_train_loc]
val_seen_features   = trainval_features[x_val_seen_loc]
val_unseen_features = trainval_features[val_unseen_loc]

# Every array is stored in "<CUBP_PATH>/validation/<name>.mat" under the key <name>
_validation_dir = os.path.join(CUBP_PATH, "validation")
for _name, _array in (
    ("train_features", train_features),
    ("val_seen_features", val_seen_features),
    ("val_unseen_features", val_unseen_features),
):
    sio.savemat(os.path.join(_validation_dir, _name + ".mat"), {_name: _array})

In [20]:
# Attribute matrices from the metadata file, transposed so that the first axis
# indexes classes (see the printed shape).
attribute     = np.transpose(metaData['att'])
org_attribute = np.transpose(metaData['original_att'])
print(attribute.shape)

(200, 312)


In [21]:
# Store the class-level attribute matrix directly under the preprocessed directory
attributes_file = os.path.join(CUBP_PATH, "attributes.mat")
sio.savemat(attributes_file, {'attributes': attribute})