This notebook cleans up the metadata for the CLOC dataset. Note: you must first download the CLOC dataset before using this notebook.

The CLOC dataset provides a very large list of image URLs hosted on Flickr servers. However, some of these URLs may no longer be valid. This notebook removes the entries for these invalid URLs from the metadata of the training and test sets.

In [None]:
import pandas as pd
import torch
import os
from tqdm import tqdm

In [None]:
# Placeholder locations; replace both with real paths before running further.
# Both values must end with "/" because file paths are built below by plain
# string concatenation (e.g. `root + store_loc[i]`, `metadata_path + <file>`).
# Directory that contains the downloaded CLOC image files.
root = "/path/to/CLOC/release/dataset/images/"
# Directory that contains the CLOC metadata files (.torchSave / .csv).
metadata_path = "/path/to/CLOC/metadata/"

In [None]:
# Refuse to continue while either path still holds its placeholder value.
for configured, placeholder in (
    (root, "/path/to/CLOC/release/dataset/images/"),
    (metadata_path, "/path/to/CLOC/metadata/"),
):
    assert configured != placeholder, "Please provide a valid path"

# Training set

In [None]:
# Load the five training metadata sequences; they are indexed in parallel by
# the cells below (entry i of each one is read with the same index i).
labels, time_taken, user, userID, store_loc = (
    torch.load(metadata_path + filename)
    for filename in (
        'train_labels.torchSave',
        'train_time.torchSave',
        'train_user.torchSave',
        'train_userID.torchSave',
        'train_store_loc.torchSave',
    )
)

In [None]:
# Number of training entries before cleaning.
len(labels)

In [None]:
# Collect the positions of all training entries whose image file is present
# under `root`; entries whose file is missing are left out of index_list.
index_list = [
    idx
    for idx in tqdm(range(len(labels)))
    if os.path.isfile(root + store_loc[idx].strip())
]

In [None]:
# Keep, for every metadata sequence, only the entries at the positions
# recorded in index_list (original order is preserved).
labels_clean = list(map(labels.__getitem__, index_list))
time_taken_clean = list(map(time_taken.__getitem__, index_list))
user_clean = list(map(user.__getitem__, index_list))
userID_clean = list(map(userID.__getitem__, index_list))
store_loc_clean = list(map(store_loc.__getitem__, index_list))

In [None]:
# Number of training entries that remain after cleaning.
len(labels_clean)

In [None]:
# Stop before saving if no image file was found at all; per the message,
# the usual cause is an incorrect `root` path.
assert len(labels_clean) > 0, "Something went wrong, ensure that the root path is valid."

In [None]:
# NOTE: this overwrites the original CLOC training metadata files in place.
# Run any sanity checks you want on the *_clean lists before executing it.
for filename, cleaned in (
    ('train_labels.torchSave', labels_clean),
    ('train_time.torchSave', time_taken_clean),
    ('train_user.torchSave', user_clean),
    ('train_userID.torchSave', userID_clean),
    ('train_store_loc.torchSave', store_loc_clean),
):
    torch.save(cleaned, metadata_path + filename)

# Test set

In [None]:
# Name of the test-set metadata CSV inside metadata_path.
test_set_file = "yfcc100m_metadata_with_labels_usedDataRatio0.05_t110000_t250_valid_files_2004To2014_compact_val.csv"
# Read the whole test-set table into memory.
df = pd.read_csv(f"{metadata_path}{test_set_file}")

In [None]:
# Number of test-set rows before cleaning.
len(df)

In [None]:
# Preview the first rows; the column at position 4 is the one used below
# as the image path relative to `root`.
df.head()

In [None]:
# Collect the row positions of all test-set entries whose image file (the
# value in column position 4, relative to `root`) is present on disk.
index_list = [
    row
    for row in tqdm(range(len(df.iloc[:, 4])))
    if os.path.isfile(root + df.iloc[row, 4].strip())
]

In [None]:
# Keep only the rows whose image file was found on disk (all columns).
df_clean = df.iloc[index_list,:]

In [None]:
# Number of test-set rows that remain after cleaning.
len(index_list)

In [None]:
# Stop before saving if no image file was found at all; per the message,
# the usual cause is an incorrect `root` path.
assert len(index_list) > 0, "Something went wrong, ensure that the root path is valid."

In [None]:
# Overwrite the original test-set CSV with the cleaned rows.
# index=False keeps the column layout identical to the file that was read:
# by default pandas would also write the row index as an extra leading column,
# shifting every positional column (e.g. the image-path column accessed via
# df.iloc[:, 4]) and breaking a second run of this notebook on its own output.
df_clean.to_csv(metadata_path + test_set_file, index=False)