In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
from collections import defaultdict
import exifread
import cv2

In [None]:
# Every dataset location below is resolved relative to this directory.
data_root = Path('../data')

# Official training images; the split cell treats each sub-directory as one class.
train_root = data_root / 'train_orig'
# Destination directory for the CSV split files written by this notebook.
sets_root = data_root / 'sets'

# Externally collected data (ext_valid_root is not referenced by the cells shown here).
flickr_root = data_root / 'external/flickr_images'
ext_valid_root = data_root / 'external/val_images'

## Create train and validation splits from the official dataset.

In [None]:
# Fraction of each class directory that goes to the training set.
train_split = 0.8
# Sanity check: every class directory is expected to hold exactly this many images.
expected_images_per_class = 275

np.random.seed(2018)

train_set = []
valid_set = []


def fix_path(p):
    """Return `p` expressed relative to `train_root`."""
    return p.relative_to(train_root)


# iterdir() and glob() yield entries in arbitrary, filesystem-dependent order.
# Sorting them makes the seeded permutation below produce the same split on
# every machine.
for class_dir in sorted(train_root.iterdir()):
    if not class_dir.is_dir():
        continue
    print(class_dir)
    # Take the union of both patterns: where glob matching is case-insensitive
    # (e.g. Windows) '*.jpg' and '*.JPG' match the same files, and a plain
    # concatenation would count every image twice.
    image_paths = sorted(set(class_dir.glob('*.jpg')) | set(class_dir.glob('*.JPG')))
    n_images = len(image_paths)
    assert n_images == expected_images_per_class, n_images

    # Shuffle the class once, then cut it at the train/valid boundary.
    image_paths = np.random.permutation([fix_path(p) for p in image_paths])
    n_train = int(n_images * train_split)

    train_set.extend(image_paths[:n_train])
    valid_set.extend(image_paths[n_train:])

In [None]:
# Write each split (and the union of both) as a single-column CSV under sets_root.
split_files = (
    ('train.csv', train_set),
    ('valid.csv', valid_set),
    ('trainval.csv', train_set + valid_set),
)
for csv_name, fnames in split_files:
    pd.DataFrame({'fname': fnames}).to_csv(str(sets_root / csv_name), index=None)

In [None]:
# Sanity check: read the training split back from disk and preview the first rows.
train_csv_path = sets_root / 'train.csv'
df = pd.read_csv(str(train_csv_path))
df.head()

In [None]:
# Show the column names of the reloaded split as a plain list.
list(df.columns)

## Create train and validation splits from the Flickr dataset

In [None]:
def get_props(exif_per_model, prop, unique=True):
    """Collect the string form of one EXIF property for every model.

    Args:
        exif_per_model: mapping of model -> iterable of 2-item entries whose
            second item is an EXIF mapping (the first item is ignored).
        prop: key to look up in each EXIF mapping; entries that lack it are
            skipped.
        unique: when true, each model maps to a set of distinct values;
            otherwise to the list of values in iteration order.

    Returns:
        dict mapping every model to its set (or list) of stringified values.
    """
    collected = {}
    for model, entries in exif_per_model.items():
        values = []
        for _, exif in entries:
            if prop in exif:
                values.append(str(exif[prop]))
        collected[model] = set(values) if unique else values
    return collected

In [None]:
# Load the pickled EXIF data stored next to the Flickr images.
# SECURITY: unpickling can execute arbitrary code, so only load this file when
# it comes from a trusted source.
# The `with` block closes the file handle, which the bare open() call never did.
with open(str(flickr_root / 'exif_per_model.pkl'), 'rb') as pkl_file:
    exif_per_model = pickle.load(pkl_file)

In [None]:
# Keep only the listed Flickr images that actually exist under flickr_root.
flickr_paths = []
with open(str(flickr_root / 'good_jpgs_andres')) as f:
    for line in f:
        path = line.strip()
        if not path:
            # A blank line would resolve to flickr_root itself (which exists),
            # be kept as '', and then crash below on Path('').parts[0].
            continue
        if (flickr_root / Path(path)).exists():
            flickr_paths.append(path)
        else:
            print('{} not found'.format(path))

# Persist the filtered listing, one path per line.
with open(str(flickr_root / 'good_jpgs'), 'w') as f:
    f.writelines(str(p) + '\n' for p in flickr_paths)

# Count the surviving images per first path component and display the pairs.
flickr_models, count = np.unique([Path(p).parts[0] for p in flickr_paths], return_counts=True)
list(zip(flickr_models, count))

In [None]:
# Read the low-quality listing, keeping only the text before the first space
# of each stripped line.
low_quality = []
with open(str(flickr_root / 'low-quality.txt')) as listing:
    low_quality.extend(entry.strip().split(' ')[0] for entry in listing)

In [None]:
# Fraction of each model's retained Flickr images that goes to the train split.
flickr_train_split = 0.8
# Per-model cap: the smallest count computed in the listing cell above.
flickr_max_model_samples = min(count)
print('max samples', flickr_max_model_samples)

np.random.seed(2018)

flickr_train_set = []
flickr_valid_set = []

# Group the Flickr paths by the name that `name_map` assigns to their first
# path component.
# NOTE(review): `name_map` is not defined in any cell visible here; confirm
# where it comes from, otherwise this cell fails on a fresh kernel.
flickr_paths_m = defaultdict(list)
for path in flickr_paths:
    top_level = Path(path).parts[0]
    flickr_paths_m[name_map[top_level]].append(path)

# Shuffle each group, cap it, then cut it at the train/valid boundary.
for m, paths in flickr_paths_m.items():
    shuffled = np.random.permutation(paths)
    kept = shuffled[:min(flickr_max_model_samples, len(paths))]
    split_at = int(len(kept) * flickr_train_split)
    flickr_train_set.extend(kept[:split_at])
    flickr_valid_set.extend(kept[split_at:])

In [None]:
# Flag (1/0) every Flickr image whose path appears in the low-quality listing.
# Membership is tested against a set so each lookup is constant-time instead of
# a scan of the whole `low_quality` list for every image.
low_quality_lookup = set(low_quality)
train_manip = [int(p in low_quality_lookup) for p in flickr_train_set]
valid_manip = [int(p in low_quality_lookup) for p in flickr_valid_set]

In [None]:
# Write the Flickr splits as two-column CSVs (file name, manip flag) under sets_root.
columns = ['fname', 'manip']
flickr_outputs = (
    ('flickr_train.csv', flickr_train_set, train_manip),
    ('flickr_valid.csv', flickr_valid_set, valid_manip),
)
for csv_name, fnames, manip_flags in flickr_outputs:
    frame = pd.DataFrame({'fname': fnames, 'manip': manip_flags}, columns=columns)
    frame.to_csv(str(sets_root / csv_name), index=None)

In [None]:
# Tally how often each 'Image Make' string occurs under the 'nexus_5x' key.
im_make = get_props(exif_per_model, 'Image Make', unique=False)
nexus_5x_makes = im_make['nexus_5x']
np.unique(nexus_5x_makes, return_counts=True)