In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
from collections import defaultdict
import exifread
import cv2

In [2]:
# Directory layout, every location derived from the shared data root.
data_root = Path('../data')
train_root = data_root.joinpath('train_orig')
flickr_root = data_root.joinpath('external/flickr_images_orig')
ext_valid_root = data_root.joinpath('external/val_images')
sets_root = data_root.joinpath('sets')

## Create train and validation splits from the official dataset.

In [None]:
# Fraction of each class folder that goes to the training split; the rest
# goes to the validation split.
train_split = 0.8
# Number of images every class folder must contain; the assert in the loop
# fails if a folder holds a different count.
expected_images_per_class = 275

np.random.seed(2018)

train_set = []
valid_set = []

fix_path = lambda p: p.relative_to(train_root)

# Visit class folders in sorted order: iterdir() yields entries in arbitrary
# order, and the seeded shuffle below is only reproducible if its input order
# is fixed as well.
for class_dir in sorted(train_root.iterdir()):
    if not class_dir.is_dir():
        continue
    print(class_dir)
    # Collect both extension spellings. The set union keeps a file from being
    # counted twice where glob matching is case-insensitive, and sorting fixes
    # the order handed to the shuffle.
    image_paths = sorted(set(class_dir.glob('*.jpg')) | set(class_dir.glob('*.JPG')))
    n_images = len(image_paths)
    assert n_images == expected_images_per_class, n_images

    # Store paths relative to train_root.
    image_paths = [fix_path(p) for p in image_paths]

    image_paths = np.random.permutation(image_paths)
    n_train = int(n_images * train_split)
    train_image_paths = image_paths[:n_train]
    valid_image_paths = image_paths[n_train:]

    train_set.extend(train_image_paths)
    valid_set.extend(valid_image_paths)

In [None]:
# Write each split as a single-column ('fname') CSV under sets_root.
for csv_name, fnames in (
    ('train.csv', train_set),
    ('valid.csv', valid_set),
    ('trainval.csv', train_set + valid_set),
):
    split_frame = pd.DataFrame({'fname': fnames})
    split_frame.to_csv(str(sets_root / csv_name), index=None)

In [None]:
# Read the training split back from disk and preview its first rows.
train_csv = sets_root / 'train.csv'
df = pd.read_csv(str(train_csv))
df.head()

In [None]:
# Show the column names of the loaded frame as a plain list.
list(df.columns)

## Create train and validation splits from the Flickr dataset

In [56]:
# Read the listing first, one stripped entry per line.
with open(str(flickr_root / 'good_jpgs')) as listing:
    listed_paths = [line.strip() for line in listing]

# Keep the entries that exist under flickr_root; report the ones that do not.
flickr_paths = []
for rel_path in listed_paths:
    if (flickr_root / rel_path).exists():
        flickr_paths.append(rel_path)
    else:
        print('{} not found'.format(rel_path))

# Count the kept paths per first path component.
first_components = [Path(p).parts[0] for p in flickr_paths]
flickr_models, count = np.unique(first_components, return_counts=True)
list(zip(flickr_models, count))

[('htc_m7', 745),
 ('iphone_4s', 499),
 ('iphone_6', 546),
 ('moto_maxx', 543),
 ('moto_x', 344),
 ('nexus_5x', 403),
 ('nexus_6', 649),
 ('samsung_note3', 803),
 ('samsung_s4', 1131),
 ('sony_nex7', 552)]

In [57]:
# Keep only the first space-separated token of every line in the listing.
with open(str(flickr_root / 'low-quality.txt')) as listing:
    low_quality = [line.strip().split(' ')[0] for line in listing]

In [58]:
flickr_train_split = 0.8
# Upper bound on the number of paths kept per group.
flickr_max_model_samples = 500
print('max samples', flickr_max_model_samples)

np.random.seed(2018)

flickr_train_set, flickr_valid_set = [], []

# Group the paths by their first path component.
flickr_paths_m = defaultdict(list)
for rel_path in flickr_paths:
    group_key = Path(rel_path).parts[0]
    flickr_paths_m[group_key].append(rel_path)

# Shuffle each group, cap its size, then cut it into train and validation parts.
for group_paths in flickr_paths_m.values():
    shuffled = np.random.permutation(group_paths)
    selected = shuffled[:flickr_max_model_samples]
    cut = int(len(selected) * flickr_train_split)
    flickr_train_set.extend(selected[:cut])
    flickr_valid_set.extend(selected[cut:])

max samples 500


In [59]:
# Build the lookup once: membership tests on a set are constant time on
# average, whereas `in` on the low_quality list rescans it for every path.
low_quality_lookup = set(low_quality)
# 1 when the path appears in low_quality, otherwise 0.
train_manip = [int(p in low_quality_lookup) for p in flickr_train_set]
valid_manip = [int(p in low_quality_lookup) for p in flickr_valid_set]

In [60]:
columns = ['fname', 'manip']

# One CSV per Flickr split, with the columns in the order given above.
flickr_train_frame = pd.DataFrame(
    {'fname': flickr_train_set, 'manip': train_manip}, columns=columns)
flickr_train_frame.to_csv(str(sets_root / 'flickr_train.csv'), index=None)

flickr_valid_frame = pd.DataFrame(
    {'fname': flickr_valid_set, 'manip': valid_manip}, columns=columns)
flickr_valid_frame.to_csv(str(sets_root / 'flickr_valid.csv'), index=None)

### Extract EXIF

In [53]:
# NOTE(review): unpickling can execute arbitrary code; only load this file if
# it comes from a trusted source.
# The `with` block closes the file handle once the data has been loaded.
with open(str(flickr_root / 'exif_per_model.pkl'), 'rb') as exif_file:
    exif_per_model = pickle.load(exif_file)

In [54]:
def get_props(exif_per_model, prop, unique=True):
    """Collect the string form of one property for every key of the mapping.

    Args:
        exif_per_model: Mapping whose values are iterables of 2-item entries.
            The second item of each entry must support ``prop in item`` and
            ``item[prop]``; the first item is ignored.
        prop: Key to look up in each entry's second item.
        unique: When true, return the values of each key as a set; otherwise
            return a list that keeps duplicates and encounter order.

    Returns:
        dict with the same keys as ``exif_per_model``. Entries that do not
        contain ``prop`` are skipped, so a key may map to an empty collection.
    """
    out = {}
    for model, exifs in exif_per_model.items():
        # str() is used instead of calling __str__ directly, so the
        # conversion also works for values that are class objects.
        props = [str(exif[prop]) for _, exif in exifs if prop in exif]
        out[model] = set(props) if unique else props
    return out

In [55]:
# Print, for every key, each distinct 'Image Model' string and how often it occurs.
props = get_props(exif_per_model, 'Image Model', unique=False)
for group_name, group_values in props.items():
    print('='*3, group_name, '='*3)
    distinct_values, occurrences = np.unique(group_values, return_counts=True)
    for value, n_times in zip(distinct_values, occurrences):
        print(value, 'x', n_times)

=== htc_m7 ===
HTC One x 745
=== sony_nex7 ===
NEX-7 x 552
=== iphone_6 ===
iPhone 6 x 546
=== samsung_note3 ===
SM-N9005 x 805
=== moto_maxx ===
XT1080 x 543
=== iphone_4s ===
iPhone 4S x 499
=== samsung_s4 ===
GT-I9505 x 1131
=== nexus_6 ===
Nexus 6 x 649
=== nexus_5x ===
Nexus 5X x 397
=== moto_x ===
XT1060 x 1
XT1096 x 343


In [19]:
#exif_per_model['iphone_4s'][0]