In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
from collections import defaultdict
import exifread
import cv2

In [None]:
data_root = Path('../data')
train_root = data_root / 'train_orig'
flickr_root = data_root / 'external/flickr_images_orig'
ext_valid_root = data_root / 'external/val_images'
sets_root = data_root / 'sets'

## Create train and validation splits from the official dataset.

In [155]:
train_split = 0.8

np.random.seed(2018)

image_models = []
image_paths = []
#image_sizes = []

fix_path = lambda p: p.relative_to(train_root)

m = 0
for class_dir in train_root.iterdir():
    if not class_dir.is_dir():
        continue
    print(class_dir)
    image_paths1 = [path for path in class_dir.glob('*.jpg')]
    image_paths1.extend([path for path in class_dir.glob('*.JPG')])
    n_images = len(image_paths1)
    assert n_images == 275, n_images
    
    #for img_path in image_paths1:
    #    image_sizes.append(cv2.imread(str(img_path)).shape[:2])
    
    image_paths1 = [fix_path(p) for p in image_paths1]
    image_paths.extend(image_paths1)
    
    image_models.extend([m] * n_images)
    m += 1

../data/train_orig/Samsung-Galaxy-Note3
../data/train_orig/LG-Nexus-5x
../data/train_orig/Samsung-Galaxy-S4
../data/train_orig/iPhone-4s
../data/train_orig/HTC-1-M7
../data/train_orig/iPhone-6
../data/train_orig/Motorola-Droid-Maxx
../data/train_orig/Sony-NEX-7
../data/train_orig/Motorola-X
../data/train_orig/Motorola-Nexus-6


In [157]:
train_image_paths = []
train_image_sizes = []
valid_image_paths = []
valid_image_sizes = []

for m in range(10):
    inds = np.argwhere(np.array(image_models) == m).squeeze()
    inds = np.random.permutation(inds)
    image_paths1 = [image_paths[i] for i in inds]
    image_sizes1 = [image_sizes[i] for i in inds]
    n_images = len(inds)
    n_train = int(n_images * train_split)
    train_image_paths.extend(image_paths1[:n_train])
    train_image_sizes.extend(image_sizes1[:n_train])
    valid_image_paths.extend(image_paths1[n_train:])
    valid_image_sizes.extend(image_sizes1[n_train:])

In [158]:
train_df =\
pd.DataFrame({'fname': train_image_paths, 
              'h': [s[0] for s in train_image_sizes], 
              'w': [s[1] for s in train_image_sizes]})
train_df.to_csv(str(sets_root / 'train.csv'), index=None)

valid_df =\
pd.DataFrame({'fname': valid_image_paths, 
              'h': [s[0] for s in valid_image_sizes],
              'w': [s[1] for s in valid_image_sizes]})
valid_df.to_csv(str(sets_root / 'valid.csv'), index=None)

pd.concat([train_df, valid_df]).to_csv(str(sets_root / 'trainval.csv'), index=None)

In [160]:
df = pd.read_csv(str(sets_root / 'train.csv'))
df.head()

Unnamed: 0,fname,h,w
0,Samsung-Galaxy-Note3/(GalaxyN3)166.jpg,2322,4128
1,Samsung-Galaxy-Note3/(GalaxyN3)100.jpg,4128,2322
2,Samsung-Galaxy-Note3/(GalaxyN3)170.jpg,4128,2322
3,Samsung-Galaxy-Note3/(GalaxyN3)207.jpg,4128,2322
4,Samsung-Galaxy-Note3/(GalaxyN3)97.jpg,4128,2322


In [161]:
[c for c in df.columns]

['fname', 'h', 'w']

## Create validation dataset from the FLICKR dataset

In [130]:
flickr_paths = []
with open(str(flickr_root / 'good_jpgs_refined')) as f:
    for path in [l.strip() for l in f.readlines()]:
        if not (flickr_root/Path(path)).exists():
            print('{} not found'.format(path))
        else:
            flickr_paths.append(path)

flickr_models, count = np.unique([Path(p).parts[0] for p in flickr_paths], return_counts=True)
[m for m in zip(list(flickr_models), list(count))]

[('htc_m7', 745),
 ('iphone_4s', 436),
 ('iphone_6', 546),
 ('moto_maxx', 543),
 ('moto_x', 344),
 ('nexus_5x', 329),
 ('nexus_6', 649),
 ('samsung_note3', 803),
 ('samsung_s4', 1131),
 ('sony_nex7', 552)]

In [None]:
low_quality = []
with open(str(flickr_root / 'low-quality.txt')) as f:
    for path in [l.strip() for l in f.readlines()]:
        low_quality.append(path.split(' ')[0])

In [None]:
flickr_train_split = 0.8
flickr_max_model_samples = 500 #min(count)
print('max samples', flickr_max_model_samples)

np.random.seed(2018)

flickr_train_set = []
flickr_valid_set = []

#flickr_fix_path = lambda p: (flickr_root / p).relative_to('..')

flickr_paths_m = defaultdict(list)
for path in flickr_paths:
    flickr_paths_m[Path(path).parts[0]].append(path)

for m, paths in flickr_paths_m.items():
    paths = np.random.permutation(paths)[:min(flickr_max_model_samples, len(paths))]
    n_images = len(paths)
    train_paths = paths[:int(n_images * flickr_train_split)]
    valid_paths = paths[int(n_images * flickr_train_split):]
    flickr_train_set.extend(train_paths)
    flickr_valid_set.extend(valid_paths)

In [None]:
train_manip = [int(p in low_quality) for p in flickr_train_set]
valid_manip = [int(p in low_quality) for p in flickr_valid_set]

In [None]:
columns = ['fname', 'manip']
pd.DataFrame(dict(zip(columns, [flickr_train_set, train_manip])), columns=columns).to_csv(
    str(sets_root / 'flickr_train.csv'), index=None)
pd.DataFrame(dict(zip(columns, [flickr_valid_set, valid_manip])), columns=columns).to_csv(
    str(sets_root / 'flickr_valid.csv'), index=None)

### Extract EXIF

In [None]:
exif_per_model = pickle.load(open(str(flickr_root / 'exif_per_model.pkl'), 'rb'))

In [None]:
def get_props(exif_per_model, prop, unique=True):
    out = {}
    for model, exifs in exif_per_model.items():
        props = [exif[prop].__str__() for _, exif in exifs if prop in exif]
        out[model] = set(props) if unique else props
    return out

In [None]:
props = get_props(exif_per_model, 'Image Software', unique=False)
for k, v in props.items():
    print('='*3, k, '='*3)
    for vv, count in zip(*np.unique(v, return_counts=1)):
        print(vv, 'x', count)
    print()

In [None]:
nexus_5x_sea = (flickr_root / 'nexus_5x_sea').read_text().splitlines()
good_jpgs_refined = []
fout = open(str(Path(flickr_root / 'good_jpgs_refined')), 'w')
for p in flickr_paths:
    if Path(p).parts[0] == 'nexus_5x' and Path(p).stem in nexus_5x_sea:
        continue
    fout.write(p + '\n')

In [None]:
#exif_per_model['iphone_4s'][0]

In [None]:
m = [Path(p).parts[0] for p in low_quality]
np.unique(m,return_counts=True)