In [3]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
from collections import defaultdict
import exifread
import cv2

In [7]:
data_root = Path('/mnt/data/kaggle_camera')
train_root = data_root / 'train_'
flickr_root = data_root / 'flickr_images_'
flickr_new_root = data_root / 'flickr_images_new'
reviews_root = data_root / 'reviews_images_'
sets_root = Path('../data/sets')

In [17]:
def collect_exif(in_path):
    image_paths = []
    if in_path.is_dir():
        image_paths = [p for p in in_path.glob('*/*.jpg')] + [p for p in in_path.glob('*/*.JPG')]
    else:
        image_paths = [in_path.parent / p for p in in_path.read_text().splitlines()]
    exif_per_model = defaultdict(list)
    for i,p in enumerate(image_paths):
        model = p.parts[-2]
        with open(str(p), 'rb') as fh:
            _tags = exifread.process_file(fh)
            tags = {k: v for k, v in _tags.items() if 'thumbnail' not in k.lower()}
            exif_per_model[model].append((str(p), tags))
    return exif_per_model
                
def get_props(exif_per_model, prop, unique=True):
    out = {}
    for model, exifs in exif_per_model.items():
        props = [exif[prop].__str__() for _, exif in exifs if prop in exif]
        out[model] = set(props) if unique else props
    return out

def output_props(exif_per_model, prop):
    props = get_props(exif_per_model, prop, unique=False)
    for k, v in props.items():
        print('='*3, k, '='*3)
        for vv, count in zip(*np.unique(v, return_counts=1)):
            print(vv, 'x', count)
        print()

restrict_software = [
    'Adobe', 'Google', 'Snapseed', 'Capture', 'GIMP', 'Photos', 'PlayMemories', 'Aperture', 'Camera+',
    'Elements', 'Flickr', 'Microsoft', 'Polarr', 'VSCO', 'Bibble', 'Perfectly', 'Picasa', 'PhotoScape',
    'ProCamera', 'QuickTime', 'iPhoto', 'ACD', 'Photo Supreme', 'Image Data Converter'
]

def clean_by_sw(exifs_per_model):
    kept = []
    dropped = defaultdict(list)
    prop = 'Image Software'
    for model, exifs in exifs_per_model.items():
        for fn, e in exifs:
            if prop in e and sum([sw.lower() in str(e[prop]).lower() for sw in restrict_software]):
                #print('DROP', fn, e[prop])
                dropped[model].append((fn, e[prop]))
            else:
                kept.append(fn)
    return kept,dropped

## Org dataset

In [155]:
train_split = 0.8

np.random.seed(2018)

image_models = []
image_paths = []
#image_sizes = []

fix_path = lambda p: p.relative_to(train_root)

m = 0
for class_dir in train_root.iterdir():
    if not class_dir.is_dir():
        continue
    print(class_dir)
    image_paths1 = [path for path in class_dir.glob('*.jpg')]
    image_paths1.extend([path for path in class_dir.glob('*.JPG')])
    n_images = len(image_paths1)
    assert n_images == 275, n_images
    
    #for img_path in image_paths1:
    #    image_sizes.append(cv2.imread(str(img_path)).shape[:2])
    
    image_paths1 = [fix_path(p) for p in image_paths1]
    image_paths.extend(image_paths1)
    
    image_models.extend([m] * n_images)
    m += 1

../data/train_orig/Samsung-Galaxy-Note3
../data/train_orig/LG-Nexus-5x
../data/train_orig/Samsung-Galaxy-S4
../data/train_orig/iPhone-4s
../data/train_orig/HTC-1-M7
../data/train_orig/iPhone-6
../data/train_orig/Motorola-Droid-Maxx
../data/train_orig/Sony-NEX-7
../data/train_orig/Motorola-X
../data/train_orig/Motorola-Nexus-6


In [157]:
train_image_paths = []
train_image_sizes = []
valid_image_paths = []
valid_image_sizes = []

for m in range(10):
    inds = np.argwhere(np.array(image_models) == m).squeeze()
    inds = np.random.permutation(inds)
    image_paths1 = [image_paths[i] for i in inds]
    image_sizes1 = [image_sizes[i] for i in inds]
    n_images = len(inds)
    n_train = int(n_images * train_split)
    train_image_paths.extend(image_paths1[:n_train])
    train_image_sizes.extend(image_sizes1[:n_train])
    valid_image_paths.extend(image_paths1[n_train:])
    valid_image_sizes.extend(image_sizes1[n_train:])

In [158]:
train_df =\
pd.DataFrame({'fname': train_image_paths, 
              'h': [s[0] for s in train_image_sizes], 
              'w': [s[1] for s in train_image_sizes]})
train_df.to_csv(str(sets_root / 'train.csv'), index=None)

valid_df =\
pd.DataFrame({'fname': valid_image_paths, 
              'h': [s[0] for s in valid_image_sizes],
              'w': [s[1] for s in valid_image_sizes]})
valid_df.to_csv(str(sets_root / 'valid.csv'), index=None)

pd.concat([train_df, valid_df]).to_csv(str(sets_root / 'trainval.csv'), index=None)

In [160]:
df = pd.read_csv(str(sets_root / 'train.csv'))
df.head()

Unnamed: 0,fname,h,w
0,Samsung-Galaxy-Note3/(GalaxyN3)166.jpg,2322,4128
1,Samsung-Galaxy-Note3/(GalaxyN3)100.jpg,4128,2322
2,Samsung-Galaxy-Note3/(GalaxyN3)170.jpg,4128,2322
3,Samsung-Galaxy-Note3/(GalaxyN3)207.jpg,4128,2322
4,Samsung-Galaxy-Note3/(GalaxyN3)97.jpg,4128,2322


In [161]:
[c for c in df.columns]

['fname', 'h', 'w']

## FLICKR

In [6]:
flickr_paths = []
with open(str(flickr_root / 'good_jpgs')) as f:
    for path in [l.strip() for l in f.readlines()]:
        if not (flickr_root/Path(path)).exists():
            print('{} not found'.format(path))
        else:
            flickr_paths.append(path)

flickr_models, count = np.unique([Path(p).parts[0] for p in flickr_paths], return_counts=True)
[m for m in zip(list(flickr_models), list(count))]

[('htc_m7', 745),
 ('iphone_4s', 499),
 ('iphone_6', 546),
 ('moto_maxx', 543),
 ('moto_x', 344),
 ('nexus_5x', 403),
 ('nexus_6', 649),
 ('samsung_note3', 803),
 ('samsung_s4', 1131),
 ('sony_nex7', 552)]

In [7]:
low_quality = []
with open(str(flickr_root / 'low-quality.txt')) as f:
    for path in [l.strip() for l in f.readlines()]:
        low_quality.append(path.split(' ')[0])

In [8]:
#pickle.dump(collect_exif(flickr_root), open(str(flickr_root / 'flickr_exifs.pkl'), 'wb'))
flickr_exifs = pickle.load(open(str(flickr_root / 'flickr_exifs.pkl'), 'rb'))

In [15]:
output_props(flickr_exifs, 'Image Software')

=== samsung_s4 ===
Adobe Photoshop CS5 Macintosh x 2
Adobe Photoshop CS6 (Macintosh) x 6
Adobe Photoshop Lightroom 6.12 (Windows) x 1
I9505VJUEMKE x 1
I9505VJUGNE2 x 5
I9505VJUHOK1 x 27
I9505XXUGNF1 x 17
I9505XXUGNH6 x 4
I9505XXUGNJ8 x 41
I9505XXUHOE3 x 1
I9505XXUHOJ2 x 107
I9505XXUHOJ3 x 1
I9505XXUHPF4 x 16
I9505XXUHPK2 x 876
I9505XXUHQC1 x 4
Microsoft Windows Photo Viewer 6.1.7600.16385 x 5
Microsoft Windows Photo Viewer 6.3.9600.17415 x 1
Photos 1.3 x 1
VSCO Android Version: v9 (813) x 3
Windows Photo Editor 6.3.9600.17418 x 12

=== htc_m7 ===
Adobe Photoshop Elements 10.0 Windows x 1
Adobe Photoshop Lightroom Classic 7.0 (Windows) x 1
Google x 1
Snapseed 2.0 x 1

=== iphone_6 ===
10.2.1 x 9
10.3.1 x 1
10.3.3 x 60
11.0.2 x 2
11.1.2 x 13
11.2 x 1
11.2.1 x 150
9.3.5 x 64
Adobe Photoshop Lightroom 4.4 (Macintosh) x 5
Adobe Photoshop Lightroom 6.6.1 (Macintosh) x 3
Adobe Photoshop Lightroom Classic 7.1 (Macintosh) x 1
Adobe Photoshop Lightroom Classic 7.1 (Windows) x 1
Aperture 3.6 x 20

In [11]:
kept,dropped=clean_by_sw(flickr_exifs)
for k,v in dropped.items():
    print(k,len(v))
    
drop_n=0
kept = [Path(p).relative_to(flickr_root) for p in kept]
flickr_paths_pure = []
for dp in flickr_paths:
    pure = sum([p == Path(dp) for p in kept])
    if not pure:
        drop_n+=1
    else:
        flickr_paths_pure.append(dp)
print('dropped {} paths'.format(drop_n))

samsung_s4 19
htc_m7 4
iphone_6 241
nexus_6 82
iphone_4s 85
moto_x 13
moto_maxx 22
nexus_5x 23
sony_nex7 137
samsung_note3 3
dropped 629 paths


In [12]:
flickr_train_split = 0.8
flickr_max_model_samples = 1e6 #min(count)
print('max samples', flickr_max_model_samples)

np.random.seed(2018)

flickr_train_set = []
flickr_valid_set = []

#flickr_fix_path = lambda p: (flickr_root / p).relative_to('..')

flickr_paths_m = defaultdict(list)
for path in flickr_paths_pure:
    flickr_paths_m[Path(path).parts[0]].append(path)

for m, paths in flickr_paths_m.items():
    paths = np.random.permutation(paths)[:min(flickr_max_model_samples, len(paths))]
    n_images = len(paths)
    train_paths = paths[:int(n_images * flickr_train_split)]
    valid_paths = paths[int(n_images * flickr_train_split):]
    flickr_train_set.extend(train_paths)
    flickr_valid_set.extend(valid_paths)

max samples 1000000.0


In [13]:
train_manip = [int(p in low_quality) for p in flickr_train_set]
valid_manip = [int(p in low_quality) for p in flickr_valid_set]

In [14]:
columns = ['fname', 'manip']
pd.DataFrame(dict(zip(columns, [flickr_train_set, train_manip])), columns=columns).to_csv(
    str(sets_root / 'flickr_train.csv'), index=None)
pd.DataFrame(dict(zip(columns, [flickr_valid_set, valid_manip])), columns=columns).to_csv(
    str(sets_root / 'flickr_valid.csv'), index=None)

In [None]:
#exif_per_model['iphone_4s'][0]

In [None]:
m = [Path(p).parts[0] for p in low_quality]
np.unique(m,return_counts=True)

# REVIEWS

In [228]:
train_split = 0.8

np.random.seed(2018)

reviews_models = []
reviews_paths = []

fix_path = lambda p: p.relative_to(reviews_root)

m = 0
for class_dir in reviews_root.iterdir():
    if not class_dir.is_dir():
        continue
    print(class_dir)
    reviews_paths1 = [path for path in class_dir.glob('*.jpg')]
    reviews_paths1.extend([path for path in class_dir.glob('*.JPG')])
    n_images = len(reviews_paths1)
    
    reviews_paths1 = [fix_path(p) for p in reviews_paths1]
    reviews_paths.extend(reviews_paths1)
    
    reviews_models.extend([m] * n_images)
    m += 1

../data/external/val_images/iphone_6
../data/external/val_images/sony_nex7
../data/external/val_images/moto_x
../data/external/val_images/samsung_note3
../data/external/val_images/nexus_6
../data/external/val_images/samsung_s4
../data/external/val_images/htc_m7
../data/external/val_images/nexus_5x
../data/external/val_images/moto_maxx
../data/external/val_images/iphone_4s


In [213]:
reviews_exifs = collect_exif(reviews_root)

../data/external/val_images/iphone_6
skip ../data/external/val_images/iphone_6/urls_dpreview
../data/external/val_images/sony_nex7
skip ../data/external/val_images/sony_nex7/urls_dpreview
../data/external/val_images/moto_x
skip ../data/external/val_images/moto_x/urls_dpreview
../data/external/val_images/samsung_note3
skip ../data/external/val_images/samsung_note3/urls_dpreview
../data/external/val_images/nexus_6
skip ../data/external/val_images/nexus_6/urls_dpreview
../data/external/val_images/samsung_s4
skip ../data/external/val_images/samsung_s4/urls_dpreview
../data/external/val_images/htc_m7
skip ../data/external/val_images/htc_m7/urls_dpreview
../data/external/val_images/nexus_5x
skip ../data/external/val_images/nexus_5x/urls_engadget
../data/external/val_images/moto_maxx
skip ../data/external/val_images/moto_maxx/urls_phonearena
../data/external/val_images/iphone_4s
skip ../data/external/val_images/iphone_4s/urls_anandtech


In [214]:
output_props(reviews_exifs, 'Image Software')

=== htc_m7 ===

=== sony_nex7 ===
Adobe Photoshop Camera Raw 6.6 (Macintosh) x 2
Adobe Photoshop Camera Raw 6.6 (Windows) x 3
Image Data Converter x 1
NEX-7 v1.00 x 42

=== iphone_6 ===
8.0 x 10
8.0.2 x 20

=== samsung_note3 ===
N900PVPUBMI5 x 18

=== moto_maxx ===

=== iphone_4s ===
5.0 x 1

=== nexus_6 ===
HDR+ 1.0.76504408 x 9

=== nexus_5x ===

=== moto_x ===

=== samsung_s4 ===
Adobe Photoshop CS5 Macintosh x 26
L720VPUAMDC x 6
L720VPUAMDL x 12



In [229]:
kept,dropped=clean_by_sw(reviews_exifs)
for k,v in dropped.items():
    print(k,len(v))
    
drop_n=0
kept = [Path(p).relative_to(reviews_root) for p in kept]
reviews_paths_pure = []
for dp in reviews_paths:
    pure = sum([p == Path(dp) for p in kept])
    if not pure:
        drop_n+=1
    else:
        reviews_paths_pure.append(dp)
print('dropped {} paths'.format(drop_n))

sony_nex7 6
samsung_s4 26
dropped 32 paths


In [232]:
pd.DataFrame({'fname': [str(p) for p in reviews_paths_pure]}).to_csv(
    str(sets_root / 'reviews.csv'), index=None)

# FLICKR NEW

In [19]:
#pickle.dump(collect_exif(flickr_new_root/'good_jpgs'), open(str(flickr_new_root / 'flickr_exifs.pkl'), 'wb'))
flickr_new_exifs = pickle.load(open(str(flickr_new_root / 'flickr_exifs.pkl'), 'rb'))

In [20]:
output_props(flickr_new_exifs, 'Image Software')

=== htc_m7 ===
Adobe Photoshop 7.0 x 2
Adobe Photoshop CS3 Windows x 2
Adobe Photoshop Lightroom 4.0 (Macintosh) x 1
Adobe Photoshop Lightroom 5.6 (Macintosh) x 5
Adobe Photoshop Lightroom 6.7 (Windows) x 10
Adobe Photoshop Lightroom Classic 7.0 (Windows) x 1
Aperture 3.6 x 5
Aviary for Android 3.5.0 x 1
Aviary for Android 3.6.2 x 1
Google x 2
Nero PhotoSnap 1, 2, 0, 23 x 2
Picasa x 91
Snapseed 1.6 x 1
Snapseed 2.0 x 1
VSCOcam Android Version: v2.2.1 (76)  x 2
VSCOcam Android Version: v2.3 (80)  x 1
VSCOcam Android Version: v2.3.1 (87)  x 1
VSCOcam Android Version: v2.4.2 (111) x 2
VSCOcam Android Version: v3.0.1 (122) x 6
VSCOcam Android Version: v3.0.3 (129) x 1
VSCOcam Android Version: v3.0.4 (133) x 10
VSCOcam Android Version: v3.1.1 (149) x 8
VSCOcam Android Version: v3.1.2 (160) x 9

=== nexus_6 ===
Adobe Photoshop CC (Macintosh) x 1
Adobe Photoshop CC (Windows) x 1
Adobe Photoshop CC 2015 (Windows) x 1
Adobe Photoshop CC 2015.5 (Macintosh) x 1
Adobe Photoshop CC 2015.5 (Windows)

In [45]:
kept,dropped=clean_by_sw(flickr_new_exifs)
for k,v in dropped.items():
    print(k,len(v))
    
drop_n=0
kept = [str(Path(p).relative_to(flickr_new_root)) for p in kept]
flickr_paths_pure = []
for dp in [str(Path(p)) for p in (flickr_new_root/'good_jpgs').read_text().splitlines()]:
    if dp not in kept:
        drop_n+=1
    else:
        flickr_paths_pure.append(dp)
print('dropped {} paths'.format(drop_n))

htc_m7 163
nexus_6 116
iphone_6 243
samsung_s4 32
nexus_5x 28
samsung_note3 18
moto_maxx 55
iphone_4s 413
sony_nex7 753
dropped 1821 paths


In [54]:
max_count = 1000
model_counts = defaultdict(int)

flickr_paths_balanced = []
for p in flickr_paths_pure:
    model = (Path(p).parts[0])
    model_counts[model] += 1
    if model_counts[model] > max_count:
        continue
    flickr_paths_balanced.append(p)

In [56]:
pd.DataFrame({'fname': flickr_paths_balanced}).to_csv(
    str(sets_root / 'flickr_new.csv'), index=None)

with open(str((flickr_new_root/'good_jpgs_refined')),'w') as f:
    [f.write(p+'\n') for p in flickr_paths_balanced]