In [2]:
import gc
import re
import os
import keras
import numpy as np
import pandas as pd
import time

from PIL import Image

from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from itertools import combinations
from sklearn import metrics

%matplotlib inline

In [2]:
# Seed NumPy's global RNG so the shuffles and random draws below are repeatable.
np.random.seed(999)

In [3]:
# Identifier of this run; appended to the output directories below.
RUN = 'O'

# Width of one input row: 2 * (3 + 5 * 2048) = 20486 values.
# Presumably two half-samples of (3 + 5 * 2048) values each -- confirm against the cache.
SAMPLE_SIZE = 2 * (3 + 5 * 2048)

# TensorBoard log directory and model checkpoint directory for this run.
TFB_DIR = '/tmp-persistent/painters3/' + RUN
MODELS_DIR = '/d3/caches/kaggle-painters-v3/models/' + RUN

In [4]:
# Create this run's checkpoint directory (and any missing parents) if it is absent.
if not os.path.isdir(MODELS_DIR):
    os.makedirs(MODELS_DIR)

In [5]:
# Probability that a generated sample is a same-artist pair;
# the remaining samples are different-artist pairs.
SAME_ARTIST_PROB = 0.55

# Validation set: number of samples and the file paths handed to gen_datafile.
VAL_N_SAMPLES = 125000
VAL_SAMPLES_FILE = 'out/X_val.mem'
VAL_YS_FILE = 'out/y_val.mem'

# File from which the half-samples cache is loaded.
HSS_CACHE_FILE = 'out/halfsamples.npy'

TRAIN_N_PER_BATCH = 320
# Largest multiple of the batch size that does not exceed 500000.
TRAIN_N_SAMPLES_PER_EPOCH = (500000 // TRAIN_N_PER_BATCH) * TRAIN_N_PER_BATCH

TRAIN_N_EPOCHS = 11111

print ('TRAIN_N_SAMPLES_PER_EPOCH', TRAIN_N_SAMPLES_PER_EPOCH)
print ('TRAIN_N_EPOCHS', TRAIN_N_EPOCHS)

TRAIN_N_SAMPLES_PER_EPOCH 499840
TRAIN_N_EPOCHS 11111


In [6]:

# Load the training metadata and index it by numeric file id ("fid"):
# the run of digits immediately before the ".jpg" extension of each filename.
info_df = pd.read_csv('train_info.csv')
# The dot is escaped so that only a literal ".jpg" suffix matches
# (an unescaped "." would accept any character in that position).
info_df['fid'] = [int(re.findall(r'(\d+)\.jpg$', x)[0]) for x in info_df['filename']]
info_df = info_df.set_index('fid', drop=True)

# Every file id available for building pairs.
avail_fids = np.array(info_df.index.values, dtype=np.int32)
print ("# fids in training set: %d"%(len(avail_fids)))

# fids in training set: 79433


In [7]:
# Generate every unordered same-artist pair of file ids.
# Result: same_artist_fids, an int32 array with one (fid, fid) pair per row.
artists = np.unique(info_df.artist.values)
same_artist_chunks = []

for a in tqdm(artists):
    info_artist_df = info_df[info_df['artist'] == a]

    # Shuffle a *copy* of this artist's ids. Shuffling `index.values` in place
    # would scramble the buffer backing a pandas Index, which pandas treats as
    # immutable. For a 1-D array, permutation() copies and then shuffles.
    artist_fids = np.random.permutation(info_artist_df.index.values)

    # Convert each artist's pairs straight to a compact int32 block instead of
    # accumulating millions of Python tuples. reshape(-1, 2) keeps the shape
    # (0, 2) for artists with a single painting, which have no pairs.
    same_artist_chunks.append(
        np.array(list(combinations(artist_fids, 2)), dtype=np.int32).reshape(-1, 2)
    )

if same_artist_chunks:
    same_artist_fids = np.concatenate(same_artist_chunks)
else:
    # No artists at all: keep a well-formed, empty pair array.
    same_artist_fids = np.empty((0, 2), dtype=np.int32)
del same_artist_chunks

print ("# same-artist pairs: %d"%(len(same_artist_fids)))

100%|██████████| 1584/1584 [00:03<00:00, 420.91it/s]

# same-artist pairs: 5773652





In [8]:
# read halfsamples cache
# .item() unwraps the single Python object stored in the .npy file (it is
# indexed by fid below). Loading such an object array requires unpickling,
# which NumPy >= 1.16.3 refuses unless allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load cache files
# that you generated yourself.
hss_cache = np.load(HSS_CACHE_FILE, allow_pickle=True).item()

# create sample from bottlenecks, dpi and ARs of a pair
def create_pair_sample(fid1, fid2, hss_cache=hss_cache):
    """Build one sample by stacking the cached half-samples of two files.

    fid1, fid2 -- keys into `hss_cache`.
    hss_cache  -- mapping from fid to half-sample; defaults to the module-level
                  cache as it was when this function was defined.

    Returns np.hstack of the two half-samples, fid1's first.
    """
    first_half, second_half = hss_cache[fid1], hss_cache[fid2]
    return np.hstack((first_half, second_half))

In [9]:
# same pair generation

# Running count (minus one) of same-artist pairs served so far; -1 = none yet.
same_artist_ix = -1

def get_same_pair(flip_on_even_pass=False):
    """Return the next same-artist pair, cycling through `same_artist_fids`.

    Advances the module-level cursor `same_artist_ix` by one per call. Whenever
    the cursor wraps to the start of the pair array, the array is reshuffled in
    place (and a message is printed) before a pair is taken.

    flip_on_even_pass -- when true, pairs served during the 2nd, 4th, ... pass
                         over the array are returned with their elements swapped.
    """
    global same_artist_ix
    same_artist_ix += 1

    n_pairs = len(same_artist_fids)
    position = same_artist_ix % n_pairs

    if position == 0:
        # beginning of a pass: shuffle the pairs in place
        print ('shuffling same-artist pairs...')
        np.random.shuffle(same_artist_fids)

    chosen = same_artist_fids[position]

    # An odd number of completed passes means we are in the 2nd, 4th, ... pass.
    in_flipped_pass = (same_artist_ix // n_pairs) % 2 == 1
    if flip_on_even_pass and in_flipped_pass:
        return np.flipud(chosen)
    return chosen

In [10]:
# diff pair generation

# Registry of different-artist pairs already handed out:
# diff_pairs_used[a][b] is True once the pair (a, b) -- in either order -- was used.
diff_pairs_used = {}
def is_diff_pair_used(p, add=True):
    """Report whether the pair `p` was already used, optionally recording it.

    p   -- indexable with two hashable ids, p[0] and p[1]; order is irrelevant.
    add -- when true and the pair is new, record it (in both directions) so
           that later calls return True for it.

    Returns True if the pair was recorded by an earlier call, else False.
    Both ids get an (initially empty) registry entry even when add is False.

    Uses the `in` operator instead of dict.has_key(), which no longer exists
    in Python 3.
    """
    first, second = p[0], p[1]

    used_with_first = diff_pairs_used.setdefault(first, {})
    used_with_second = diff_pairs_used.setdefault(second, {})

    if second in used_with_first or first in used_with_second:
        return True

    if add:
        used_with_first[second] = True
        used_with_second[first] = True

    return False

def get_different_pair():
    """Draw a random pair of file ids belonging to two different artists.

    Keeps sampling two ids from `avail_fids` until they are distinct, their
    artists (looked up in `info_df`) differ, and `is_diff_pair_used` reports
    the pair as new -- which also records it as used.

    Returns the accepted pair as a 2-element array.
    """
    while True:
        pair = np.random.choice(avail_fids, 2, replace=True)
        if pair[0] == pair[1]:
            continue
        # Label-based lookup by fid. `.loc` replaces `.ix`, which was removed
        # in pandas 1.0.
        if info_df.loc[pair[0], 'artist'] == info_df.loc[pair[1], 'artist']:
            continue
        if is_diff_pair_used(pair, add=True):
            continue
        return pair

In [11]:
def gen_datafile(X_f=None, y_f=None, n_samples=10, memfiles=False):
    """Generate `n_samples` labelled pair samples.

    Each sample is a same-artist pair with probability SAME_ARTIST_PROB
    (label [1., 0.]) and a different-artist pair otherwise (label [0., 1.]).

    X_f, y_f  -- file paths for the memory-mapped outputs; used only when
                 `memfiles` is true.
    n_samples -- number of rows to generate.
    memfiles  -- when true, write into np.memmap files at X_f / y_f; otherwise
                 build ordinary in-memory arrays.

    Returns (Xs, ys): float32 arrays of shape (n_samples, SAMPLE_SIZE) and
    (n_samples, 2).

    Raises ValueError if `memfiles` is true but X_f or y_f is missing.
    """
    if memfiles:
        if X_f is None or y_f is None:
            raise ValueError('X_f and y_f are required when memfiles=True')
        # memory-mapped records store
        print ('Creating ', X_f, y_f, '...')
        time.sleep(0.5)
        Xs = np.memmap(X_f, dtype=np.float32, mode='w+', shape=(n_samples,SAMPLE_SIZE))
        ys = np.memmap(y_f, dtype=np.float32, mode='w+', shape=(n_samples, 2))
    else:
        Xs = np.zeros([n_samples, SAMPLE_SIZE], dtype=np.float32)
        ys = np.zeros([n_samples, 2], dtype=np.float32)

    for i in range(n_samples):

        if np.random.rand() < SAME_ARTIST_PROB:
            y = [1., 0.]
            pair = get_same_pair()
        else:
            y = [0., 1.]
            pair = get_different_pair()

        Xs[i] = create_pair_sample(pair[0], pair[1])
        ys[i] = y

    if memfiles:
        # push the memory-mapped contents out to their files
        ys.flush()
        Xs.flush()
        gc.collect()

    return Xs, ys

In [12]:
# gen validation file
# `memfiles` is left at its default (False), so gen_datafile builds the arrays
# in memory and the two file paths are passed along but not written to.
X_val, y_val = gen_datafile(X_f=VAL_SAMPLES_FILE, y_f=VAL_YS_FILE, n_samples=VAL_N_SAMPLES)

shuffling same-artist pairs...


In [13]:
# remove portion of same-artists pair used for validation from training data
n_used_for_val = same_artist_ix + 1
print ('Same-artist combinations used in validation set:', n_used_for_val)

# Keep only the pairs after the ones already served, then rewind the cursor so
# the next get_same_pair() call starts a fresh (reshuffled) pass over them.
same_artist_fids = same_artist_fids[n_used_for_val:]
same_artist_ix = -1

print ('Same-artist combinations left:', len(same_artist_fids))

Same-artist combinations used in validation set: 68568
Same-artist combinations left: 5705084


In [14]:
# training data generator
def generate_sample():
    """Yield an endless stream of training batches.

    Every item is whatever gen_datafile returns for TRAIN_N_PER_BATCH samples,
    generated lazily, one batch per request.
    """
    while True:
        batch = gen_datafile(n_samples=TRAIN_N_PER_BATCH)
        yield batch

--- *Training*

In [15]:
def score_auc():
    """Compute ROC AUC of the module-level `model` on the validation set.

    Predicts on all of X_val, replaces NaN/inf predictions via np.nan_to_num,
    and scores the first prediction column against the first column of y_val.
    """
    n_val = X_val.shape[0]
    predictions = np.nan_to_num(model.predict(X_val[0:n_val], verbose=False))
    expected_first_col = y_val[0:n_val].T[0]
    return metrics.roc_auc_score(expected_first_col, predictions.T[0])

class MyCallback(keras.callbacks.Callback):
    """Keras callback that tracks validation ROC AUC.

    Scores the model once before training starts and again after every epoch.
    Each score is appended to the module-level `scores` list and printed.
    """
    def _validate(self):
        """Compute the validation AUC via score_auc(), record it and print it."""
        s = score_auc()
        scores.append(s)
        print ("\n\n AUC = %.5f\n"%s)
        # Short pause after printing -- presumably so the message is not
        # interleaved with Keras' own progress output; confirm before removing.
        time.sleep(.5)
    def on_train_begin(self, logs=None):
        # Keras passes only `logs` to this hook, so the signature mirrors the
        # base class rather than declaring an `epoch` parameter.
        self._validate()
    def on_epoch_end(self, epoch, logs=None):
        self._validate()
        gc.collect()

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv1D
from keras.regularizers import l2, activity_l2

# Fully connected classifier over one pair sample (SAMPLE_SIZE inputs):
# four ReLU layers, each followed by dropout, then a 2-way softmax matching
# the two-column labels.
model = Sequential()
model.add(Dense(4096, input_dim=SAMPLE_SIZE, activation='relu', init='glorot_uniform'))
model.add(Dropout(0.55))
model.add(Dense(4096, activation='relu', init='glorot_uniform'))
model.add(Dropout(0.25))
model.add(Dense(2048, activation='relu', init='glorot_uniform'))
model.add(Dropout(0.25))
model.add(Dense(1024, activation='relu', init='glorot_uniform'))
model.add(Dropout(0.25))
model.add(Dense(2, activation='softmax'))

model.compile(
    loss='binary_crossentropy',
    optimizer='SGD',
    metrics=['accuracy'],
)

In [None]:
# Validation AUC history; MyCallback appends one value per validation run.
scores = list()

In [None]:
# train
# Callbacks: AUC tracking, TensorBoard logging, and a checkpoint saved after
# every epoch (save_best_only=False) with the metrics embedded in the filename.
training_callbacks = [
    MyCallback(),
    keras.callbacks.TensorBoard(log_dir=TFB_DIR, histogram_freq=0),
    keras.callbacks.ModelCheckpoint(
        MODELS_DIR + '/e{epoch:02d}-l={loss:.5f}-vl={val_loss:.5f}-a={acc:.5f}-va={val_acc:.5f}.h5',
        monitor='val_acc',
        verbose=0,
        save_best_only=False,
        save_weights_only=False,
        mode='auto',
    ),
]

hist = model.fit_generator(
    generate_sample(),
    samples_per_epoch=TRAIN_N_SAMPLES_PER_EPOCH,
    nb_epoch=TRAIN_N_EPOCHS,
    validation_data=(X_val, y_val),
    verbose=True,
    max_q_size=100,
    nb_worker=1,
    pickle_safe=False,
    callbacks=training_callbacks,
)



 AUC = 0.51403

Epoch 1/11111
shuffling same-artist pairs...