In [4]:
import re

import numpy as np
import pandas as pd

import os

from tqdm import tqdm
import gc

from sklearn.utils import shuffle
from sklearn.metrics import log_loss

import feather

In [5]:
df_train = pd.read_json('data/train.json')
df_test = pd.read_json('data/test.json')

In [6]:
df_train.interest_level.value_counts()

low       34284
medium    11229
high       3839
Name: interest_level, dtype: int64

In [7]:
interest_level_map = {'low': 0, 'medium': 1, 'high': 2}
df_train.interest_level = df_train.interest_level.apply(lambda x: interest_level_map.get(x, -1))

In [8]:
def image_path_full(url):
    path = url[url.rfind('/')+1:]
    return '/' + path[0:7] + '/' + path

In [9]:
np.random.seed(1)
folds = np.random.choice([0, 1, 2, 3, 4, 5], size=len(df_train))
df_train['fold'] = folds.astype('uint8')

In [10]:
images_train = []

for row in df_train.itertuples():
    id = row.listing_id
    fold = row.fold
    lev = row.interest_level
    for p in row.photos:
        path = image_path_full(p)
        images_train.append((id, path, lev, fold))

In [11]:
df_images_train = pd.DataFrame(images_train, columns=['listing_id', 'image', 'interest_level', 'fold'])
df_images_train.sample(n=5)

Unnamed: 0,listing_id,image,interest_level,fold
274272,6825469,/6825469/6825469_821bbf62fe3aa31d02c280e32699f...,0,2
156522,7075598,/7075598/7075598_f0d4610b19b77bfba1a9aabb46992...,1,2
90125,7116399,/7116399/7116399_a7de1b4810c351badb6a3a77a5578...,2,2
146039,7008495,/7008495/7008495_137a6915dd2a7688423cba6f2f87f...,0,3
98000,7230905,/7230905/7230905_34ca4dff89629bd3aa25e7016287a...,0,4


In [None]:
#from keras.applications.inception_v3 import InceptionV3
#from keras.applications.inception_v3 import preprocess_input

from keras.applications.vgg16 import VGG16, preprocess_input

from keras.preprocessing import image
from keras.models import Model

#from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense

from keras import backend as K

In [13]:
def load_img(impath):
    if not os.path.exists(impath):
        return None

    try:
        #img = image.load_img(impath, target_size=(224, 224))
        img = image.load_img(impath, target_size=(224, 224))
        x = image.img_to_array(img)
        return preprocess_input(x)
    except:
        return None

In [14]:
failures = []

for impath in tqdm(df_images_train.image):
    new_path = 'processed_vgg' + impath + '.npy'
    if os.path.exists(new_path):
        continue

    img = load_img('images' + impath)
    if img is None:
        failures.append(impath)
        continue    

    folder = os.path.dirname(new_path)
    if not os.path.exists(folder):
        os.mkdir(folder)

    np.save(new_path, img)

100%|██████████| 276714/276714 [25:28<00:00, 181.04it/s] | 7/276714 [00:00<1:07:40, 68.14it/s] 77%|███████▋  | 212290/276714 [20:46<06:18, 170.28it/s]


In [15]:
failures = set(failures)
df_images_train = df_images_train[~df_images_train.image.isin(failures)]
feather.write_dataframe(df_images_train, 'tmp/df_images_train.feather')

ValueError: cannot serialize column 1 named image with dtype empty

In [5]:
df_images_train = feather.read_dataframe('tmp/df_images_train.feather')

In [None]:
fold012 = df_images_train[df_images_train.fold.isin([0, 1, 2])].reset_index(drop=1)

In [None]:
y_proto = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)


In [18]:
def prepare_batches(df, n):
    for i in range(0, len(df), n):
        yield df.iloc[i:i+n]

def df_image_generator(df, n=32, seed=0):
    i = 0
    while True:
        df = shuffle(df, random_state=(seed + i))

        batches = prepare_batches(df, n)

        for batch in batches:
            batch_res = []
            batch_label = []

            for impath, label in zip(batch.image, batch.interest_level):
                path = 'processed_vgg' + impath + '.npy'

                if not os.path.exists(path):
                    continue

                batch_res.append(np.load(path))
                batch_label.append(y_proto[label])

            if len(batch_res) > 0:
                batch_res = np.array(batch_res)
                batch_label = np.array(batch_label)
                yield batch_res, batch_label

        i = i + 1

In [19]:
base_model = InceptionV3(weights='imagenet', include_top=False)

In [None]:
top_model = Sequential()
top_model.add(Flatten(input_shape=model.output_shape[1:]))
top_model.add(Dense(256, activation='relu'))
top_model.add(Dropout(0.5))
top_model.add(Dense(3, activation='softmax'))

In [20]:
# add a global spatial average pooling layer
x = base_model.output
x = GlobalAveragePooling2D()(x)
# let's add a fully-connected layer
x = Dense(1024, activation='relu')(x)
# and a logistic layer -- let's say we have 200 classes
predictions = Dense(3, activation='softmax')(x)

# this is the model we will train
model = Model(input=base_model.input, output=predictions)

# first: train only the top layers (which were randomly initialized)
# i.e. freeze all convolutional InceptionV3 layers
for layer in base_model.layers:
    layer.trainable = False

# compile the model (should be done *after* setting layers to non-trainable)
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')



In [21]:
from math import ceil
steps = ceil(len(fold012) / 32)
steps

4298

In [None]:
from keras_tqdm import TQDMNotebookCallback

In [None]:
gen = df_image_generator(fold012, n=32, seed=1)

model.fit_generator(gen, steps_per_epoch=steps, epochs=10, 
                    verbose=0, callbacks=[TQDMNotebookCallback()])

In [None]:
fold3 = df_images_train[df_images_train.fold == 3].reset_index(drop=1)

val_imgs = []

for impath in tqdm(fold3.image):
    path = 'processed' + impath + '.npy'
    val_imgs.append(np.load(path))

val_imgs = np.array(val_imgs)
y_val = y_proto[fold3.interest_level.values]

100%|██████████| 45735/45735 [01:11<00:00, 642.02it/s]  | 52/45735 [00:00<01:27, 519.86it/s]


In [None]:
y_pred = model.predict(val_imgs)
log_loss(y_val, y_pred)