# Yelp Restaurant Photo Classification

https://www.kaggle.com/c/yelp-restaurant-photo-classification

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from tensorflow.contrib import keras

# Short aliases for the VGG16 input-preprocessing function and the VGG16 model
# class from the Keras copy imported above (tensorflow.contrib).
preprocess_input = keras.applications.vgg16.preprocess_input
VGG16 = keras.applications.VGG16

## Загружаем разметку

In [2]:
# Per-business label table and the photo -> business mapping for the train set.
train_biz_df = pd.read_csv('train.csv')
train_photos_df = pd.read_csv('train_photo_to_biz_ids.csv')
# merge() is called without `on=`, so pandas joins on whatever columns the two
# frames have in common (presumably `business_id` -- TODO confirm against the CSVs).
train_df = train_photos_df.merge(train_biz_df)

# Photo -> business mapping for the test set (no labels available).
test_photos_df = pd.read_csv('test_photo_to_biz.csv')

In [3]:
def get_image(path, img_id, img_size=(224, 224)):
    """Load one photo from disk and prepare it as network input.

    Args:
        path: Directory that contains the ``<img_id>.jpg`` files.
        img_id: Photo identifier; it is formatted with ``%s`` into the file name.
        img_size: Target size handed to ``cv2.resize``.

    Returns:
        The image after channel reversal, ``preprocess_input`` and resizing
        to ``img_size``.

    Raises:
        IOError: If OpenCV could not read the file (missing or unreadable).
    """
    filename = os.path.join(path, '%s.jpg' % img_id)
    img = cv2.imread(filename)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # without this check the slice below fails with an opaque TypeError
        # that does not say which file was the problem.
        raise IOError('Cannot read image: %s' % filename)
    # Reverse the channel axis before handing the pixels to preprocess_input.
    img = img[:, :, ::-1]
    img = preprocess_input(img.astype(np.float32))
    return cv2.resize(img, img_size)

def to_dense(labels, num_classes=9):
    """Convert an iterable of class indices into a multi-hot list.

    Args:
        labels: Iterable of integer class indices (a list or a lazy iterator
            such as ``map(int, ...)`` both work).
        num_classes: Length of the returned vector. Defaults to 9, the value
            that was previously hard-coded.

    Returns:
        A list of ``num_classes`` floats: 1.0 at every index present in
        ``labels`` and 0.0 everywhere else.

    Raises:
        IndexError: If an index is out of range for the vector.
    """
    # Start from float zeros so the vector has one element type even when no
    # label is set (previously an all-empty vector consisted of ints).
    result = [0.] * num_classes
    for i in labels:
        result[i] = 1.
    return result

def train_generator(df, img_size=(224, 224), batch_size=32):
    """Endlessly yield shuffled training batches for a 9-output model.

    Args:
        df: Frame with a ``photo_id`` column and a ``labels`` column holding
            space-separated class indices. A missing ``labels`` value is
            treated as "no classes set".
        img_size: Image size forwarded to ``get_image``.
        batch_size: Number of rows per batch. Rows that do not fill a whole
            batch are left out of that pass over the data.

    Yields:
        ``(X, targets)``: ``X`` is the array of batch images and ``targets``
        is a list of 9 arrays, one per class column of the label matrix.

    Raises:
        ValueError: If ``df`` has fewer rows than ``batch_size``. Previously
            this case looped forever without ever yielding a batch.
    """
    # Number of rows that fit into complete batches.
    n_usable = len(df) // batch_size * batch_size
    if n_usable == 0:
        raise ValueError(
            'train_generator needs at least batch_size=%d rows, got %d'
            % (batch_size, len(df)))
    while True:
        # Reshuffle on every pass over the data.
        shuffled = df.sample(frac=1).reset_index(drop=True)
        for start in range(0, n_usable, batch_size):
            X, y = [], []
            for _, row in shuffled[start:start + batch_size].iterrows():
                X.append(get_image('train_photos', row['photo_id'], img_size))
                raw_labels = row['labels']
                # An empty labels cell is read by pandas as NaN; str(NaN) is
                # 'nan' and int('nan') raises ValueError, so map it to "no labels".
                tokens = [] if pd.isnull(raw_labels) else str(raw_labels).split()
                y.append(to_dense([int(token) for token in tokens]))
            # Force a float matrix even if no row in the batch has a label.
            y = np.array(y, dtype=float)
            # Use a distinct name for the class index so it cannot be confused
            # with (or, on Python 2, overwrite) the batch offset.
            yield np.array(X), [y[:, cls] for cls in range(9)]

In [4]:
# TODO: build CNN model
# TODO: train CNN model using train_generator

## Получаем предсказания

In [7]:
# NOTE(review): `model` is not defined in any cell shown above (the model-building
# cell is still a TODO), so this line fails on a fresh kernel until that cell exists.
# Load weights from the 'yelp_weights.h5' file into the model.
model.load_weights('yelp_weights.h5')

In [10]:
class TestSequence(keras.utils.Sequence):
    """Batch reader: serves the photos listed in a frame one batch at a time."""

    def __init__(self, df, batch_size):
        self._batch_size = batch_size
        self._df = df

    def __len__(self):
        # Ceiling division so the last, possibly shorter, batch is counted too.
        n_batches = np.ceil(len(self._df) / float(self._batch_size))
        return int(n_batches)

    def __getitem__(self, idx):
        # Row window covered by batch number `idx`.
        first = idx * self._batch_size
        last = first + self._batch_size
        rows = self._df[first:last]
        images = [get_image('test_photos', row['photo_id'])
                  for _, row in rows.iterrows()]
        return np.array(images)

In [12]:
# Read the data in parallel with 4 workers and apply the model.
preds = model.predict_generator(
    TestSequence(test_photos_df, batch_size=100), workers=4)

In [24]:
# Put the 9 per-class outputs side by side so each row holds the 9 scores of
# one photo, then convert everything to Python floats in a single call instead
# of calling float() on every element in a nested Python loop.
scores = np.column_stack([preds[j] for j in range(9)]).astype(float)
# Store one list of 9 scores per photo, as before.
test_photos_df['labels'] = scores[:len(test_photos_df)].tolist()

In [29]:
test_photos_df.head()

Unnamed: 0,photo_id,business_id,labels
0,317818,003sg,"[0.15017016232, 0.720844268799, 0.786582291126..."
1,30679,003sg,"[0.358560830355, 0.313709139824, 0.35823640227..."
2,455084,003sg,"[0.306661188602, 0.248710289598, 0.37077081203..."
3,371381,003sg,"[0.0900943800807, 0.979245781898, 0.9886131286..."
4,86224,003sg,"[0.0362958088517, 0.917712509632, 0.9531230926..."


In [30]:
# Group the per-photo rows by business. Despite the `_df` suffix this is a
# pandas GroupBy object, not a DataFrame.
business_df = test_photos_df.groupby('business_id')

In [61]:
submission = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for business_id, photo_scores in business_df['labels'].apply(list).items():
    # Average the per-photo score vectors of this business, class by class.
    mean_scores = np.array(photo_scores).mean(axis=0)
    # Keep the indices of the classes whose mean score exceeds 0.5.
    predicted = np.where(mean_scores > 0.5)[0]
    submission.append((business_id, ' '.join(map(str, predicted))))

In [70]:
# Turn the (business_id, labels) tuples into a two-column frame and write it
# to 'submission.csv' without the index column.
submission_df = pd.DataFrame(submission, columns=('business_id', 'labels'))
submission_df.to_csv('submission.csv', index=False)
# 0.70970  (presumably the leaderboard score of this submission -- TODO confirm)