In [16]:
import numpy as np
import pandas as pd 
import keras
from keras.applications import VGG19
from keras.applications.imagenet_utils import preprocess_input
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Lambda, GlobalAveragePooling2D, Input
from keras.preprocessing import image

import os
from tqdm import tqdm

### Считываем данные в датафреймы

In [3]:
# Remember the working directory (image folders are resolved against it later)
# and load the training labels plus the sample submission, whose `id` column
# lists the test images.
data_dir = os.getcwd()
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('labels.csv', 'sample_submission.csv')
)

In [4]:
# Preview the first 10 rows of the training labels.
df_train.head(10)

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever
5,002211c81b498ef88e1b40b9abf84e1d,bedlington_terrier
6,00290d3e1fdd27226ba27a8ce248ce85,bedlington_terrier
7,002a283a315af96eaea0e28e7163b21b,borzoi
8,003df8b8a8b05244b1d920bb6cf451f9,basenji
9,0042188c895a2f14ef64a918ed9c7b64,scottish_deerhound


In [5]:
# Build the one-hot target matrix: one row per training image, one column per breed.
target_series = df_train['breed']
# A dense frame is used on purpose: the result is converted to a dense array
# right below, so a sparse intermediate would only add conversion overhead.
one_hot = pd.get_dummies(target_series)

# Pin the dtype explicitly instead of relying on get_dummies' default,
# which is not the same in every pandas version.
one_hot_labels = np.asarray(one_hot, dtype=np.uint8)

In [6]:
# Inspect the one-hot vector of the first training sample.
one_hot_labels[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0], dtype=uint8)

In [7]:
# Side length (in pixels) used for both image height and width when loading
# images and when declaring the network input shape.
IM_SIZE = 224

In [10]:
# Helper that loads an image from disk and converts it to a numpy array
# using keras.preprocessing.
def read_img(img_id, train_or_test, size):
    """Load one JPEG by id and return it as an array.

    # Arguments
        img_id: string, file name without the '.jpg' extension.
        train_or_test: string, sub-directory of `data_dir` to read from
            ('train' or 'test').
        size: forwarded to `image.load_img` as `target_size`.
    # Returns
        The output of `image.img_to_array` for the loaded image.
    """
    img_path = os.path.join(data_dir, train_or_test, '%s.jpg' % img_id)
    return image.img_to_array(image.load_img(img_path, target_size=size))

In [None]:
# Build the training arrays.
# Every image goes through the same `preprocess_input` transform that the
# test-image cell applies, so the network sees train and test data in the
# same value range / channel layout.
x_train = np.zeros((len(df_train), IM_SIZE, IM_SIZE, 3), dtype=np.float32)
# The labels are already one-hot encoded; one casted copy replaces the per-row copy.
y_train = one_hot_labels.astype(np.uint8)
# tqdm wraps the Series (which has a length) rather than the enumerate object,
# so it can display a total and an ETA.
for i, img_id in enumerate(tqdm(df_train['id'])):
    img = read_img(img_id, 'train', (IM_SIZE, IM_SIZE))
    # preprocess_input is fed a batch of one image; [0] drops the batch axis again.
    x_train[i] = preprocess_input(np.expand_dims(img, axis=0))[0]

print('Train Images shape: {} size: {:,}'.format(x_train.shape, x_train.size))

8438it [02:15, 62.32it/s] 

In [13]:
# Feature extraction with a pre-trained Keras application model.
def get_features(MODEL, data=None):
    """Run `data` through a pre-trained network and return pooled features.

    # Arguments
        MODEL: callable that builds the base network; it is invoked with
            `include_top=False`, an (IM_SIZE, IM_SIZE, 3) input shape and
            `weights='imagenet'` (e.g. `VGG19`).
        data: images to run through the network. When omitted, the
            module-level `x_train` is used.
    # Returns
        The output of `predict` on the base network followed by global
        average pooling, computed in batches of 64.
    """
    if data is None:
        # Resolve the default at call time. A `data=x_train` default would be
        # evaluated once at definition time, keeping that array alive for as
        # long as the function exists and going stale if `x_train` is rebuilt.
        data = x_train

    base_model = MODEL(include_top=False, input_shape=(IM_SIZE, IM_SIZE, 3), weights='imagenet')

    inputs = Input((IM_SIZE, IM_SIZE, 3))
    pooled = GlobalAveragePooling2D()(base_model(inputs))
    extractor = Model(inputs, pooled)

    return extractor.predict(data, batch_size=64, verbose=1)

In [14]:
# Extract pooled VGG19 features for the training images.
features = get_features(VGG19, x_train)



In [15]:
# Number of target classes = number of columns in the one-hot label matrix.
num_class = y_train.shape[1]

In [16]:
# Classification head on top of the extracted features:
# dropout followed by a softmax layer with one unit per class.
inputs = Input(features.shape[1:])
dropped = Dropout(0.5)(inputs)
x = Dense(num_class, activation='softmax')(dropped)
model = Model(inputs, x)

In [17]:
# Optimizer: stochastic gradient descent with momentum and learning-rate decay.
sgd = keras.optimizers.SGD(
    lr=0.01,
    momentum=0.9,
    decay=5e-4,
)

In [18]:
# Compile the model with the SGD optimizer, cross-entropy loss and accuracy metric.
model.compile(
    optimizer=sgd,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# Print the layer-by-layer summary of the classification head.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 512)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 120)               61560     
Total params: 61,560
Trainable params: 61,560
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Callbacks driven by the validation loss: stop training early after 10
# epochs without improvement, and multiply the learning rate by 0.1 after 5.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=1)
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, verbose=1)
callbacks = [early_stopping, reduce_lr]

In [22]:
# Train the classification head on the extracted features,
# holding out 10% of the samples for validation.
model.fit(
    features,
    y_train,
    batch_size=128,
    epochs=75,
    validation_split=0.1,
    callbacks=callbacks,
    verbose=1,
)

Train on 9199 samples, validate on 1023 samples
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 00031: reducing learning rate to 0.0009999999776482583.
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 00036: reducing learning rate to 9.999999310821295e-05.
Epoch 00036: early stopping


<keras.callbacks.History at 0x7f7a980239e8>

In [None]:
# Save the weights of `model` (the dropout + dense head trained above)
# to an .h5 file inside `data_dir`.
filepath = os.path.join(data_dir, "vgg19_imgnet_pretrained.h5")
model.save_weights(filepath)

In [None]:
# Prepare the test images: load, preprocess and stack them into one array.
x_test = np.zeros((len(df_test), IM_SIZE, IM_SIZE, 3), dtype='float32')
# tqdm wraps the Series (which has a length) rather than the enumerate object,
# so it can display a total and an ETA.
for i, img_id in enumerate(tqdm(df_test['id'])):
    img = read_img(img_id, 'test', (IM_SIZE, IM_SIZE))
    # A dedicated name keeps the global `x` (the output tensor left over from
    # building the model) intact; [0] drops the batch axis explicitly.
    batch = preprocess_input(np.expand_dims(img.copy(), axis=0))
    x_test[i] = batch[0]

print('Test Images shape: {} size: {:,}'.format(x_test.shape, x_test.size))

In [None]:
# Prediction.
# `model` was trained on pooled VGG19 features, not on raw images, so the
# test images must go through the same feature extractor before `predict`.
test_features = get_features(VGG19, x_test)
preds = model.predict(test_features, verbose=1)

In [None]:
# Turn the predictions into a pandas DataFrame for the CSV export:
# an `id` column followed by one column per breed, named after the one-hot columns.
col_names = one_hot.columns.values
sub = pd.DataFrame(preds, columns=col_names)

sub.insert(0, 'id', df_test['id'])
sub.head()

In [None]:
# Write the submission to disk without the DataFrame index column.
sub.to_csv('answers.csv', index=False)