# Plant Seedlings Classification

https://www.kaggle.com/c/plant-seedlings-classification

In [None]:
import os
import cv2
import glob
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
from tensorflow import keras

In [None]:
def _process_image(path, dsize=(128, 128)):
    '''Load an image, shift its channels by fixed offsets and resize it.

    Args:
        path: Path of the image file to read.
        dsize: Target size handed to ``cv2.resize``.

    Returns:
        The resized image as a float32 array with the channel axis
        reversed relative to what ``cv2.imread`` returned.

    Raises:
        IOError: If OpenCV could not read or decode ``path``.
    '''
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        # Fail here with the offending path rather than with an opaque
        # "'NoneType' object is not subscriptable" on the next line.
        raise IOError('Cannot read image: {!r}'.format(path))
    # Reverse the channel axis (OpenCV loads BGR, so this yields RGB).
    img = img[:, :, ::-1].astype(np.float32)
    # Per-channel offsets; presumably the training-set channel means
    # (TODO confirm where these numbers come from).
    img -= [83.693283426971817, 72.796752160982763, 51.88953774401763]
    # Bring every image to one common size.
    return cv2.resize(img, dsize)

def read_train_labels(path='train'):
    '''Collect the training image paths and their class labels.

    The directory layout is expected to be ``<path>/<label>/<name>.png``;
    the label of an image is the name of the directory that contains it.

    Args:
        path: Root directory of the training set.

    Returns:
        A DataFrame with columns ``label`` (categorical dtype) and ``path``,
        one row per PNG file, ordered by path.
    '''
    pattern = os.path.join(path, '*', '*.png')
    # glob returns files in arbitrary, filesystem-dependent order. Sorting
    # makes the frame, and any seeded split of it, reproducible.
    files = sorted(glob.glob(pattern))
    rows = [(os.path.basename(os.path.dirname(entry)), entry)
            for entry in files]
    df = pd.DataFrame(data=rows, columns=('label', 'path'))
    df['label'] = df['label'].astype('category')
    return df

def load_test_data(path='test', dsize=(128, 128)):
    '''Load every PNG image found directly inside ``path``.

    Args:
        path: Directory holding the test images.
        dsize: Target size forwarded to ``_process_image``.

    Returns:
        A tuple ``(X, y)`` where ``X`` is an array stacking the processed
        images and ``y`` is the list of the corresponding file names
        (without directory), in the same order.
    '''
    # Sorted so the rows come out in the same order on every run.
    paths = sorted(glob.glob(os.path.join(path, '*.png')))
    # Forward dsize: previously it was accepted but silently ignored.
    X = [_process_image(entry, dsize) for entry in paths]
    y = [os.path.basename(entry) for entry in paths]
    return np.array(X), y

def _to_categorical(label, categories):
    '''One-hot encode ``label``.

    Args:
        label: Key to look up in ``categories``.
        categories: Mapping from label to its integer position.

    Returns:
        A float32 vector of length ``len(categories)`` that is 1 at the
        position of ``label`` and 0 elsewhere.
    '''
    index = categories[label]
    vector = np.zeros((len(categories),), dtype=np.float32)
    vector[index] = 1.0
    return vector

def generator(df, categories, batch_size=24, dsize=(128, 128)):
    '''Endlessly yield shuffled batches of images and one-hot labels.

    Each pass reshuffles ``df`` and yields ``len(df) // batch_size`` full
    batches; rows left over after the last full batch are skipped for that
    pass.

    Args:
        df: DataFrame whose first column is the label and whose second
            column is the image path.
        categories: Mapping from label to integer class index.
        batch_size: Number of samples per batch.
        dsize: Target image size forwarded to ``_process_image``.

    Yields:
        Tuples ``(X, y)`` of stacked images and stacked one-hot vectors.

    Raises:
        ValueError: On the first ``next()`` if ``batch_size`` is not
            positive or exceeds ``len(df)``. Without this check the loop
            below would spin forever without ever yielding.
    '''
    if batch_size < 1:
        raise ValueError(
            'batch_size must be positive, got {}'.format(batch_size))
    n_batches = len(df) // batch_size
    if n_batches < 1:
        raise ValueError(
            'batch_size ({}) exceeds the number of samples ({})'.format(
                batch_size, len(df)))

    while True:
        # Reshuffle the data at the start of every pass.
        df = shuffle(df)
        # Materialise the rows once per pass. df.values builds a new array
        # on every access, which was previously done twice per sample.
        rows = df.values
        for i in range(n_batches):
            batch = rows[i * batch_size:(i + 1) * batch_size]
            X = [_process_image(row[1], dsize) for row in batch]
            y = [_to_categorical(row[0], categories) for row in batch]
            yield np.array(X), np.array(y)

## Загружаем данные для обучения

In [1]:
!ls

all.zip               sample_submission.csv train
plant.ipynb           submission.csv        train.zip
plant.tar.gz          test
plant_weights.h5      test.zip


In [None]:
# Discover the training images together with their class labels.
df = read_train_labels()

# Class name -> integer index, numbered in sorted class-name order.
categories = {
    label: index
    for index, label in enumerate(sorted(df['label'].cat.categories))
}

# Reverse lookup: integer index -> class name.
inv_categories = {index: label for label, index in categories.items()}

In [None]:
# Show how many rows (image paths) were collected for each label.
df.groupby('label').count()

## Архитектура сверточной сети

In [None]:
### https://www.kaggle.com/miklgr500/keras-simple-model-0-97103-best-public-score

# Fully connected block
def dense_set(inp_layer, n, activation, drop_rate=0):
    '''Apply Dropout -> Dense -> BatchNormalization -> Activation.

    Args:
        inp_layer: Input tensor of the block.
        n: Number of units of the Dense layer.
        activation: Activation applied after batch normalisation.
        drop_rate: Dropout rate applied to the input.

    Returns:
        The output tensor of the activation layer.
    '''
    x = keras.layers.Dropout(drop_rate)(inp_layer)
    x = keras.layers.Dense(n)(x)
    x = keras.layers.BatchNormalization(axis=-1)(x)
    return keras.layers.Activation(activation=activation)(x)

# Convolutional block
def conv_layer(feature_batch, feature_map, kernel_size=(3, 3), strides=(1, 1), zp_flag=False):
    '''Apply (optional zero padding) -> Conv2D -> BatchNormalization -> LeakyReLU.

    Args:
        feature_batch: Input tensor of the block.
        feature_map: Number of convolution filters.
        kernel_size: Convolution kernel size.
        strides: Convolution strides.
        zp_flag: When true, zero-pad the input by one pixel on each side
            before the convolution.

    Returns:
        The output tensor of the LeakyReLU layer.
    '''
    x = feature_batch
    if zp_flag:
        x = keras.layers.ZeroPadding2D((1, 1))(x)
    x = keras.layers.Conv2D(filters=feature_map,
                            kernel_size=kernel_size,
                            strides=strides)(x)
    x = keras.layers.BatchNormalization(axis=3)(x)
    return keras.layers.LeakyReLU(1/10)(x)

# Build the model
def get_model():
    '''Build and compile the convolutional classifier.

    The network consists of three stages of ``conv_layer`` blocks
    (2 x 64, 2 x 128 and 3 x 256 filters), each stage followed by a
    3x3 / stride-2 max pooling, then Flatten, a 128-unit tanh ``dense_set``
    and a 12-unit softmax ``dense_set``.

    Returns:
        A keras Model with a (128, 128, 3) input, compiled with the Adam
        optimizer, categorical cross-entropy loss and the accuracy metric.
    '''
    inp_img = keras.layers.Input(shape=(128, 128, 3))

    # (filters, number of conv blocks) for each stage.
    # NOTE(review): no padding is requested anywhere, so assuming Keras'
    # default 'valid' padding the spatial size shrinks roughly
    # 128 -> 61 -> 28 -> 10 across the stages; confirm with model.summary().
    stages = ((64, 2), (128, 2), (256, 3))

    x = inp_img
    for filters, n_convs in stages:
        for _ in range(n_convs):
            x = conv_layer(x, filters, zp_flag=False)
        x = keras.layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2))(x)

    # Classification head.
    x = keras.layers.Flatten()(x)
    x = dense_set(x, 128, activation='tanh')
    out = dense_set(x, 12, activation='softmax')

    model = keras.models.Model(inputs=inp_img, outputs=out)

    adam = keras.optimizers.Adam(lr=0.5 * 1e-2,
                                 beta_1=0.9,
                                 beta_2=0.999,
                                 epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    return model

In [None]:
# Build the compiled network and print its layer-by-layer summary.
model = get_model()
model.summary()

## Запускаем обучение модели

In [None]:
# Hold out 33% of the labelled images for validation.
train_df, test_df = train_test_split(df,
                                     test_size=0.33,
                                     random_state=123)

# Training callbacks.
# Scale the learning rate by 0.1 once 'val_acc' has stopped improving for
# 5 epochs.
lr_reduce = keras.callbacks.ReduceLROnPlateau(monitor='val_acc',
                                              factor=0.1,
                                              epsilon=1e-5,
                                              patience=5,
                                              verbose=1)

# Write the model to 'plant_weights.h5', keeping only the best epoch.
checkpoint = keras.callbacks.ModelCheckpoint('plant_weights.h5',
                                             save_best_only=True,
                                             verbose=1)
callbacks = [lr_reduce, checkpoint]

# Load all validation images into memory as one single batch.
test_gen = generator(test_df, categories, batch_size=len(test_df))
test_X, test_y = next(test_gen)

# Set up the training data stream.
batch_size = 128
train_generator = generator(train_df, categories, batch_size=batch_size)
# generator() yields len(train_df) // batch_size full batches per pass over
# the data. Use the same integer here: true division produced a float that
# did not match the number of batches in one pass.
steps_per_epoch = len(train_df) // batch_size

# Resume from an earlier checkpoint when one exists.
if os.path.exists('plant_weights.h5'):
    model.load_weights('plant_weights.h5')

# Run the training.
model.fit_generator(train_generator,
                    steps_per_epoch,
                    epochs=30,
                    shuffle=True,
                    validation_data=(test_X, test_y),
                    callbacks=callbacks)

## Оцениваем качество предсказания

In [None]:
from sklearn.metrics import classification_report

# Reduce the one-hot targets and the model outputs to class indices.
actual = np.argmax(test_y, -1)
predicted = np.argmax(model.predict(test_X), -1)

# Print the per-class report for the held-out validation images.
print(classification_report(actual, predicted))

## Применяем обученную модель

In [None]:
# Build a fresh model and load the saved weights
model = get_model()
model.load_weights(filepath='plant_weights.h5')

# Load the test images together with their file names
test_imgs, test_path = load_test_data()

# Predict a class index per image, map it back to the class name and
# write the result to 'submission.csv'
pred = np.argmax(model.predict(test_imgs, verbose=1), axis=-1)
pred_df = pd.DataFrame({'file': test_path,
                        'species': [inv_categories[p] for p in pred]})
pred_df.to_csv('submission.csv', index=False, header=True)

# F1=0.79219

In [None]:
!head submission.csv