# Задача

По заданной выборке изображений составить бинарный классификатор, имеющий две категории: __Крокодилы__ и __Часы__

# Решения

Будем пробовать несколько моделей

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import cv2
import os
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit, train_test_split, GridSearchCV

Считаем данные

In [2]:
CROC_PATH = 'data/crocodile/'
CLOCK_PATH = 'data/clock/'
DEBUG = False
croc_dict = dict()
clock_dict = dict()
croc_data = np.array([cv2.imread(CROC_PATH + i) for i in os.listdir(CROC_PATH)])
clock_data = np.array([cv2.imread(CLOCK_PATH + i) for i in os.listdir(CLOCK_PATH)])
'''
for i in os.listdir(CROC_PATH):
    if DEBUG: print('Reading image `{}`'.format(i))
    arr = cv2.imread(CROC_PATH + i)
    croc_dict[i.replace('.png', '')] = arr
    croc_data = np.append(croc_data, arr)
for i in os.listdir(CLOCK_PATH):
    arr = cv2.imread(CLOCK_PATH + i)
    clock_dict[i.replace('.png', '')] = arr
    np.append(croc_data, arr)
'''

"\nfor i in os.listdir(CROC_PATH):\n    if DEBUG: print('Reading image `{}`'.format(i))\n    arr = cv2.imread(CROC_PATH + i)\n    croc_dict[i.replace('.png', '')] = arr\n    croc_data = np.append(croc_data, arr)\nfor i in os.listdir(CLOCK_PATH):\n    arr = cv2.imread(CLOCK_PATH + i)\n    clock_dict[i.replace('.png', '')] = arr\n    np.append(croc_data, arr)\n"

Построим простые модели и оценим их качество.

### Линейная модель (можно пропустить при выполнении кода)

In [3]:
ans = np.concatenate((np.ones(clock_data.shape[0]), np.zeros(croc_data.shape[0])))
data = np.concatenate((clock_data.reshape(clock_data.shape[0], -1), croc_data.reshape(croc_data.shape[0], -1)))
train_data, test_data, train_labels, test_labels = train_test_split(data, ans, test_size=0.3, random_state=0)
cv = StratifiedShuffleSplit(n_splits=10, random_state=0)

In [4]:
reg = LogisticRegression(random_state=0)
param_grid = {'penalty': ['l1', 'l2'], 'C': [1, 10, 100]}
grid_cv = GridSearchCV(reg, param_grid, scoring='accuracy', cv=cv, n_jobs=4)
%time grid_cv.fit(train_data, train_labels)

CPU times: user 1.76 s, sys: 96.4 ms, total: 1.86 s
Wall time: 58.3 s


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size='default',
            train_size=None),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [5]:
grid_cv.best_estimator_

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
grid_cv.best_score_

0.7057142857142857

Результаты не очень.

### Случайный лес (можно пропустить)

In [7]:
reg = RandomForestClassifier(random_state=0, n_jobs=4)
param_grid = {'n_estimators': [600],
              'max_depth': [30], 'criterion': ['entropy']}
grid_cv = GridSearchCV(reg, param_grid, scoring='accuracy', cv=cv, n_jobs=4)
%time grid_cv.fit(train_data, train_labels)

CPU times: user 23.3 s, sys: 63.1 ms, total: 23.3 s
Wall time: 1min 2s


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size='default',
            train_size=None),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [600], 'max_depth': [30], 'criterion': ['entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [8]:
grid_cv.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=600, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
grid_cv.best_score_

0.8242857142857143

### Градиентный бустинг над случайными лесами (тоже можно пропустить)

In [13]:
import xgboost as xgb

In [14]:
reg = xgb.XGBClassifier(random_state=0, learning_rate=0.005, n_jobs=4, n_estimators=300)
#param_grid = {'n_estimators': [100, 200, 400, 600],
#              'learning_rate': [0.001, 0.005, 0.1]}
param_grid = {'n_estimators': [400],
              'learning_rate': [0.005]}
grid_cv = GridSearchCV(reg, param_grid, scoring='accuracy', cv=cv, n_jobs=4)
%time grid_cv.fit(train_data, train_labels)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


CPU times: user 1min 6s, sys: 182 ms, total: 1min 6s
Wall time: 10min 31s


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=0, test_size='default',
            train_size=None),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.005, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=4, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [400], 'learning_rate': [0.005]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [15]:
reg.fit(train_data, train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.005, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=4, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [16]:
cross_val_score(reg, test_data, test_labels)

  if diff:
  if diff:
  if diff:


array([0.76, 0.77, 0.81])

## А теперь изюминка нашего проекта: CNN

In [17]:
from keras.models import Model
from keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten
from keras.utils import np_utils

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [18]:
ans = np.concatenate((np.ones(clock_data.shape[0]), np.zeros(croc_data.shape[0])))
data = np.concatenate((clock_data, croc_data))
train_data, test_data, train_labels, test_labels = train_test_split(data, ans, test_size=0.2, random_state=42)
#cv = StratifiedShuffleSplit(n_splits=10, random_state=0)

In [19]:
train_data.shape

(800, 32, 32, 3)

In [29]:
# Параметры модели. С ними надо будет поиграться
batch_size = 21
num_epochs = 91
kernel_size = 3
pool_size = 2
conv_depth_1 = 32
conv_depth_2 = 32
drop_prob_1 = 0.3
drop_prob_2 = 0.4
hidden_size = 128

In [30]:
num_train, depth, height, width = train_data.shape
num_test = test_data.shape[0]
num_classes = np.unique(train_labels).shape[0]

In [31]:
train_data = train_data.astype('float32')
test_data = test_data.astype('float32')
train_data /= np.max(train_data)
test_data /= np.max(test_data)

In [32]:
Y_train = np_utils.to_categorical(train_labels, num_classes)
Y_test = np_utils.to_categorical(test_labels, num_classes)

In [33]:
inp = Input(shape=(depth, height, width))

# Conv [32] -> Conv [32] -> Pool (with dropout on the pooling layer)
conv_1 = Convolution2D(conv_depth_1, (kernel_size, kernel_size), padding='same', activation='relu')(inp)
#conv_2 = Convolution2D(conv_depth_1, (kernel_size, kernel_size), padding='same', activation='relu')(conv_1)
pool_1 = MaxPooling2D(pool_size=(pool_size, pool_size))(conv_1)
drop_1 = Dropout(drop_prob_1)(pool_1)

# Conv [64] -> Conv [64] -> Pool (with dropout on the pooling layer)
conv_3 = Convolution2D(conv_depth_2, (kernel_size, kernel_size), padding='same', activation='relu')(drop_1)
#conv_4 = Convolution2D(conv_depth_2, (kernel_size, kernel_size), padding='same', activation='relu')(conv_3)
pool_2 = MaxPooling2D(pool_size=(pool_size, pool_size))(conv_3)
drop_2 = Dropout(drop_prob_2)(pool_2)

# Now flatten to 1D, apply FC -> ReLU (with dropout) -> softmax
flat = Flatten()(drop_2)
hidden = Dense(hidden_size, activation='relu')(flat)
drop_3 = Dropout(drop_prob_2)(hidden)
out = Dense(num_classes, activation='softmax')(drop_3)

In [34]:
model = Model(input=inp, output=out)

  """Entry point for launching an IPython kernel.


In [35]:
model.compile(loss='categorical_crossentropy', optimizer='adagrad', metrics=['accuracy'])

In [36]:
model.fit(train_data, Y_train, batch_size=batch_size, epochs=num_epochs, verbose=1, validation_split=0.1)

Train on 720 samples, validate on 80 samples
Epoch 1/91
Epoch 2/91
Epoch 3/91
Epoch 4/91
Epoch 5/91
Epoch 6/91
Epoch 7/91
Epoch 8/91
Epoch 9/91
Epoch 10/91
Epoch 11/91
Epoch 12/91
Epoch 13/91
Epoch 14/91
Epoch 15/91
Epoch 16/91
Epoch 17/91
Epoch 18/91
Epoch 19/91
Epoch 20/91
Epoch 21/91
Epoch 22/91
Epoch 23/91
Epoch 24/91
Epoch 25/91
Epoch 26/91
Epoch 27/91
Epoch 28/91
Epoch 29/91
Epoch 30/91
Epoch 31/91
Epoch 32/91
Epoch 33/91
Epoch 34/91
Epoch 35/91
Epoch 36/91
Epoch 37/91
Epoch 38/91
Epoch 39/91
Epoch 40/91
Epoch 41/91
Epoch 42/91
Epoch 43/91
Epoch 44/91
Epoch 45/91
Epoch 46/91
Epoch 47/91
Epoch 48/91
Epoch 49/91
Epoch 50/91
Epoch 51/91
Epoch 52/91
Epoch 53/91
Epoch 54/91
Epoch 55/91
Epoch 56/91
Epoch 57/91
Epoch 58/91
Epoch 59/91
Epoch 60/91
Epoch 61/91
Epoch 62/91


Epoch 63/91
Epoch 64/91
Epoch 65/91
Epoch 66/91
Epoch 67/91
Epoch 68/91
Epoch 69/91
Epoch 70/91
Epoch 71/91
Epoch 72/91
Epoch 73/91
Epoch 74/91
Epoch 75/91
Epoch 76/91
Epoch 77/91
Epoch 78/91
Epoch 79/91
Epoch 80/91
Epoch 81/91
Epoch 82/91
Epoch 83/91
Epoch 84/91
Epoch 85/91
Epoch 86/91
Epoch 87/91
Epoch 88/91
Epoch 89/91
Epoch 90/91
Epoch 91/91


<keras.callbacks.History at 0x7fd15f729dd8>

In [37]:
model.evaluate(test_data, Y_test, verbose=1)



[0.40107868909835814, 0.89]