# Tools

In [1]:
from PIL import Image

from os import listdir
from os.path import isfile, join

from tensorflow import keras
from tensorflow.keras import layers

import pandas as pd
import numpy as np
import json

import matplotlib.pyplot as plt

import keras, keras.layers as L
import tensorflow as tf

from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.layers.core import Activation, Flatten, Dropout, Dense
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.preprocessing import image
from keras.preprocessing.image import img_to_array
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Functions

In [2]:
def convert_image_to_array(img, image_size):
    """Resize a PIL image to a square and return its pixels as a NumPy array.

    Parameters
    ----------
    img : PIL.Image.Image
        Image to resize.
    image_size : int
        Side length, in pixels, of the square output.

    Returns
    -------
    numpy.ndarray or None
        Array built from the resized image. If resizing or conversion fails,
        the error is printed, the original image is drawn with matplotlib for
        inspection, and ``None`` is returned.
    """
    try:
        # Image.ANTIALIAS was an alias of the LANCZOS filter and was removed in
        # Pillow 10; Image.LANCZOS selects the same filter on old and new versions.
        return np.array(img.resize((image_size, image_size), Image.LANCZOS))
    except Exception as e:
        # Catch Exception rather than BaseException so that KeyboardInterrupt
        # and SystemExit still stop a long-running loading loop.
        print('Error!')
        print(e)
        plt.imshow(np.array(img) / 255)
        return None

# Get data

In [13]:
# Mount Google Drive at /content/drive (Colab-only: google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [15]:
import os
# List the top level of the mounted Drive.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory being /content (where the Drive was mounted) - confirm.
# NOTE(review): the saved output of this cell exposes personal file names;
# consider clearing it before sharing the notebook.
os.listdir('./drive/My Drive')

['lemon-dataset',
 'конкуренция (2).gsheet',
 'Копия конкуренция.gsheet',
 'конкуренция (1).gsheet',
 'конкуренция.gsheet',
 'IMG_7587.JPG',
 'IMG_7571.JPG',
 'IMG_7591.JPG',
 'IMG_7589.JPG',
 'IMG_7573.JPG',
 'IMG_7581.JPG',
 'IMG_7584.JPG',
 'IMG_7583.JPG',
 'IMG_7577.JPG',
 'IMG_7578.JPG',
 'IMG_7569.JPG',
 'IMG_7582.JPG',
 'IMG_7576.JPG',
 'IMG_7586.JPG',
 'IMG_7575.JPG',
 'IMG_7588.JPG',
 'IMG_7574.JPG',
 'IMG_7580.JPG',
 'IMG_7570.JPG',
 'IMG_7572.JPG',
 'IMG_7568.JPG',
 'IMG_7590.JPG',
 'IMG_7579.JPG',
 'IMG_7585.JPG',
 'CV_Dudukin_SPG.pdf',
 'CL_Dudukin_SPG.pdf',
 'Untitled document (1).gdoc',
 'желез.иванов (11.04.14).docx',
 'images (12).pdf',
 'images (11).pdf',
 'images (10).pdf',
 'images (9).pdf',
 'images (8).pdf',
 'Untitled document.gdoc',
 'images (7).pdf',
 'images (6).pdf',
 'images (5).pdf',
 'images (4).pdf',
 'images (3).pdf',
 'images (2).pdf',
 'images (1).pdf',
 'images.pdf',
 'SSRN_ID392043_code030408590.pdf',
 'image1.PNG',
 'Untitled map.gmap',
 'EY_case_Co

In [17]:
# Directories holding the train / test images on the mounted Drive.
train_dir = 'drive/My Drive/DS Чемпионат/Качество лимонов/lemon-dataset/images/train/'
test_dir = 'drive/My Drive/DS Чемпионат/Качество лимонов/lemon-dataset/images/test/'

# os.listdir returns entries in arbitrary order; sorting makes the file lists
# (and therefore the row order of anything built from them) reproducible.
images_train_filenames = sorted(f for f in listdir(train_dir) if isfile(join(train_dir, f)))
images_test_filenames = sorted(f for f in listdir(test_dir) if isfile(join(test_dir, f)))

In [18]:
# Directory holding the annotation file for the dataset.
annotations_dir = 'drive/My Drive/DS Чемпионат/Качество лимонов/lemon-dataset/annotations/'

# JSON text is UTF-8 by specification, so state the encoding explicitly instead
# of depending on the platform's default text encoding.
with open(annotations_dir + 'instances_default.json', 'r', encoding='utf-8') as fp:
    data = json.load(fp)

In [19]:
# Tabulate the 'annotations' records of the loaded JSON, one row per record.
df_annotations = pd.DataFrame(data['annotations'])

In [20]:
# Tabulate the 'images' records of the loaded JSON, one row per record.
df_images = pd.DataFrame(data['images'])

In [None]:
%%time

mapper = dict()

for index, row in df_annotations.iterrows():
  if mapper.get(row['image_id'], None) is None:
    mapper[row['image_id']] = set()
  mapper[row['image_id']].add(row['category_id'])

CPU times: user 1.17 s, sys: 0 ns, total: 1.17 s
Wall time: 1.17 s


In [None]:
# Side length (pixels) of the square that images are resized to; the model's
# input shape is derived from the same value.
image_size = 256

In [None]:
%%time

X_train, y_train = [], []

for index, row in df_images.iterrows():
  im = Image.open(train_dir + row['file_name'])
  im_array = convert_image_to_array(im, image_size = image_size)
  cat = np.zeros(9, dtype = int)
  for x in mapper[row['id']]:
    cat[x - 1] = 1
  X_train.append(im_array)
  y_train.append(cat)

CPU times: user 1min, sys: 906 ms, total: 1min 1s
Wall time: 1min 4s


In [None]:
# Turn the accumulated Python lists into ndarrays (images, then labels).
X_train, y_train = np.array(X_train), np.array(y_train)

In [None]:
# Sanity check: display the shapes of the image array and the label array.
X_train.shape, y_train.shape

((1984, 256, 256, 3), (1984, 9))

In [None]:
%%time

X_test = []
id_test = []

for filename in images_test_filenames:
  im = Image.open(test_dir + filename)
  im_array = convert_image_to_array(im, image_size)
  X_test.append(im_array)
  id_test.append(filename)

CPU times: user 13.7 s, sys: 136 ms, total: 13.9 s
Wall time: 14.7 s


In [None]:
# Turn the list of per-image arrays into a single ndarray.
X_test = np.array(X_test)

# Model

In [None]:
def get_model():
  """Build and compile the convolutional multi-label classifier.

  The network is five 3x3 "same"-padded convolutions (32, 64, 64, 128, 128
  filters), each followed by ReLU and batch normalisation, with max-pooling
  plus dropout after the first, third and fifth convolution. A 1024-unit dense
  layer feeds a 9-unit sigmoid output. The input shape is taken from the
  module-level ``image_size``.

  Returns:
    A compiled Sequential model using binary cross-entropy, the 'adam'
    optimizer and two ROC-AUC metrics (pooled and multi-label).
  """
  def conv_relu_bn(filters, **conv_kwargs):
    # Layers are created in Conv -> ReLU -> BatchNorm order, matching the
    # order in which they are added to the model.
    return [
        Conv2D(filters, (3, 3), padding="same", **conv_kwargs),
        Activation("relu"),
        BatchNormalization(axis=-1),
    ]

  def pool_then_dropout(window):
    # Spatial down-sampling followed by light regularisation.
    return [MaxPooling2D(pool_size=(window, window)), Dropout(0.25)]

  model = keras.models.Sequential()

  stack = []

  # Feature extractor.
  stack += conv_relu_bn(32, input_shape=[image_size, image_size, 3])
  stack += pool_then_dropout(3)

  stack += conv_relu_bn(64)
  stack += conv_relu_bn(64)
  stack += pool_then_dropout(2)

  stack += conv_relu_bn(128)
  stack += conv_relu_bn(128)
  stack += pool_then_dropout(2)

  # Classifier head.
  stack += [
      Flatten(),
      Dense(1024),
      Activation("relu"),
      BatchNormalization(),
      Dropout(0.5),
  ]

  # One independent sigmoid output per class.
  stack += [Dense(9), Activation("sigmoid")]

  for layer in stack:
    model.add(layer)

  auc_metrics = [
      keras.metrics.AUC(name='roc_auc', curve = 'ROC'),
      keras.metrics.AUC(name='roc_auc_multi_label', curve = 'ROC', multi_label=True),
  ]
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=auc_metrics)

  return model

In [None]:
# Random augmentations applied to the training batches.
augmentation_options = dict(
    rotation_range=25,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest",
)
aug = ImageDataGenerator(**augmentation_options)

In [None]:
# Build a new compiled network.
model = get_model()

# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 256, 256, 32)      896       
_________________________________________________________________
activation (Activation)      (None, 256, 256, 32)      0         
_________________________________________________________________
batch_normalization (BatchNo (None, 256, 256, 32)      128       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 85, 85, 32)        0         
_________________________________________________________________
dropout (Dropout)            (None, 85, 85, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 85, 85, 64)        18496     
_________________________________________________________________
activation_1 (Activation)    (None, 85, 85, 64)        0

In [None]:
%%time

BS = 32
EPOCHS = 8

history = model.fit_generator(
    aug.flow(X_train, y_train, batch_size=BS),
    steps_per_epoch=len(X_train) // BS,
    epochs=EPOCHS, verbose=1)

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
CPU times: user 1h 44min 46s, sys: 50.8 s, total: 1h 45min 37s
Wall time: 54min 23s


In [None]:
%%time

y_preds = model.predict_proba(X_test)

for i in range(len(y_preds)):
  y_preds[i] = np.array(y_preds[i] >= 0.5)

y_preds = np.array(y_preds, dtype = int)

Instructions for updating:
Please use `model.predict()` instead.
CPU times: user 37.9 s, sys: 308 ms, total: 38.3 s
Wall time: 19.7 s


In [None]:
# Assemble the submission table: the image file name followed by one 0/1 column
# per class, named '1'..'9'. The labels are converted to strings explicitly
# rather than relying on np.concatenate's implicit int-to-string promotion when
# an id array and a numeric array are joined.
label_columns = list(map(str, range(1, 10)))
y_preds = pd.DataFrame(data = np.asarray(y_preds).astype(str), index = None, columns = label_columns)
y_preds.insert(0, 'image_id', id_test)
y_preds.to_csv('drive/My Drive/DS Чемпионат/Качество лимонов/y_preds.csv', index = None)