In [None]:
!pip install --user git+https://github.com/bolein/keras_img_iterator.git --upgrade

In [None]:
!pip install h5py --upgrade

In [None]:
# Now restart your kernel with Kernel -> Restart

In [None]:
# !!! DO NOT RE-DOWNLOAD EVERY TIME !!!
# Download the data (needed only once!)

## Load Libraries
import os
import requests, zipfile, io

# Download and unpack the dataset, but only when the target directory does not
# exist yet, so re-running this cell does not fetch the archive again.
# Delete the 'data' directory to force a fresh download.
data_dir = 'data/'
if not os.path.isdir(data_dir):
    response = requests.get('https://he-s3.s3.amazonaws.com/media/hackathon/deep-learning-challenge-1/identify-the-objects/a0409a00-8-dataset_dp.zip')
    # Fail loudly on HTTP errors instead of handing an error page to ZipFile.
    response.raise_for_status()
    # The context manager closes the archive once extraction is finished.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(data_dir)

In [None]:
# Check that the files have been downloaded: list the contents of the 'data'
# directory (the extraction target of the download cell).
os.listdir('data')

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np

from keras.models import save_model, load_model
from keras.preprocessing.image import ImageDataGenerator
from keras_img_iterator import SingleDirectoryIterator

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.regularizers import l2


def convnet(num_classes, image_size):
    """Build the (uncompiled) convolutional classifier used in this notebook.

    The network stacks four 3x3 convolution + 2x2 max-pooling stages
    (32, 32, 64 and 64 filters; the two 32-filter stages use an L2 kernel
    penalty of 0.001), followed by a dropout/dense head that ends in a
    softmax layer.

    Args:
        num_classes: number of units in the final softmax layer.
        image_size: side length of the square input; the input shape is
            (image_size, image_size, 3).

    Returns:
        A keras Sequential model. It is not compiled here.
    """
    conv_options = dict(activation='relu', padding='same')
    pool = (2, 2)

    net = Sequential()

    # Stages 1-2: 32 filters with L2-regularised kernels.
    net.add(Conv2D(32, (3, 3), input_shape=(image_size, image_size, 3),
                   kernel_regularizer=l2(0.001), **conv_options))
    net.add(MaxPooling2D(pool_size=pool))
    net.add(Conv2D(32, (3, 3), kernel_regularizer=l2(0.001), **conv_options))
    net.add(MaxPooling2D(pool_size=pool))

    # Stages 3-4: 64 filters, no kernel regularisation.
    for _ in range(2):
        net.add(Conv2D(64, (3, 3), **conv_options))
        net.add(MaxPooling2D(pool_size=pool))

    # Classifier head: flatten the feature maps, then dropout -> dense pairs.
    net.add(Flatten())
    for drop_rate, units in ((0.3, 1024), (0.3, 512)):
        net.add(Dropout(drop_rate))
        net.add(Dense(units, activation='relu'))
    net.add(Dropout(0.5))
    net.add(Dense(num_classes, activation='softmax'))
    return net


In [2]:
# Load the metadata
# Make sure the download step has produced data/train_sample.csv
meta_data = pd.read_csv('data/train_sample.csv', header=0)
filenames = meta_data['image_id'].apply(lambda image_id: image_id + '.png').values
labels = meta_data['label'].values

# Sort the unique labels so the label -> index mapping is the same in every
# kernel session. A plain set has no guaranteed order, so a model reloaded from
# disk could otherwise have its output indices decoded to the wrong labels.
# NOTE(review): models saved while the order came from list(set(labels)) may
# use a different index mapping -- retrain or re-check them.
classes = sorted(set(labels))

# split into training and validation subsets (80% / 20%, fixed seed)
files_train, files_validate, labels_train, labels_validate = \
    train_test_split(filenames, labels, test_size=0.2, random_state=42)

num_train_samples = files_train.shape[0]
num_val_samples = files_validate.shape[0]
num_classes = len(classes)

In [3]:
# Batch and image geometry shared by every iterator below
batch_size = 32
image_size = 128

# Training images: rescale pixel values by 1/255 and apply random augmentation
train_gen = ImageDataGenerator(
    horizontal_flip=True,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    rescale=1. / 255)

# Validation (and later test) images: rescaling only, no augmentation
test_gen = ImageDataGenerator(rescale=1. / 255)

# Iterator over the training split
train_iterator = SingleDirectoryIterator(
    directory='data/train_img/',
    filenames=files_train,
    labels=labels_train,
    classes=classes,
    target_size=(image_size, image_size),
    batch_size=batch_size,
    image_data_generator=train_gen,
    seed=1337)

# Iterator over the validation split (same directory, different file list)
validation_iterator = SingleDirectoryIterator(
    directory='data/train_img/',
    filenames=files_validate,
    labels=labels_validate,
    classes=classes,
    target_size=(image_size, image_size),
    batch_size=batch_size,
    image_data_generator=test_gen,
    seed=1337)

Found 2572 files belonging to 25 classes.
Found 643 files belonging to 25 classes.


In [6]:
# Build a fresh network and compile it for multi-class training:
# Adam optimizer, categorical cross-entropy loss, accuracy as the metric.
model = convnet(num_classes, image_size)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [4]:
# Alternative to building a new network: restore a previously saved model.
# Only run this cell if the .h5 file below already exists (the "save model"
# cell at the end of the notebook writes files named 'model_<score>.h5').
# NOTE(review): this rebinds `model`, so running it after the compile cell
# discards the freshly initialised network -- run one cell or the other.
model = load_model('model_0.321674930874.h5', compile = True)

In [None]:
# Train the model
num_epochs = 40

# Start both iterators from the beginning of their file lists
validation_iterator.reset()
train_iterator.reset()

# Steps are computed with ceiling division: the number of batches needed to see
# every sample once. The previous `n // batch_size + 1` scheduled one extra
# batch whenever n was an exact multiple of batch_size.
history = model.fit_generator(
    train_iterator,
    steps_per_epoch=(num_train_samples + batch_size - 1) // batch_size,
    epochs=num_epochs,
    validation_data=validation_iterator,
    validation_steps=(num_val_samples + batch_size - 1) // batch_size)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
 42/215 [====>.........................] - ETA: 39s - loss: 1.8442 - acc: 0.4722

In [None]:
# Visualize learning
import matplotlib.pyplot as plt

# Draw one figure per tracked quantity (loss, then accuracy); in each figure
# the training curve is plotted first and the validation curve second.
for train_key, val_key, figure_title, y_label in (
        ('loss', 'val_loss', 'model loss', 'loss'),
        ('acc', 'val_acc', 'model accuracy', 'accuracy')):
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# Calculate f1-score against validation set

from sklearn.metrics import f1_score
from keras.utils import to_categorical

# Set up an unshuffled iterator over the validation files so that the rows of
# `predictions` are in the same order as labels_validate
metric_iterator = SingleDirectoryIterator(
    directory='data/train_img/',
    filenames=files_validate,
    image_data_generator=test_gen,
    batch_size=batch_size,
    target_size=(image_size, image_size),
    shuffle=False)

# Ceiling division gives exactly the number of batches that cover the
# validation set once. The previous `n // batch_size + 1` asked for one batch
# too many whenever n was a multiple of batch_size, which would yield more
# prediction rows than labels and break the f1_score call below.
predictions = model.predict_generator(
    generator=metric_iterator,
    steps=(num_val_samples + batch_size - 1) // batch_size)

# binarize validation labels
encode = np.vectorize(lambda label: classes.index(label)) # encode to integers
y_true = to_categorical(encode(labels_validate), num_classes) # encode to one-hot vectors

# most probable class per sample, re-encoded as one-hot to match y_true
int_labels = np.argmax(predictions, axis=1)
y_predicted = to_categorical(int_labels, num_classes)

score = f1_score(y_true, y_predicted, average='weighted')

print("model scored {} on validation set".format(score))

model scored 0.321674930874 on validation set


  'precision', 'predicted', average, warn_for)


In [None]:
# Preview the first five validation images together with the label the model
# predicted for each (these are the first five samples, not the five most
# confident predictions)
from keras.preprocessing.image import load_img
for sample_idx in range(5):
    predicted_name = classes[int_labels[sample_idx]]
    print('I see this product is ' + predicted_name)
    preview = load_img('data/train_img/' + files_validate[sample_idx])
    plt.imshow(preview)
    plt.show()

In [None]:
# Test the model: read the test-set metadata and derive the image file names
# by appending the '.png' extension to every image id
test_data = pd.read_csv('data/test.csv', header=0)
files_test = test_data['image_id'].apply(lambda image_id: image_id + '.png').values

In [None]:
# Set up an unshuffled iterator over the test images so that prediction rows
# keep the order of files_test
test_iterator = SingleDirectoryIterator(
    directory='data/test_img/',
    filenames=files_test,
    image_data_generator=test_gen,
    batch_size=batch_size,
    target_size=(image_size, image_size),
    shuffle=False)

# make prediction
# Ceiling division gives exactly the number of batches that cover the test set
# once. The previous `n // batch_size + 1` asked for one batch too many
# whenever n was a multiple of batch_size, producing more predictions than
# test rows and misaligning the submission.
num_test_samples = files_test.shape[0]
predictions = model.predict_generator(
    generator=test_iterator,
    steps=(num_test_samples + batch_size - 1) // batch_size)

# decode the most probable class index of every sample back to its label
test_labels = [classes[i] for i in np.argmax(predictions, axis=1)]

In [None]:
# function for downloading results
from IPython.display import HTML
import base64

def create_download_link(df, filename):
    """Return an HTML anchor that downloads `df` as a CSV file.

    The frame is serialised without its index, base64-encoded and embedded
    in a `data:text/csv` URI, so no file is written to disk.

    Args:
        df: object exposing `to_csv(index=False)` (a pandas DataFrame here).
        filename: used both as the suggested download name and as the
            visible link text.

    Returns:
        An IPython.display.HTML object wrapping the anchor tag.
    """
    encoded = base64.b64encode(df.to_csv(index=False).encode()).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{filename}</a>'
    return HTML(template.format(payload=encoded, filename=filename))

In [None]:
# Assemble the submission table (image id + predicted label) and render a
# download link for it; the CSV is embedded in the link, not written to disk
submission = pd.DataFrame({'image_id': test_data['image_id'], 'label': test_labels})
create_download_link(submission, "submission.csv")

In [None]:
# Save the model under a name that embeds the validation F1 score.
# Requires `score` from the F1 cell above, so run that cell first.
model_file = 'model_{}.h5'.format(score)
save_model(model, model_file)
# Format the message into one string: print with two arguments shows a tuple
# under Python 2, and this matches the print style used for the F1 score.
print('Training complete. model was saved as {}'.format(model_file))

('Training complete. model was saved as ', 'model_0.321674930874.h5')
