In [None]:
import keras 

import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing

import cv2
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Attach Google Drive to this Colab runtime. The credentials cell below reads
# "/content/My Drive/kaggle.json", so the mount point has to stay "/content".
from google.colab import drive

drive_mount_point = '/content'
drive.mount(drive_mount_point)

In [None]:
# copy kaggle credentials json
!mkdir /root/.kaggle
!cp "/content/My Drive/kaggle.json" /root/.kaggle

Download the data for the Kaggle competition "__Humpback Whale Identification Challenge__" (competition slug: `whale-categorization-playground`).

In [None]:
!kaggle competitions download -c whale-categorization-playground
!unzip train.zip -d data
!unzip test.zip -d data
!ls data

In [None]:
# reading data
df = pd.read_csv("train.csv")

# encode labels
# Map every value of the "Id" column to an integer class index. Fit and
# transform in a single pass; the previous version discarded the result of
# fit_transform and then transformed the same data a second time.
label_encoder = preprocessing.LabelEncoder()
label_ids = label_encoder.fit_transform(df["Id"].values)

# One-hot encode the integer ids. OneHotEncoder expects a 2-D column, hence
# the reshape; .toarray() densifies the encoder's sparse result.
one_hot_encoder = preprocessing.OneHotEncoder()
labels = one_hot_encoder.fit_transform(label_ids.reshape(-1, 1)).toarray()

# Store one 1-D vector per row. Iterating the dense matrix yields its rows
# directly, which avoids converting the whole matrix to nested Python lists
# and then back to arrays.
df["Labels"] = list(labels)

df.head()

In [None]:
from keras.applications import VGG16
from keras import models, layers
from keras.preprocessing.image import ImageDataGenerator
from keras.metrics import \
  categorical_accuracy as cat_acc,\
  top_k_categorical_accuracy,\
  categorical_crossentropy

# competition metric
def top_5_acc(y_true, y_pred):
    """Top-5 categorical accuracy.

    Delegates to Keras' ``top_k_categorical_accuracy`` with ``k`` fixed at 5.

    Args:
        y_true: ground-truth targets, passed through unchanged.
        y_pred: model predictions, passed through unchanged.

    Returns:
        Whatever ``top_k_categorical_accuracy`` returns for ``k=5``.
    """
    top_k = 5
    return top_k_categorical_accuracy(y_true, y_pred, k=top_k)

# data generator
# Augmentation settings: scale pixel values by 1/255 and apply random
# rotations (up to 20 degrees) and horizontal/vertical shifts (up to 20% of
# the image size), filling uncovered pixels with the nearest value.
augmentation = dict(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    fill_mode='nearest',
)
data_gen = ImageDataGenerator(**augmentation)

# Stream shuffled batches of 32 RGB images, resized to 256x256, from
# data/train. File names come from the "Image" column and the categorical
# targets from the "Id" column of df.
train_gen = data_gen.flow_from_dataframe(
    dataframe=df,
    directory="data/train",
    x_col="Image",
    y_col="Id",
    target_size=(256, 256),
    color_mode="rgb",
    class_mode="categorical",
    batch_size=32,
    shuffle=True,
)

In [None]:
# create model
# Convolutional base: VGG16 without its fully connected top, sized for the
# 256x256 RGB images that train_gen produces.
features = VGG16(input_shape=(256, 256, 3),
                       include_top=False)

# The softmax layer must be exactly as wide as the one-hot targets yielded by
# train_gen, so the class count is taken from the generator itself rather
# than from a separately fitted encoder that could disagree with it (for
# example if the generator drops rows).
num_classes = len(train_gen.class_indices)

model = models.Sequential()

# Re-use every VGG16 layer as the front of the sequential model.
for layer in features.layers:
  model.add(layer)

# Classifier head: two Dense -> BatchNorm -> ReLU stages, dropout, softmax.
model.add(layers.Flatten())
model.add(layers.Dense(64))
model.add(layers.BatchNormalization())
model.add(layers.ReLU())
model.add(layers.Dense(128))
model.add(layers.BatchNormalization())
model.add(layers.ReLU())
model.add(layers.Dropout(0.625))
model.add(layers.Dense(num_classes,
                       activation='softmax'))

# Compile the model
# 'acc' and top_5_acc are reported per epoch; their names are the keys later
# read from history.history.
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Nadam(),
              metrics=['acc', top_5_acc])

model.summary()

In [None]:
# Train the model
# steps_per_epoch is a count of batches, so it must be an integer. Use ceiling
# division so the final, partially filled batch is still part of each epoch
# (the previous true division produced a float).
steps_per_epoch = (train_gen.samples + train_gen.batch_size - 1) // train_gen.batch_size

history = model.fit_generator(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=100,
    verbose=1)

In [None]:
# Per-epoch training metrics recorded by fit_generator. No validation data is
# passed to training, so all of these curves describe the training set only.
acc = history.history['acc']
top5_acc = history.history['top_5_acc']
loss = history.history['loss']

epochs = range(len(acc))

# Accuracy figure: top-1 and top-5 accuracy on the training data.
plt.plot(epochs, acc, 'b', label='Training acc')
plt.plot(epochs, top5_acc, 'r', label='Training top-5 acc')
plt.title('Training accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Loss figure, drawn on a separate canvas.
plt.figure()

plt.plot(epochs, loss, 'b', label='Training loss')
plt.title('Training loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()