In [1]:
import os

import numpy as np
import pandas as pd

# import keras 
from sklearn import preprocessing, model_selection

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# mount your gdrive to colab instance
from google.colab import drive
drive.mount('/content/drive')

# copy kaggle credentials json
!mkdir /root/.kaggle
!cp "/content/drive/My Drive/whales/kaggle.json" /root/.kaggle

# copy dataset
!kaggle competitions download -c humpback-whale-identification

# unzipping
!mkdir -p /content/data
!cp sample_submission.csv train.csv /content/data

!unzip -q train.zip -d /content/data/train
!unzip -q test.zip -d /content/data/test

# checking
!ls /content/data

In [None]:
# set paths
PATH_TO_MY_DRIVE = "'/content/drive/My Drive'"
PATH_TO_DATA_COLAB = "/content/data"

MODEL_NAME = "mobilenet"

PATH_TO_MODEL_DATA_COLAB = os.path.join(PATH_TO_DATA_COLAB, MODEL_NAME)
os.makedirs(PATH_TO_MODEL_DATA_COLAB, exist_ok=True)

PATH_TO_MODEL_DATA_ON_MY_DRIVE = os.path.join(PATH_TO_MY_DRIVE + "/whales", MODEL_NAME)
PATH_TO_CHECKPOINTS_ON_MY_DRIVE = os.path.join(PATH_TO_MODEL_DATA_ON_MY_DRIVE, "checkpoints")

In [2]:
def labels_and_label_encoder(ids):
    
    values = np.array(ids)
    
    label_encoder = preprocessing.LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)
    
    onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
    labels = onehot_encoder.fit_transform(integer_encoded.reshape(-1, 1))
    
    return labels, label_encoder

In [21]:
# df = pd.read_csv(os.path.join(PATH_TO_DATA_COLAB, "train.csv"))
df = pd.read_csv("/data/workspace/all/train.csv")

df_new_whales = df[df.Id == "new_whales"]
df = df[~df.isin(df_new_whales).all(1)]

labels, label_encoder = labels_and_label_encoder(df.Id)

df["Labels"] = [np.array(l) for l in labels]

df.head()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


Unnamed: 0,Image,Id,Labels
0,0000e88ab.jpg,w_f48451c,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,0001f9222.jpg,w_c3d896a,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,00029d126.jpg,w_20df2c5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,00050a15a.jpg,new_whale,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,0005c1ef8.jpg,new_whale,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
from keras.applications import MobileNet
from keras import models
from keras.callbacks import \
    ReduceLROnPlateau, \
    EarlyStopping, \
    ModelCheckpoint, \
    TensorBoard
from keras.preprocessing.image import ImageDataGenerator
from keras.metrics import \
  categorical_accuracy as cat_acc,\
  top_k_categorical_accuracy,\
  categorical_crossentropy

# competition metric
def top_5_acc(y_true, y_pred):
    return top_k_categorical_accuracy(y_true, y_pred, k=5)

# train data generator
train_data_gen = ImageDataGenerator(
      rescale=1./255,
      rotation_range=20,
      width_shift_range=0.2,
      height_shift_range=0.2,
      fill_mode="nearest")

train_gen = train_data_gen.flow_from_dataframe(
    df, 
    os.path.join(PATH_TO_DATA_COLAB, "train"), 
    x_col="Image", 
    y_col="Id",
    target_size=(256, 256), 
    color_mode="rgb", 
    class_mode="categorical", 
    batch_size=32, 
    shuffle=True)

# callbacks
reduce_lr_callback = ReduceLROnPlateau(
    monitor="loss", 
    mode="min",
    factor=0.2,
    patience=3, 
    verbose=1,
    min_lr=1e-5)

early_stopping_callback = EarlyStopping(
    monitor="loss", 
    mode="min",
    patience=3, 
    verbose=1, 
    restore_best_weights=True)

checkpoint_callback = ModelCheckpoint(
    os.path.join(PATH_TO_CHECKPOINTS_ON_MY_DRIVE, MODEL_NAME + "_{epoch:02d}_{loss:.2f}.hdf5"), 
    monitor="loss", 
    mode="min",
    verbose=1, 
    save_best_only=True, 
    period=5)

tensorbard_callbacks = TensorBoard(
    log_dir="./logs", 
    batch_size=32)

In [None]:
# create model
model = MobileNet(input_shape=(256, 256, 3), 
                  alpha=1., 
                  weights=None, 
                  classes=len(one_hot_encoder.categories_[0]))

# Compile the model
model.compile(loss="categorical_crossentropy",
              optimizer=keras.optimizers.Nadam(),
              metrics=["acc", top_5_acc])

model.summary()

In [None]:
# Train the model
history = model.fit_generator(
    train_gen,
    steps_per_epoch=train_gen.samples / train_gen.batch_size,
    epochs=100,
    verbose=1,
    callbacks=[reduce_lr_callback,
               early_stopping_callback, 
               checkpoint_callback, 
               tensorbard_callbacks])

In [None]:
acc = history.history["acc"]
val_acc = history.history["top_5_acc"]
loss = history.history["loss"]
 
epochs = range(len(acc))
 
plt.plot(epochs, acc, "b", label="Training acc")
plt.title("Training and validation accuracy")
plt.legend()
 
plt.figure()
 
plt.plot(epochs, loss, "b", label="Training loss")
plt.title("Training and validation loss")
plt.legend()
 
plt.show()

In [None]:
# read test data
test = os.listdir(os.path.join(PATH_TO_DATA_COLAB, "test"))
print("Number of test samples: {}".format(len(test)))

# dataframe with empty predictions
test_df = pd.DataFrame(test, columns=["Image"])
test_df["Id"] = ""

# test data generator
test_data_gen = ImageDataGenerator(rescale=1./255)

test_gen = test_data_gen.flow_from_dataframe(
    test_df, 
    os.path.join(PATH_TO_DATA_COLAB, "test"), 
    x_col="Image", 
    target_size=(256, 256), 
    color_mode="rgb", 
    class_mode=None, 
    batch_size=128,
    shuffle=False)

# predictions
predictions = model.predict_generator(
    test_gen,
    steps=test_gen.samples / test_gen.batch_size,
    verbose=True)

# write a submission
for i, pred in enumerate(predictions):
    test_df.loc[i, "Id"] = " ".join(label_encoder.inverse_transform(pred.argsort()[-5:][::-1]))

test_df.to_csv(os.path.join(PATH_TO_MODEL_DATA_ON_MY_DRIVE, "submission.csv"), index=False)

In [None]:
# upload data to google drive
!kaggle competitions submit -c humpback-whale-identification -f submission.csv -m $MODEL_NAME