In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
from sklearn import datasets
%matplotlib inline

In [None]:
# import raw train data 
train_raw = pd.read_csv('train.csv')

In [None]:
# import raw test data 
test_raw = pd.read_csv('test.csv')

In [None]:
# Append the '.jpg' extension to training image paths that lack it (the test paths need no repair).
# The check is anchored to the end of the path: a substring test for 'jpg' would skip a path
# that merely contains those letters somewhere else and leave it without an extension.
# The mask is computed once, and re-running the cell is a no-op because repaired paths no longer match.
missing_ext = ~train_raw['image_path'].str.endswith('.jpg')
train_raw.loc[missing_ext, 'image_path'] = train_raw.loc[missing_ext, 'image_path'] + '.jpg'

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = train_raw[['title', 'image_path']]

In [None]:
y = train_raw['Category']

In [None]:
# split raw train data into train and validation set in stratified manner to balnace the categories in both data sets
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.33, stratify=y, random_state=50)

In [None]:
train = pd.concat([X_train,y_train], axis = 1)

In [None]:
validation = pd.concat([X_validation,y_validation], axis = 1)

In [None]:
test = test_raw

In [None]:
###############################

In [None]:
import keras
from keras.utils import Sequence

In [None]:
class NDSCSequence_train(Sequence):
    """Keras ``Sequence`` yielding ``(images, labels)`` batches from image file paths.

    Images are loaded lazily, one batch per ``__getitem__`` call: each file is
    read with ``cv2.imread``, resized to 80x80 and divided by 255.

    Args:
        x_set: Sliceable sequence of image file paths.
        y_set: Sliceable sequence of labels, positionally aligned with ``x_set``.
        batch_size: Maximum number of samples per batch; the final batch may
            be smaller.

    Raises:
        ValueError: If ``x_set`` and ``y_set`` do not have the same length.
    """

    def __init__(self, x_set, y_set, batch_size):
        # A length mismatch would otherwise only surface as misaligned or
        # ragged batches somewhere in the middle of training.
        if len(x_set) != len(y_set):
            raise ValueError(
                'x_set and y_set must have the same length, got %d and %d'
                % (len(x_set), len(y_set)))
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Load batch number ``idx``.

        Returns:
            Tuple ``(images, labels)`` of NumPy arrays: the resized, rescaled
            images of the batch and the matching slice of ``y_set``.

        Raises:
            FileNotFoundError: If an image of the batch cannot be read.
        """
        start = idx * self.batch_size
        stop = (idx + 1) * self.batch_size

        images = []
        for file_name in self.x[start:stop]:
            image = cv2.imread(file_name)
            # cv2.imread does not raise on a missing or undecodable file; it
            # returns None, which would make cv2.resize fail with an error
            # that does not name the offending path.
            if image is None:
                raise FileNotFoundError(
                    'Could not read image file: %r' % (file_name,))
            images.append(cv2.resize(image, (80, 80)) / 255.)

        return np.array(images), np.array(self.y[start:stop])

In [None]:
class NDSCSequence_test(Sequence):
    """Keras ``Sequence`` yielding image-only batches (no labels) from file paths.

    Images are loaded lazily, one batch per ``__getitem__`` call: each file is
    read with ``cv2.imread``, resized to 80x80 and divided by 255.

    Args:
        x_set: Sliceable sequence of image file paths.
        batch_size: Maximum number of images per batch; the final batch may
            be smaller.
    """

    def __init__(self, x_set, batch_size):
        self.x = x_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Load batch number ``idx``.

        Returns:
            NumPy array holding the resized, rescaled images of the batch.

        Raises:
            FileNotFoundError: If an image of the batch cannot be read.
        """
        start = idx * self.batch_size
        stop = (idx + 1) * self.batch_size

        images = []
        for file_name in self.x[start:stop]:
            image = cv2.imread(file_name)
            # cv2.imread does not raise on a missing or undecodable file; it
            # returns None, which would make cv2.resize fail with an error
            # that does not name the offending path.
            if image is None:
                raise FileNotFoundError(
                    'Could not read image file: %r' % (file_name,))
            images.append(cv2.resize(image, (80, 80)) / 255.)

        return np.array(images)

In [None]:
training_set = NDSCSequence_train(list(train['image_path']), list(train['Category']), 64)

In [None]:
validation_set = NDSCSequence_train(list(validation['image_path']), list(validation['Category']), 64)

In [None]:
test_set = NDSCSequence_test(list(test['image_path']), 400)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

In [None]:
# CNN over 80x80 3-channel inputs: three conv + max-pool stages
# (32 -> 64 -> 128 filters), then two dense layers and a 58-way softmax output.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(80, 80, 3)),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(58, activation='softmax'),
])

In [None]:
# Checkpointing: the whole model (not just weights) is written after every epoch,
# whether or not the monitored metric improved.
from keras.callbacks import ModelCheckpoint

# The file name embeds the epoch number and that epoch's validation accuracy.
filepath = 'image-{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    period=1,
)
callbacks_list = [checkpoint]

In [None]:
# Sparse categorical cross-entropy takes the labels as they are passed in
# (no one-hot encoding step); accuracy is tracked as the metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs, validating after each epoch and checkpointing through callbacks_list.
# Step counts are taken from the Sequence objects: len() is already the integer
# number of batches, so it cannot drift from the batch size the sequences were
# built with, and Keras receives an int instead of the float np.ceil returns.
history = model.fit_generator(
    training_set,
    steps_per_epoch=len(training_set),
    epochs=10,
    validation_data=validation_set,
    validation_steps=len(validation_set),
    callbacks=callbacks_list)

In [None]:
# model has best val_acc after 4 epochs

In [None]:
# Load the checkpoint of the epoch with the highest validation accuracy.
# The epoch is looked up in the training history instead of hard-coding a file
# name taken from one particular run (that run's best was 'image-04-0.46.hdf5'),
# so the cell keeps working when a re-run yields different accuracies.
val_acc_by_epoch = history.history['val_acc']
best_epoch = int(np.argmax(val_acc_by_epoch))  # zero-based; the first maximum wins on ties
# Rebuild the name the checkpoint callback wrote: it numbers epochs from 1 and
# fills the same `filepath` template with that epoch's val_acc.
best_checkpoint = filepath.format(epoch=best_epoch + 1, val_acc=val_acc_by_epoch[best_epoch])
model.load_weights(best_checkpoint)

In [None]:
# predict test data
pred_test=model.predict_generator(test_set,steps=np.ceil(len(test)/400.),verbose=1)

In [None]:
# predict validation data
pred_validation=model.predict_generator(validation_set,steps=np.ceil(len(validation)/64.),verbose=1)

In [None]:
# predict training data
pred_train=model.predict_generator(training_set,steps=np.ceil(len(train)/64.),verbose=1)

In [None]:
# Wrap each prediction array in a DataFrame and append '_image' to every
# column label so the columns are recognisable as image-model outputs.
image_result_train_df = pd.DataFrame(pred_train).rename(
    columns=lambda label: '{}_image'.format(label))
image_result_validation_df = pd.DataFrame(pred_validation).rename(
    columns=lambda label: '{}_image'.format(label))
image_result_test_df = pd.DataFrame(pred_test).rename(
    columns=lambda label: '{}_image'.format(label))

In [None]:
# Put the predicted probabilities next to each split's identifying columns and
# drop the raw inputs (title, image_path).
# NOTE(review): for train/validation, 'itemid' is the former row index of train_raw
# (exposed by reset_index and renamed), not a column read from the CSV, whereas the
# test frame keeps whatever columns test.csv provided -- confirm the ids are comparable.
image_train_probablity = (
    pd.concat([train.reset_index(), image_result_train_df], axis=1)
    .drop(columns=['title', 'image_path'])
    .rename(columns={'index': 'itemid'})
)
image_validation_probablity = (
    pd.concat([validation.reset_index(), image_result_validation_df], axis=1)
    .drop(columns=['title', 'image_path'])
    .rename(columns={'index': 'itemid'})
)
image_test_probablity = (
    pd.concat([test, image_result_test_df], axis=1)
    .drop(columns=['title', 'image_path'])
)

In [None]:
# Persist the three probability tables as CSV (without the DataFrame index) for later reuse.
for frame, csv_name in (
    (image_train_probablity, "image_train_probablity.csv"),
    (image_validation_probablity, "image_validation_probablity.csv"),
    (image_test_probablity, "image_test_probablity.csv"),
):
    frame.to_csv(csv_name, index=False)