In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import nltk
%matplotlib inline

In [None]:
# Load the raw training data; train.csv is resolved relative to the working directory.
train_csv_path = 'train.csv'
train_raw = pd.read_csv(train_csv_path)

In [None]:
# Load the raw test data; test.csv is resolved relative to the working directory.
test_csv_path = 'test.csv'
test_raw = pd.read_csv(test_csv_path)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Input columns handed to the split: the 'title' and 'image_path' columns only.
feature_columns = ['title', 'image_path']
X = train_raw[feature_columns]

In [None]:
# Target labels: the 'Category' column of the raw training data.
y = train_raw.loc[:, 'Category']

In [None]:
# Split the raw training data into train and validation sets (33% held out).
# Stratifying on y balances the categories across both sets, and the fixed
# random_state makes the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=50,
)

In [None]:
# Re-attach the labels to the training inputs, column-wise, aligned on the shared index.
train = pd.concat([X_train, y_train], axis='columns')

In [None]:
# Re-attach the labels to the validation inputs, column-wise, aligned on the shared index.
validation = pd.concat([X_validation, y_validation], axis='columns')

In [None]:
# `test` is a second name for the same DataFrame object as `test_raw` (no copy is
# made), so any in-place change to one is visible through the other.
test = test_raw

In [None]:
##########################

In [None]:
# English stopword list, used by normalize_document to filter tokens.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be installed
# locally (e.g. nltk.download('stopwords')) — confirm for fresh environments.
stop_words = nltk.corpus.stopwords.words('english')

# Tokenizer instance shared by every normalize_document call.
wpt = nltk.WordPunctTokenizer()

In [None]:
def normalize_document(doc):
    """Clean one text: keep letters and whitespace, lowercase, drop stopwords.

    Parameters
    ----------
    doc : str
        Raw text to normalize.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.

    Notes
    -----
    Uses the notebook-level ``wpt`` tokenizer and ``stop_words`` collection.
    """
    # Remove every character that is not an ASCII letter or whitespace.
    # The flags MUST be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing `re.I|re.A` positionally capped the number
    # of removals at 258 (the integer value of the flags) and applied no flags.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower()
    doc = doc.strip()
    # collapse runs of spaces into one
    doc = re.sub(' +', ' ', doc)
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

In [None]:
# Element-wise wrapper so normalize_document can be applied to a whole
# array/column of texts in a single call.
normalize_corpus = np.vectorize(pyfunc=normalize_document)

In [None]:
# Normalize every title of the training split.
train_titles = train['title']
normal_corpus_train = normalize_corpus(train_titles)

In [None]:
# Normalize every title of the validation split.
validation_titles = validation['title']
normal_corpus_validation = normalize_corpus(validation_titles)

In [None]:
# Normalize every title of the test data.
test_titles = test['title']
normal_corpus_test = normalize_corpus(test_titles)

In [None]:
from keras.preprocessing import text, sequence

In [None]:
# Vocabulary cap for the tokenizer; the same value is used as the model's input width.
max_words = 20000

# Word-level (not character-level) tokenizer limited to max_words entries.
tokenize = text.Tokenizer(
    char_level=False,
    num_words=max_words,
)

In [None]:
# Fit the tokenizer on the normalized training titles only; the validation and
# test corpora are not passed in here.
tokenize.fit_on_texts(normal_corpus_train)

In [None]:
from keras.utils import Sequence

In [None]:
class NDSCSequence_train(Sequence):
    """Keras ``Sequence`` that serves ``(features, labels)`` batches.

    Texts are turned into a matrix per batch, on access, using the
    notebook-level ``tokenize`` object's ``texts_to_matrix``.
    """

    def __init__(self, x_set, y_set, batch_size):
        """Keep the texts, their labels and the number of samples per batch."""
        self.x = x_set
        self.y = y_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        n_batches = np.ceil(len(self.x) / float(self.batch_size))
        return int(n_batches)

    def __getitem__(self, idx):
        """Return batch ``idx`` as a ``(matrix, labels)`` pair of numpy arrays."""
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        texts = self.x[window]
        labels = self.y[window]

        features = tokenize.texts_to_matrix(texts)
        return np.array(features), np.array(labels)

In [None]:
class NDSCSequence_test(Sequence):
    """Keras ``Sequence`` that serves feature-only batches (no labels).

    Texts are turned into a matrix per batch, on access, using the
    notebook-level ``tokenize`` object's ``texts_to_matrix``.
    """

    def __init__(self, x_set, batch_size):
        """Keep the texts and the number of samples per batch."""
        self.x = x_set
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a trailing partial batch."""
        n_batches = np.ceil(len(self.x) / float(self.batch_size))
        return int(n_batches)

    def __getitem__(self, idx):
        """Return the feature matrix for batch ``idx`` as a numpy array."""
        window = slice(idx * self.batch_size, (idx + 1) * self.batch_size)
        texts = self.x[window]

        features = tokenize.texts_to_matrix(texts)
        return np.array(features)

In [None]:
# Batches of 64 normalized training titles paired with their 'Category' labels.
train_labels = list(train['Category'])
training_set = NDSCSequence_train(
    x_set=normal_corpus_train,
    y_set=train_labels,
    batch_size=64,
)

In [None]:
# Batches of 64 normalized validation titles paired with their 'Category' labels.
validation_labels = list(validation['Category'])
validation_set = NDSCSequence_train(
    x_set=normal_corpus_validation,
    y_set=validation_labels,
    batch_size=64,
)

In [None]:
# Label-free batches of 400 normalized test titles, used for prediction only.
test_set = NDSCSequence_test(x_set=normal_corpus_test, batch_size=400)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [None]:
# Feed-forward classifier over the max_words-wide text matrix:
# one 512-unit ReLU hidden layer, 50% dropout, and a 58-way softmax output.
text_model = Sequential([
    Dense(512, input_shape=(max_words,), activation='relu'),
    Dropout(0.5),
    Dense(58, activation='softmax'),
])

In [None]:
# The sparse categorical cross-entropy loss takes the labels as integer class
# ids, so they are passed through as-is (no one-hot encoding step).
text_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Checkpointing: write the full model (not just weights) after every epoch.
from keras.callbacks import ModelCheckpoint

# The file name embeds the epoch number and the validation accuracy.
# NOTE(review): the 'val_acc' key is assumed to match the metric name reported
# by the installed Keras version (some versions may use 'val_accuracy') — confirm.
filepath = 'text-{epoch:02d}-{val_acc:.2f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    period=1,
)
callbacks_list = [checkpoint]

In [None]:
# Train for 20 epochs, validating each epoch and checkpointing through callbacks_list.
# The step counts come from the Sequence objects themselves: len() returns the
# integer number of batches, instead of a float from np.ceil computed with a
# second hard-coded copy of the batch size that could drift from the one the
# sequences were built with.
history = text_model.fit_generator(
    training_set,
    steps_per_epoch=len(training_set),
    epochs=20,
    validation_data=validation_set,
    validation_steps=len(validation_set),
    callbacks=callbacks_list
    )

In [None]:
######################

In [None]:
# text model has best val_acc after 6 epochs

In [None]:
# Restore the weights of the checkpoint chosen as best.
# NOTE(review): the name follows the 'text-{epoch:02d}-{val_acc:.2f}.hdf5'
# template, so it encodes epoch 06 and a val_acc of 0.74 from one particular
# run; a re-run that scores differently at epoch 6 writes a different file name
# and this path must be updated by hand.
best_checkpoint_path = 'text-06-0.74.hdf5'
text_model.load_weights(best_checkpoint_path)

In [None]:
# Predict class probabilities for the test data.
# len(test_set) is the integer batch count of the sequence, so the step count
# stays in sync with the batch size the sequence was built with (no float
# np.ceil and no repeated literal).
pred_test = text_model.predict_generator(test_set, steps=len(test_set), verbose=1)

In [None]:
# Predict class probabilities for the validation data.
# len(validation_set) is the integer batch count of the sequence, so the step
# count stays in sync with the batch size the sequence was built with (no float
# np.ceil and no repeated literal).
pred_validation = text_model.predict_generator(validation_set, steps=len(validation_set), verbose=1)

In [None]:
# Predict class probabilities for the training data.
# len(training_set) is the integer batch count of the sequence, so the step
# count stays in sync with the batch size the sequence was built with (no float
# np.ceil and no repeated literal).
pred_train = text_model.predict_generator(training_set, steps=len(training_set), verbose=1)

In [None]:
# Wrap each prediction matrix in a DataFrame and append '_text' to every
# column label, so the columns become '0_text', '1_text', ...
column_suffix = '_text'
text_result_train_df = pd.DataFrame(pred_train).add_suffix(column_suffix)
text_result_validation_df = pd.DataFrame(pred_validation).add_suffix(column_suffix)
text_result_test_df = pd.DataFrame(pred_test).add_suffix(column_suffix)

In [None]:
# Merge the predicted probabilities with an item identifier and the category,
# dropping the raw input columns.
# NOTE(review): for train/validation the 'itemid' column is the former row
# index of train_raw (read without index_col, and X keeps only 'title' and
# 'image_path'), while the test frame keeps whatever other columns test.csv
# provides. Confirm these identifiers are meant to be comparable.
input_columns = ['title', 'image_path']

text_train_probablity = (
    pd.concat([train.reset_index(), text_result_train_df], axis=1)
    .drop(input_columns, axis=1)
    .rename(columns={'index': 'itemid'})
)
text_validation_probablity = (
    pd.concat([validation.reset_index(), text_result_validation_df], axis=1)
    .drop(input_columns, axis=1)
    .rename(columns={'index': 'itemid'})
)
text_test_probablity = (
    pd.concat([test, text_result_test_df], axis=1)
    .drop(input_columns, axis=1)
)

In [None]:
# Persist the three probability tables as CSV (without the index) for later use.
for probability_frame, csv_name in (
    (text_train_probablity, "text_train_probablity.csv"),
    (text_validation_probablity, "text_validation_probablity.csv"),
    (text_test_probablity, "text_test_probablity.csv"),
):
    probability_frame.to_csv(csv_name, index=False)