In [1]:
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.layers import Flatten
from keras.layers import MaxPooling1D
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.corpus import stopwords

Using TensorFlow backend.


In [2]:
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 100

In [3]:
category_index = {"clothing":0, "camera":1, "home":2}
category_reverse_index = dict((y,x) for (x,y) in category_index.items())
STOPWORDS = set(stopwords.words("english"))

In [4]:
print('Indexing word vectors.')

embeddings_index = {}
GLOVE_DIR_DATA = 'glove.6B.100d.txt'

file = open(GLOVE_DIR_DATA, encoding="utf8")

for line in file:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [5]:
clothing = pd.read_csv('dataset/clothing.tsv', sep='\t')
cameras = pd.read_csv('dataset/cameras.tsv', sep='\t')
home = pd.read_csv('dataset/home.tsv', sep='\t')

dataset = [clothing, cameras, home]

print("Make sure there are no null values in the datasets")
for data in dataset:
    print('Has null values: ', data.isnull().values.any())

Make sure there are no null values in the datasets
Has null values:  False
Has null values:  False
Has null values:  False


In [6]:
def preprocess(text):
    text= text.strip().lower().split()
    text = filter(lambda word: word not in STOPWORDS, text)
    return " ".join(text)


for data in dataset:
    data['title'] = data['title'].apply(preprocess)

In [7]:
all_texts = clothing['title'] + ' ' + cameras['title'] + ' ' + home['title']
all_texts = all_texts.drop_duplicates(keep=False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(all_texts)

clothing_sequences = tokenizer.texts_to_sequences(clothing['title'])
electronics_sequences = tokenizer.texts_to_sequences(cameras['title'])
home_appliances_sequences = tokenizer.texts_to_sequences(home['title'])

clothing_data = pad_sequences(clothing_sequences, maxlen=MAX_SEQUENCE_LENGTH)
electronics_data = pad_sequences(electronics_sequences, maxlen=MAX_SEQUENCE_LENGTH)
home_appliances_data = pad_sequences(home_appliances_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [8]:
word_index = tokenizer.word_index
test_string = "sports action spy pen camera"
print("word\t\tid")
print("-" * 20)
for word in test_string.split():
    print("%s\t\t%s" % (word, word_index[word]))

word		id
--------------------
sports		16
action		13
spy		7
pen		57
camera		2


In [9]:
test_sequence = tokenizer.texts_to_sequences(["sports action camera", "spy pen camera"])
padded_sequence = pad_sequences(test_sequence, maxlen=MAX_SEQUENCE_LENGTH)
print("Text to Vector", test_sequence)
print("Padded Vector", padded_sequence)

Text to Vector [[16, 13, 2], [7, 57, 2]]
Padded Vector [[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0 16 13  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  7 57  2]]


In [10]:
print("clothing: \t\t", to_categorical(category_index["clothing"], 3))
print("camera: \t\t", to_categorical(category_index["camera"], 3))
print("home: \t\t\t", to_categorical(category_index["home"], 3))

clothing: 		 [ 1.  0.  0.]
camera: 		 [ 0.  1.  0.]
home: 			 [ 0.  0.  1.]


In [11]:
print("clothing shape: ", clothing_data.shape)
print("electronics shape: ", electronics_data.shape)
print("home appliances shape: ", home.shape)

data = np.vstack((clothing_data, electronics_data, home_appliances_data))
category = pd.concat([clothing['category'], cameras['category'], home['category']]).values
category = to_categorical(category)
print("-"*10)
print("combined data shape: ", data.shape)
print("combined category/label shape: ", category.shape)

clothing shape:  (392721, 100)
electronics shape:  (1347, 100)
home appliances shape:  (11425, 2)
----------
combined data shape:  (405493, 100)
combined category/label shape:  (405493, 3)


In [12]:
VALIDATION_SPLIT = 0.3
indices = np.arange(data.shape[0]) # get sequence of row index
np.random.shuffle(indices) # shuffle the row indexes
data = data[indices] # shuffle data/product-titles/x-axis
category = category[indices] # shuffle labels/category/y-axis
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = category[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = category[-nb_validation_samples:]

In [13]:
print('Preparing embedding matrix.')
from keras.layers import Embedding
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# prepare embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

Preparing embedding matrix.
Found 2625 unique tokens.
Null word embeddings: 973


In [14]:
# from keras.models import Sequential
# from keras.layers import Conv1D, GlobalMaxPooling1D, Flatten
# from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation


# model = Sequential()
# model.add(embedding_layer)
# model.add(Dropout(0.2))
# model.add(Conv1D(300, 3, padding='valid',activation='relu',strides=2))
# model.add(Conv1D(150, 3, padding='valid',activation='relu',strides=2))
# model.add(Conv1D(75, 3, padding='valid',activation='relu',strides=2))
# model.add(Flatten())
# model.add(Dropout(0.2))
# model.add(Dense(150,activation='sigmoid'))
# model.add(Dropout(0.2))
# model.add(Dense(3,activation='sigmoid'))

# model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])

# model.summary()

In [15]:
# model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128)
# score = model.evaluate(x_val, y_val, verbose=0)
# print('Test loss:', score[0])
# print('Test accuracy:', score[1])

In [16]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Flatten
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation

model_1 = Sequential()
model_1.add(embedding_layer)
model_1.add(Conv1D(250,3,padding='valid',activation='relu',strides=1))
model_1.add(GlobalMaxPooling1D())
model_1.add(Dense(250))
model_1.add(Dropout(0.2))
model_1.add(Activation('relu'))
model_1.add(Dense(3))
model_1.add(Activation('sigmoid'))
model_1.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          262600    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 250)           75250     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 753       
__________

In [17]:
from keras.callbacks import LearningRateScheduler, EarlyStopping
from keras.callbacks import ModelCheckpoint

#Training the model and saving the best one!!
batch_size = 128
annealer = LearningRateScheduler(lambda x: 1e-3 * 0.9 ** x)
earlystop = EarlyStopping(patience=5)
modelsave = ModelCheckpoint(
    filepath='product_model_1.h5', save_best_only=True, verbose=0)
model_1.fit(
    x_train, y_train, batch_size=batch_size,
    epochs=2, 
    validation_data=(x_val, y_val),
    callbacks=[annealer, earlystop, modelsave]
)

score = model_1.evaluate(x_val, y_val, verbose=0)
print('Test loss: \t', score[0])
print('Test Accuracy: \t', score[1])

Train on 283846 samples, validate on 121647 samples
Epoch 1/2
Epoch 2/2
Test loss: 	 0.00203745497136
Test Accuracy: 	 0.999819148849


In [18]:
# def model_1

from keras.models import load_model

model_1 = load_model('product_model_1.h5')


In [19]:
example_product = "Nikon Coolpix A10 Point and Shoot Camera (Black)"
example_product = preprocess(example_product)
example_sequence = tokenizer.texts_to_sequences([example_product])
example_padded_sequence = pad_sequences(example_sequence, maxlen=MAX_SEQUENCE_LENGTH)

print("-"*10)
print("Predicted category: ", category_reverse_index[model_1.predict_classes(example_padded_sequence, verbose=0)[0]])
print("-"*10)
probabilities = model_1.predict(example_padded_sequence, verbose=0)
probabilities = probabilities[0]
print("Clothing Probability: ",probabilities[category_index["clothing"]] )
print("Camera Probability: ",probabilities[category_index["camera"]] )
print("home probability: ",probabilities[category_index["home"]] )

----------
Predicted category:  camera
----------
Clothing Probability:  1.9271e-15
Camera Probability:  0.999904
home probability:  3.82342e-08
