In [1]:
# Keras 
# Classifier: LSTM 
# Classification type: multi-class (398 classes in the current data set)
# Output nodes: #of classes with softmax

# word2vec model (the one loaded further below):
# word2vect_vec64_win3__dict_sample_5000

In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
import pickle
import time

from keras.datasets import reuters
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.wrappers import TimeDistributed
from keras.models import load_model, Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer, one_hot, text_to_word_sequence
from keras.preprocessing import sequence
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializations
from keras import regularizers
from keras import constraints

from keras.utils.visualize_util import plot
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os    
os.environ['THEANO_FLAGS'] = "device=gpu" 
import theano
#theano.config.device = 'gpu0'
theano.config.floatX = 'float32'


# import custom code
import os
import sys
pardir = os.path.abspath(os.path.join(os.getcwd(), '../'))
script_path = pardir + "/2_common_aux_script"
print('Importing process_string.py \nfrom ' + script_path + " ...\n")
sys.path.append(script_path)
from process_string import process_string
sys.path.remove(script_path)


print(sys.version)

Using Theano backend.


Importing process_string.py 
from /Users/altay.amanbay/Desktop/new node booster/experiments/3a.1 - Nets train/4 train model - keras/2_common_aux_script ...

3.5.2 |Anaconda 4.2.0 (x86_64)| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]


In [2]:
# Functions

def NGramGenerator_wordwise_interval(phrase, min_ngram, max_ngram):
    """Return the word-level n-grams of ``phrase`` for min_ngram <= n <= max_ngram.

    ``phrase`` is passed through ``process_string`` and split on whitespace.
    N-grams are produced longest first (n counts down from ``max_ngram`` to
    ``min_ngram``); within one size they follow their position in the phrase.
    Every n-gram is returned as a string of its words joined by one space.
    """
    tokens = process_string(phrase).split()

    return [
        ' '.join(tokens[start:start + size])
        for size in range(max_ngram, min_ngram - 1, -1)
        for start in range(len(tokens) - size + 1)
    ]

def get_word2index(texts_ls_):
    """Build a ``token -> integer id`` vocabulary from a list of text strings.

    Texts are lower-cased and split on whitespace. Ids start at 1 and are
    handed out in order of first appearance.
    """
    word2index_ = {}

    for text_str in texts_ls_:
        for token in text_str.lower().split():
            # len + 1 is the next unused id; setdefault leaves known tokens untouched.
            word2index_.setdefault(token, len(word2index_) + 1)

    return word2index_

def train_df_preprocess(top_words_, texts_ls_, max_pad_length_):
    """Turn a list of text strings into a padded matrix of word indexes.

    Args:
        top_words_: vocabulary size limit handed to the Keras ``Tokenizer``
            (``None`` for no limit).
        texts_ls_: list of text strings; the tokenizer is fitted on these
            texts and then used to encode them.
        max_pad_length_: length every sequence is padded/truncated to.

    Returns:
        The result of ``sequence.pad_sequences``: one row per text, each of
        length ``max_pad_length_``.
    """
    # The previous version also built a rank-ordered ``words`` list here. It
    # iterated over the *global* ``top_words`` instead of ``top_words_``
    # (failing when that global is None or undefined) and the list was never
    # used, so the loop has been removed.
    tok = Tokenizer(top_words_)
    tok.fit_on_texts(texts_ls_)

    # Each text becomes a list of word indexes (the word of rank i in the
    # data set, starting at 1, gets index i).
    texts_vec_ls = tok.texts_to_sequences(texts_ls_)

    # With the default arguments pad_sequences fills with 0's at the FRONT of
    # each sequence, which matches the padded rows printed later in this notebook.
    texts_vec_mtx = sequence.pad_sequences(texts_vec_ls, maxlen=max_pad_length_)

    return texts_vec_mtx

def text_2_vec(text_str, word2index_):
    """Encode one text string as a list of word indexes.

    The text is lower-cased and split on whitespace; every token is replaced
    by its entry in ``word2index_``, or by 0 when the token is not in it.
    """
    return [word2index_.get(token, 0) for token in text_str.lower().split()]

def train_df_preprocess_2(texts_ls_, word2index_, max_pad_length_):
    """Encode a list of text strings with ``text_2_vec`` and pad the result.

    Args:
        texts_ls_: list of text strings.
        word2index_: token -> index mapping used for the encoding.
        max_pad_length_: ``maxlen`` handed to ``sequence.pad_sequences``.

    Returns:
        Whatever ``sequence.pad_sequences`` returns for the encoded texts.
    """
    encoded_texts = [text_2_vec(text_, word2index_) for text_ in texts_ls_]
    return sequence.pad_sequences(encoded_texts, maxlen=max_pad_length_)

def texts_to_sequences_custom(texts_ls, word_index_):
    """Encode every text as a list of word indexes taken from ``word_index_``.

    Each text is tokenised with ``NGramGenerator_wordwise_interval(text, 1, 1)``
    (unigrams only). Tokens missing from ``word_index_`` are encoded as 0.

    Returns:
        A list with one list of integer indexes per input text.
    """
    return [
        [word_index_.get(token, 0)
         for token in NGramGenerator_wordwise_interval(text, 1, 1)]
        for text in texts_ls
    ]


def get_model_file_aux(model_file_aux_name):
    """Load and return the pickled object stored in ``model_file_aux_name``."""
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(model_file_aux_name, 'rb') as aux_file:
        return pickle.load(aux_file)

In [3]:
# Load the sampled descriptionary CSV and bring it into the column layout
# used by the rest of the notebook.

path = pardir+'/1_data/'
file_name = 'sampled_descriptionary_sample_size_5000.csv'

samples_df = (
    pd.read_csv(path + file_name)
    # Rename columns to the *_mod1 names used downstream
    .rename(columns={'description': 'description_mod1',
                     'category_id': 'category_id_mod1',
                     'category_path': 'category_full_path_mod1'})
    # Drop the 'index' column stored in the file
    .drop(labels=['index'], axis=1)
)

# Optional filter, currently disabled: drop 'screwdrivers' from descriptionary
#samples_df = samples_df.loc[samples_df.category_id_mod1 != 927,:]

print('samples data shape:',samples_df.shape)
samples_df.head()

samples data shape: (956776, 3)


Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1
0,!iT Jeans Maternity Skinny Jeans Dark Wash M,100,Apparel & Accessories > Apparel > Maternity
1,1822 Denim 'Butter' Maternity Skinny Jeans Rin...,100,Apparel & Accessories > Apparel > Maternity
2,25 J Brand Maternity Skinny Jean nirvana blue,100,Apparel & Accessories > Apparel > Maternity
3,26 J Brand Maternity Skinny Jean nirvana blue,100,Apparel & Accessories > Apparel > Maternity
4,26 James Jeans Maternity Skinny External Mater...,100,Apparel & Accessories > Apparel > Maternity


In [4]:
# The sampled descriptionary is used as the whole training set
# (concatenation with an original train set is disabled):
#train_df = pd.concat([train_df, samples_df], axis=0)
#train_df.reset_index(drop=True, inplace=True)
train_df = samples_df   # same object, not a copy: the edits below also change samples_df

# Run every description through process_string
train_df['description_mod1'] = train_df['description_mod1'].apply(process_string)

# Deduplicate on the processed description. keep=False removes EVERY row whose
# description occurs more than once - no copy of a duplicated text is kept.
train_df.drop_duplicates(subset=['description_mod1'], inplace = True, keep=False)
print('train data shape (deduplicate):',train_df.shape)

# Encode the category path as an integer target
le = LabelEncoder()
train_df['target_le'] = le.fit_transform(train_df['category_full_path_mod1'])


train_df.head(2)

train data shape (deduplicate): (938810, 3)


Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,target_le
0,it jeans maternity skinny jeans dark wash m,100,Apparel & Accessories > Apparel > Maternity,27
1,1822 denim butter maternity skinny jeans rinse...,100,Apparel & Accessories > Apparel > Maternity,27


In [5]:
# Build the model inputs and targets from the full frame.
# A train/test split (train_test_split with test_size = 0.3) was used earlier
# and is currently disabled, so everything below is training data.
descriptions = train_df['description_mod1']
targets = train_df['target_le']

X_ls = np.array(list(descriptions))               # texts
y_ary = np.array(list(targets))                   # integer class ids
y_ary_cat = np_utils.to_categorical(targets)      # one-hot class matrix

print(len(X_ls))
print(y_ary_cat.shape)

938810
(938810, 398)


In [6]:
# Convert train set into sequences for nets

top_words = None               # no vocabulary size limit for the Tokenizer
max_description_length = 30    # every description is padded/truncated to this many tokens

# The Keras Tokenizer is only used to build the word index ...
tok = Tokenizer(nb_words = top_words)
tok.fit_on_texts(X_ls)
word_index = tok.word_index
print('word_index size:',len(word_index))

# ... the texts themselves are encoded with the custom function, which maps
# tokens missing from word_index to 0.
#train_texts_vec_ls = tok.texts_to_sequences(X_train_ls)
train_texts_vec_ls = texts_to_sequences_custom(X_ls, word_index)
train_texts_vec_mtx = sequence.pad_sequences(train_texts_vec_ls, maxlen = max_description_length)

print('train_texts_vec_mtx shape:',train_texts_vec_mtx.shape)
# This preview used to be a bare expression in the middle of the cell, so its
# value was silently discarded; print it to actually show it.
print('word_index sample:', list(word_index)[0:5])

# Release the large intermediates that are no longer needed
X_ls = None
y_ary = None
tok = None
train_texts_vec_ls = None

word_index size: 234950
train_texts_vec_mtx shape: (938810, 30)


In [8]:
# Convert test set into sequences for nets

#test_texts_vec_ls = tok.texts_to_sequences(X_test_ls)
#test_texts_vec_ls = texts_to_sequences_custom(X_test_ls, word_index)
#test_texts_vec_mtx = sequence.pad_sequences(test_texts_vec_ls, maxlen = max_description_length)

In [7]:
# Sanity check: show one encoded, padded description and the vocabulary size
sample_idx = 100
print(train_texts_vec_mtx[sample_idx])
len(word_index)

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0 6780 4241  251   68   34   58   49   58   49  267]


234950

In [8]:
## Create word embeddings from trained Word2Vec model
from gensim.models import word2vec, Phrases

# Load model
file_path_1 = pardir+"/3_model_word2vec_vec64_win3__dict_sample_5000/word2vect_vec64_win3__dict_sample_5000"
model = word2vec.Word2Vec.load(file_path_1)

#print(model.vocab.keys())
#sys.exit()

# Copy the word vectors from the model into a plain dictionary
word2vec_dict={}
for word in model.vocab.keys():
    try:
        word2vec_dict[word]=model[word]
    except KeyError:
        # Best effort: skip a word the model cannot return a vector for.
        # (Previously a bare `except` that also hid every other error.)
        continue
print('Loaded %s word vectors.' % len(word2vec_dict))

# Take the vector length from a loaded vector instead of relying on the loop
# variable `word` leaking out of the loop above (undefined for an empty vocabulary).
if not word2vec_dict:
    raise ValueError('No word vectors could be loaded from ' + file_path_1)
embedding_vecor_length = len(next(iter(word2vec_dict.values())))
print('embedding_vecor_length:',embedding_vecor_length)


# Row i holds the vector of the word with index i in word_index. Row 0 is never
# assigned (word indexes come from word_index, +1 row is added for index 0), and
# words without a word2vec vector keep their all-zero row.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_vecor_length))
for word, i in word_index.items():
    embedding_vector = word2vec_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('\nembedding matrix shape:',embedding_matrix.shape)
print(embedding_matrix[0]) # first row should be all zeros
print(embedding_matrix[1])

Loaded 235349 word vectors.
embedding_vecor_length: 64

embedding matrix shape: (234951, 64)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.17534356 -0.08578434  0.18103811  0.02477756 -0.07118656 -0.15512228
  0.05136703  0.2165056  -0.06552318  0.09003545 -0.20524196  0.00697994
 -0.16218811  0.07659905 -0.02378268  0.04618275  0.09628467  0.00470943
  0.11890937 -0.09564091  0.30388096  0.08125744 -0.0361764   0.0483168
  0.19096969 -0.18906899 -0.04920458  0.010562   -0.17351735  0.16046672
 -0.06544918 -0.00549091  0.01579626 -0.04615073  0.20403807  0.23351906
 -0.1007614  -0.01959764 -0.10367423  0.03201051 -0.01045111  0.30996674
 -0.05559701 -0.0629313  -0.09992196  0.01259949  0.08974306  0.12655872
 -0.07601544 -0.09347545 -0.11799838 -0.06275873  0.16728345 -0

In [9]:
# Holder for the best result so far, together with everything needed to
# encode new texts for that model and decode its predictions.
best_model_aux = {
    'Max length': max_description_length,
    'Best score': 0,
    'texts_to_sequences': texts_to_sequences_custom,
    'word_index': word_index,
    'Label encoder': le,
}


best_model = None

In [12]:
# Optional
# Load previous model (if needs to be compared in the following training)
#best_model = load_model('category_927_nets_1000_model.h5')
#best_model_aux = get_model_file_aux('category_927_nets_1000_model_aux.pkl')

In [10]:
# Distinct category paths; the training cells use len(nb_classes) as the
# number of output nodes. (Despite its name, nb_classes is the array of
# classes, not their count.)
nb_classes = train_df['category_full_path_mod1'].unique()
print('Classes count:', len(nb_classes))

# Release the data frames
train_df = samples_df = None

Classes count: 398


### Model 1

In [11]:
# TRAIN MODEL 1
# Layers: frozen word2vec embedding -> LSTM(128) -> softmax over the classes
start = time.time()

# define/initialize model
top_words = len(word_index) + 1   # +1 for index 0 (padding / unknown tokens)
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=False)
model.add(LSTM_1)

## Dense 1 (defined but currently not added to the model)
## ======================================================================================
Dense_1 = Dense(128,activation='sigmoid')
#model.add(Dense_1)

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() only added a stray "None".
model.summary()

# Train one epoch at a time and stop at the first epoch that does not beat the
# best score recorded so far in best_model_aux (shared across the model cells).
# `best_model = model` only stores a reference to the model object, which the
# next fit() call keeps modifying. The weights of the best epoch are therefore
# snapshotted and restored after the loop; otherwise the "best" model would end
# up holding the weights of the worse epoch that triggered the break.
best_weights = None
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    # Evaluated on the training data itself - there is no held-out set here.
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_weights = model.get_weights()
        best_model = model
        print('Captured improved model')
        print('Train accuracy: ',best_model_aux['Best score'])
    else:
        break
    print()

if best_weights is not None:
    model.set_weights(best_weights)

print("Training took %g s" % (time.time() - start))

# Recorded results of an earlier run (accuracy measured on the training data):
# Accuracy: loss: 0.2879 - acc: 0.9090
# Valid accuracy:  0.915107423227

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 30, 64)        15036864    embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 128)           98816       embedding_1[0][0]                
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 398)           51342       lstm_1[0][0]                     
Total params: 15,187,022
Trainable params: 150,158
Non-trainable params: 15,036,864
____________________________________________________________________________________________________
None
Epoch iter #1
Epoch 1/1
Captured improved model
Valid accuracy:  0.792852653892

Epoch iter #2
Epoch 1/

In [13]:
best_model_aux['Best score']

0.87678443987614019

### Model 2

In [20]:
# FUNCTION FOR TRAIN MODEL 2

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializations

class AttLayer(Layer):
    """Attention pooling over axis 1 of a 3D input.

    One score per step is computed as ``exp(tanh(x . W))`` with a single
    trainable vector ``W`` whose length is the size of the last input axis.
    The scores are normalised over axis 1 and used to form a weighted sum of
    the input over that axis, so the output shape is
    ``(input_shape[0], input_shape[-1])``.
    """

    def __init__(self, **kwargs):
        self.init = initializations.get('normal')
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        feature_dim = input_shape[-1]
        self.W = self.init((feature_dim,))
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        # Unnormalised attention score per step
        scores = K.exp(K.tanh(K.dot(x, self.W)))

        # Normalise over axis 1 so the scores of each sample sum to 1
        attention = scores / K.sum(scores, axis=1, keepdims=True)

        # Weighted sum of the input over axis 1
        return K.sum(x * K.expand_dims(attention), axis=1)

    def get_output_shape_for(self, input_shape):
        batch_dim, feature_dim = input_shape[0], input_shape[-1]
        return (batch_dim, feature_dim)

In [20]:
# TRAIN MODEL 2
# Attention-based
# Layers: frozen word2vec embedding -> LSTM(128, sequences) -> TimeDistributed(Dense(200))
#         -> AttLayer -> softmax over the classes
start = time.time()

# define/initialize model
top_words = len(word_index) + 1   # +1 for index 0 (padding / unknown tokens)
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=True)
model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
model.add(AttLayer())

## Dense 1 (defined but currently not added to the model)
## ======================================================================================
Dense_1 = Dense(128,activation='sigmoid')
#model.add(Dense_1)

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() only added a stray "None".
model.summary()

# Train one epoch at a time and stop at the first epoch that does not beat the
# best score recorded so far in best_model_aux (shared across the model cells).
# `best_model = model` only stores a reference to the model object, which the
# next fit() call keeps modifying. The weights of the best epoch are therefore
# snapshotted and restored after the loop; otherwise the "best" model would end
# up holding the weights of the worse epoch that triggered the break.
best_weights = None
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    # Evaluated on the training data itself - there is no held-out set here.
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_weights = model.get_weights()
        best_model = model
        print('Captured improved model')
        print('Train accuracy: ',best_model_aux['Best score'])
    else:
        break
    print()

if best_weights is not None:
    model.set_weights(best_weights)

print("Training took %g s" % (time.time() - start))

# Recorded result of an earlier run:
# Accuracy: loss: 0.2722 - acc: 0.9140

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 30, 64)        15036864    embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 30, 128)       98816       embedding_1[0][0]                
____________________________________________________________________________________________________
timedistributed_1 (TimeDistribut (None, 30, 200)       25800       lstm_1[0][0]                     
____________________________________________________________________________________________________
attlayer_1 (AttLayer)            (None, 200)           200         timedistributed_1[0][0]          
___________________________________________________________________________________________

In [23]:
best_model_aux['Best score']

0.91873115965956897

In [None]:
# Plot Nets design
#from keras.utils import plot_model
import matplotlib.pyplot as plt
from keras.utils.visualize_util import model_to_dot

# Write the diagram to the current user's Desktop rather than to one
# developer's hard-coded home directory.
plot(model, to_file=os.path.join(os.path.expanduser('~'), 'Desktop', 'model.png'))
SVG(model_to_dot(model).create(prog='dot', format='svg'))


### Model 3

In [19]:
# FUNCTION FOR TRAIN MODEL 3

class AttentionWithContext(Layer):
    """
        Attention over the steps of temporal data, using a learned context
        (query) vector ``u``. Supports masking.

        Based on Yang et al., "Hierarchical Attention Networks for Document
        Classification" [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf].

        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.

        # Arguments
            W_regularizer, u_regularizer, b_regularizer: regularizers for the
                weights ``W``, ``u`` and ``b``.
            W_constraint, u_constraint, b_constraint: constraints for the
                weights ``W``, ``u`` and ``b``.
            bias: whether the bias ``b`` is created and added.

        Place it on top of an RNN layer (GRU/LSTM/SimpleRNN) that has
        return_sequences=True; the weight shapes are taken from the last axis
        of that layer's output shape.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(AttentionWithContext())
        """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.bias = bias
        self.init = initializations.get('glorot_uniform')

        # Resolve regularizers and constraints per weight
        self.W_regularizer = regularizers.get(W_regularizer)
        self.W_constraint = constraints.get(W_constraint)

        self.u_regularizer = regularizers.get(u_regularizer)
        self.u_constraint = constraints.get(u_constraint)

        self.b_regularizer = regularizers.get(b_regularizer)
        self.b_constraint = constraints.get(b_constraint)

        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        feature_dim = input_shape[-1]

        # NOTE(review): the creation order W, b, u is kept as in the original;
        # it presumably fixes the order of the layer's weight list - confirm
        # before reordering.
        self.W = self.add_weight((feature_dim, feature_dim,),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((feature_dim,),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight((feature_dim,),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # The mask is consumed here and not handed on to the following layers.
        return None

    def call(self, x, mask=None):
        # Hidden representation of every step: tanh(x . W [+ b])
        hidden = K.dot(x, self.W)
        if self.bias:
            hidden = hidden + self.b
        hidden = K.tanh(hidden)

        # Unnormalised attention: exp of the similarity with the context vector u
        a = K.exp(K.dot(hidden, self.u))

        # Zero out masked steps after the exp; the normalisation below rescales the rest.
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a = a * K.cast(mask, K.floatx())

        # Early in training the sum can be almost zero, which would give NaN's,
        # so a small positive epsilon is added to the denominator.
        a = a / K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Attention-weighted sum of the input over the steps axis
        return K.sum(x * K.expand_dims(a), axis=1)

    def get_output_shape_for(self, input_shape):
        return input_shape[0], input_shape[-1]

In [30]:
# TRAIN MODEL 3
# Attention-based in name only: the AttentionWithContext layer is currently disabled.
# Layers: frozen word2vec embedding -> TimeDistributed(Dense(200)) -> LSTM(128)
#         -> softmax over the classes
start = time.time()

# define/initialize model
top_words = len(word_index) + 1   # +1 for index 0 (padding / unknown tokens)
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1 (defined but currently not added to the model)
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=True)
# model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
#model.add(AttentionWithContext())

LSTM_2 = LSTM(128,return_sequences=False)
model.add(LSTM_2)

## Dense 1 (defined but currently not added to the model)
## ======================================================================================
Dense_1 = Dense(128,activation='sigmoid')
#model.add(Dense_1)

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() only added a stray "None".
model.summary()

# Train one epoch at a time and stop at the first epoch that does not beat the
# best score recorded so far in best_model_aux (shared across the model cells).
# `best_model = model` only stores a reference to the model object, which the
# next fit() call keeps modifying. The weights of the best epoch are therefore
# snapshotted and restored after the loop; otherwise the "best" model would end
# up holding the weights of the worse epoch that triggered the break.
best_weights = None
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    # Evaluated on the training data itself - there is no held-out set here.
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_weights = model.get_weights()
        best_model = model
        print('Captured improved model')
        print('Train accuracy: ',best_model_aux['Best score'])
    else:
        break
    print()

if best_weights is not None:
    model.set_weights(best_weights)

print("Training took %g s" % (time.time() - start))

# Recorded results of an earlier run (accuracy measured on the training data).
# These two lines were not commented out and would be a SyntaxError on re-run.
# Accuracy: loss: 0.2677 - acc: 0.9152
# Valid accuracy:  0.923365750258

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_8 (Embedding)          (None, 30, 64)        15036864    embedding_input_8[0][0]          
____________________________________________________________________________________________________
timedistributed_8 (TimeDistribut (None, 30, 200)       13000       embedding_8[0][0]                
____________________________________________________________________________________________________
lstm_11 (LSTM)                   (None, 128)           168448      timedistributed_8[0][0]          
____________________________________________________________________________________________________
dense_20 (Dense)                 (None, 398)           51342       lstm_11[0][0]                    
Total params: 15,269,654
Trainable params: 232,790
Non-trainable params: 15,036,864
_______

In [20]:
best_model_aux['Best score']

0

In [None]:
# Plot Nets design
#from keras.utils import plot_model
import matplotlib.pyplot as plt
from keras.utils.visualize_util import model_to_dot

# Write the diagram to the current user's Desktop rather than to one
# developer's hard-coded home directory.
plot(model, to_file=os.path.join(os.path.expanduser('~'), 'Desktop', 'model.png'))
SVG(model_to_dot(model).create(prog='dot', format='svg'))


### Model 4

In [None]:
# TRAIN MODEL 4
# Layers: frozen word2vec embedding -> LSTM(128, sequences, activation='softmax')
#         -> TimeDistributed(Dense(200)) -> LSTM(128) -> softmax over the classes
start = time.time()

# define/initialize model
top_words = len(word_index) + 1   # +1 for index 0 (padding / unknown tokens)
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=True, activation='softmax')
model.add(LSTM_1)

## Attention 1 (the attention layer itself is currently disabled)
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
#model.add(AttentionWithContext())

LSTM_2 = LSTM(128,return_sequences=False)
model.add(LSTM_2)

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() only added a stray "None".
model.summary()

# Train one epoch at a time and stop at the first epoch that does not beat the
# best score recorded so far in best_model_aux (shared across the model cells).
# `best_model = model` only stores a reference to the model object, which the
# next fit() call keeps modifying. The weights of the best epoch are therefore
# snapshotted and restored after the loop; otherwise the "best" model would end
# up holding the weights of the worse epoch that triggered the break.
best_weights = None
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    # Evaluated on the training data itself - there is no held-out set here.
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_weights = model.get_weights()
        best_model = model
        print('Captured improved model')
        print('Train accuracy: ',best_model_aux['Best score'])
    else:
        break
    print()

if best_weights is not None:
    model.set_weights(best_weights)

print("Training took %g s" % (time.time() - start))

# Recorded result of an earlier run:
# Fails with acc: 0.0636

### Model 5

In [24]:
# TRAIN MODEL 5
# Layers: frozen word2vec embedding -> TimeDistributed(Dense(200)) -> LSTM(128, sequences)
#         -> AttentionWithContext -> softmax over the classes
start = time.time()

# define/initialize model
top_words = len(word_index) + 1   # +1 for index 0 (padding / unknown tokens)
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1 (defined but currently not added to the model)
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=True)
#model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
#model.add(AttLayer())
#model.add(AttentionWithContext())

LSTM_2 = LSTM(128,return_sequences=True)
model.add(LSTM_2)
model.add(AttentionWithContext())
## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() only added a stray "None".
model.summary()

# Train one epoch at a time and stop at the first epoch that does not beat the
# best score recorded so far in best_model_aux (shared across the model cells).
# `best_model = model` only stores a reference to the model object, which the
# next fit() call keeps modifying. The weights of the best epoch are therefore
# snapshotted and restored after the loop; otherwise the "best" model would end
# up holding the weights of the worse epoch that triggered the break.
best_weights = None
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    # Evaluated on the training data itself - there is no held-out set here.
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_weights = model.get_weights()
        best_model = model
        print('Captured improved model')
        print('Train accuracy: ',best_model_aux['Best score'])
    else:
        break
    print()

if best_weights is not None:
    model.set_weights(best_weights)

print("Training took %g s" % (time.time() - start))

# Recorded note from an earlier run:
# Fails with acc: 0.0636

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_8 (Embedding)          (None, 30, 64)        15036864    embedding_input_8[0][0]          
____________________________________________________________________________________________________
timedistributed_7 (TimeDistribut (None, 30, 200)       13000       embedding_8[0][0]                
____________________________________________________________________________________________________
lstm_15 (LSTM)                   (None, 30, 128)       168448      timedistributed_7[0][0]          
____________________________________________________________________________________________________
attentionwithcontext_3 (Attentio (None, 128)           16640       lstm_15[0][0]                    
___________________________________________________________________________________________

KeyboardInterrupt: 

### Save model

In [16]:
# Save model and aux file
# Writes two files into the current working directory: the Keras model (.h5)
# and the pickled auxiliary dictionary best_model_aux (.pkl).

best_model.save('nets_category_all_unigrams__traindata5000_vectrain5000_model.h5')

best_model_aux_name = 'nets_category_all_unigrams__traindata5000_vectrain5000_aux.pkl'
# Binary mode is required for pickle; the highest available protocol is used.
with open(best_model_aux_name, 'wb') as pickle_file:
    pickle.dump(best_model_aux, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Evaluation (optional)

In [25]:
# Final evaluation of the model
# Evaluates `model` on the train and test matrices and prints scores[1] as a
# percentage, then prints the elapsed wall-clock time.
# NOTE(review): the training cells fit on y_ary_cat while this cell evaluates
# against y_train_ary — confirm both hold the same training labels.
start = time.time()

scores = model.evaluate(train_texts_vec_mtx, y_train_ary, verbose=0)
print("Accuracy on train set: %.2f%%" % (scores[1]*100))
scores = model.evaluate(test_texts_vec_mtx, y_test_ary, verbose=0)
print("Accuracy on test set: %.2f%%" % (scores[1]*100))

print("\nEvaluation took %g s" % (time.time() - start))

Accuracy on train set: 86.66%
Accuracy on test set: 83.86%

Evaluation took 390.131 s


In [15]:
# Get predictions
# Run the network over the test matrix once and derive the class ids from the
# resulting probability matrix, instead of paying for a second full forward
# pass through predict_classes().
start = time.time()

predictions_probs = model.predict(test_texts_vec_mtx, verbose=1)
# Same rule Keras' Sequential.predict_classes applies: arg-max over the class
# axis for multi-unit outputs, 0.5 threshold for a single output unit.
if predictions_probs.shape[-1] > 1:
    predictions = predictions_probs.argmax(axis=-1)
else:
    predictions = (predictions_probs > 0.5).astype('int32')
#predictions_rnd = np.round_(predictions, decimals=0, out=None)

print('%-20s' % "predictions[0]",':', predictions[0])
#print('%-20s' % "predictions_rnd[0]:",':',predictions_rnd[0])
#print('%-20s' % "predictions_probs[0]",':', predictions_probs[0])
print("\nPrediction took %g s" % (time.time() - start))


Prediction took 242.941 s


In [75]:
# Manual check which nodes are not distinguishable for nets
# For each encoded class id: print its decoded label, then the test
# descriptions whose true class equals that id.
for class_ in (392, 390, 389):
    print(le.inverse_transform(class_))
    print(X_test_ls[np_utils.categorical_probas_to_classes(y_test_ary)==class_],'\n')


Tools & Home Improvement > Lighting, Light Bulbs & Ceiling Fans > Other
[ 'progress lighting p299281 archiethree light bath vanity antique nickel finish'
 'bulbrite 60a15f 60watt incandescent a15 appliance bulb frost'
 'greenhouse indooroutdoor chandelier rasped iron finish' ...,
 'ge advantage fluorescent'
 'feit ctcdm500led 60w equivalent candelabra base torpedo tip chandelier led light soft white'
 'alena 97 arched floor lamp by house of hampton'] 

Tools & Home Improvement > Lighting, Light Bulbs & Ceiling Fans > Light Fixtures & Lamps
['cortina nightstand right door'
 'camino vintage candelabra twotier chandelier 72'
 'golden lighting 2501ba3' ..., 'aspect white 23 open unit'
 'harpwell 7light oilrubbed bronze chandelier'
 'connie 2light antique black flush mount'] 

Tools & Home Improvement > Lighting, Light Bulbs & Ceiling Fans > Light Bulbs
[ '3 pack led light bulbs lohas b35 7w soft white 3000k e12 candelabra bulb equivalent to 6065 watt incandescent'
 'br30 led bulbsluminwiz 

In [16]:
# Confusion table: true class ids (rows) against predicted class ids (columns),
# with row/column totals added by margins=True.
# Earlier variant kept for reference:
#pd.crosstab(pd.Series(y_test_ary.ravel()), pd.Series(predictions_rnd.ravel()), rownames=['True'], colnames=['Predicted'], margins=True)
pd.crosstab(pd.Series(np_utils.categorical_probas_to_classes(y_test_ary)), pd.Series(predictions), rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,1,2,3,4,5,6,9,10,11,12,...,389,390,392,393,394,395,396,397,403,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,120,1,22,9,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,158
2,1,18,1,0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,25
3,27,0,1226,189,70,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1540
4,6,0,190,1146,60,0,1,2,7,0,...,0,0,0,0,0,0,0,0,0,1451
5,4,1,146,311,1001,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1493
6,0,0,0,0,0,9,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
9,3,0,0,1,0,0,321,31,3,0,...,0,0,0,0,0,0,0,0,0,367
10,0,0,4,2,0,0,42,1291,146,0,...,0,0,0,0,0,0,0,0,0,1514
11,0,0,1,0,0,0,1,113,496,0,...,0,0,0,0,0,0,0,0,0,622
12,0,0,0,0,0,0,0,0,0,134,...,0,0,0,0,0,0,0,0,0,139


In [58]:
# Same confusion table as above, but with `predictions` flattened via ravel() first.
pd.crosstab(pd.Series(np_utils.categorical_probas_to_classes(y_test_ary)), pd.Series(predictions.ravel()), rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,1.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,78363,103,78466
1,102,1493,1595
All,78465,1596,80061


### Testing

In [6]:
#model.get_config()
def prediction_to_str(clf_prediction, category_id):
    """Turn a classifier score into a readable label.

    Returns ``str(category_id)`` when ``clf_prediction`` is strictly greater
    than 0.5, otherwise the same text prefixed with ``'not '``.
    """
    label = str(category_id)
    return label if clf_prediction > 0.5 else 'not ' + label

def predict(description_str, tok_, clf_, max_length_, category_id_):
    """Return the class prediction of ``clf_`` for a single description.

    Parameters
    ----------
    description_str : str
        Raw item description; it is lower-cased before vectorisation.
    tok_ : object
        Object exposing a ``word_index`` mapping used to turn words into ids.
    clf_ : object
        Classifier exposing ``predict_classes(x, verbose=...)``.
    max_length_ : int
        Length the id sequence is padded/truncated to.
    category_id_ : object
        Unused here; kept so the signature matches ``predict_2``.

    Returns
    -------
    Whatever ``clf_.predict_classes`` returns for the one-row input.
    """
    #seq_ = tok_.texts_to_sequences([description_str])
    seq_ = texts_to_sequences_custom([description_str.lower()], tok_.word_index)
    seq_pad = sequence.pad_sequences(seq_, maxlen = max_length_)

    # Query the classifier that was passed in rather than a notebook-level
    # global, so the function works for any model handed to it.
    clf_prediction = clf_.predict_classes(seq_pad, verbose=0)

    return clf_prediction

def predict_2(description_str, tok_, clf_, max_length_, category_id_):
    """Classify one description and report it as ``'<id>'`` or ``'not <id>'``.

    Parameters
    ----------
    description_str : str
        Raw item description; it is lower-cased before vectorisation.
    tok_ : object
        Object exposing a ``word_index`` mapping used to turn words into ids.
    clf_ : object
        Classifier exposing ``predict_classes(x, verbose=...)``.
    max_length_ : int
        Length the id sequence is padded/truncated to.
    category_id_ : object
        Category identifier echoed in the returned string.

    Returns
    -------
    str
        ``str(category_id_)`` when the decoded label is ``'Positive'``,
        otherwise ``'not ' + str(category_id_)``.

    Notes
    -----
    Decoding relies on the notebook-level label encoder ``le``.
    """
    #seq_ = tok_.texts_to_sequences([description_str])
    seq_ = texts_to_sequences_custom([description_str.lower()], tok_.word_index)
    seq_pad = sequence.pad_sequences(seq_, maxlen = max_length_)

    # Use the classifier argument (not a notebook global) and keep the call
    # quiet, matching predict() and predict_proba().
    clf_prediction = clf_.predict_classes(seq_pad, verbose=0)

    # Map the encoded class id back to its label; exactly one row was predicted.
    clf_prediction = le.inverse_transform(clf_prediction)

    if clf_prediction[0] == 'Positive':
        return str(category_id_)
    else:
        return 'not ' + str(category_id_)
    
    
def predict_proba(description_str, tok_, clf_, max_length_):
    """Return the first probability ``clf_`` outputs for a single description.

    Parameters
    ----------
    description_str : str
        Raw item description; it is lower-cased before vectorisation so the
        input is prepared the same way as in ``predict`` / ``predict_2``.
    tok_ : object
        Object exposing a ``word_index`` mapping used to turn words into ids.
    clf_ : object
        Classifier exposing ``predict_proba(x, verbose=...)``.
    max_length_ : int
        Length the id sequence is padded/truncated to.

    Returns
    -------
    Element ``[0][0]`` of the probability output, i.e. the first output unit
    for the single input row.
    """
    #seq_ = tok_.texts_to_sequences([description_str])
    seq_ = texts_to_sequences_custom([description_str.lower()], tok_.word_index)
    seq_pad = sequence.pad_sequences(seq_, maxlen = max_length_)
    clf_prediction_proba = clf_.predict_proba(seq_pad, verbose=0)
    
    return clf_prediction_proba[0][0]


# id_ = 'table Setr'
# p = predict(id_, best_model_aux['Tokenizer'], best_model, best_model_aux['Max length'], best_model_aux['Category ID'])
# pp = predict_proba(id_, best_model_aux['Tokenizer'], best_model, best_model_aux['Max length'])
# print(p)
# print(pp)

In [21]:
# Pull the word index and label encoder from the in-memory best_model_aux dict.
# The disabled lines below are the alternative that loads a saved model and its
# aux file from disk instead.
# model_file = "nets_category_all_unigrams__traindata5000_vectrain5000_model.h5"
# aux_file = "nets_category_all_unigrams__traindata5000_vectrain5000_aux.pkl"
# old_best_model_ = load_model(model_file)
# old_best_model_aux_ = get_model_file_aux(aux_file)
# old_tok_ = old_best_model_aux_['Tokenizer']
#old_word_index_ = old_best_model_aux_['Tokenizer'].word_index
old_word_index_ = best_model_aux['word_index']
le = best_model_aux['Label encoder']


# item_d = "tekton 2780 10slot screwdriver holder and organizer"
# print('Old model prediction:')
# print('item:',item_d)
# print('Seq max len:', old_best_model_aux_['Max length'])
# print(predict(item_d, old_tok_, old_best_model_, old_best_model_aux_['Max length'], '927'))


In [24]:
# Vectorise every description in train_df and predict a class id for each row.
X_ls = np.array(list(train_df['description_mod1']))
seq_ = texts_to_sequences_custom(X_ls, old_word_index_)
# NOTE(review): the length 30 is hard-coded here, while the TEST cell reads it
# from the aux dict's 'Max length' entry — confirm the two agree.
seq_pad = sequence.pad_sequences(seq_, maxlen = 30)
# NOTE(review): the cell above has the load of old_best_model_ disabled; this
# name is only assigned in the later TEST cell — confirm which model is used.
predictions = old_best_model_.predict_classes(seq_pad)



In [25]:
# Attach the encoded predictions and their decoded labels to the training frame.
train_df['Predictions_le'] = list(predictions)
# Decode the whole column in one call: LabelEncoder.inverse_transform is
# array-oriented, so this avoids one Python-level call (and one scalar input)
# per row.
train_df['Predictions'] = le.inverse_transform(train_df['Predictions_le'].values)
train_df.head()

Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,target_le,Predictions_le,Predictions
0,it jeans maternity skinny jeans dark wash m,100,Apparel & Accessories > Apparel > Maternity,27,27,Apparel & Accessories > Apparel > Maternity
1,1822 denim butter maternity skinny jeans rinse...,100,Apparel & Accessories > Apparel > Maternity,27,27,Apparel & Accessories > Apparel > Maternity
2,25 j brand maternity skinny jean nirvana blue,100,Apparel & Accessories > Apparel > Maternity,27,27,Apparel & Accessories > Apparel > Maternity
3,26 j brand maternity skinny jean nirvana blue,100,Apparel & Accessories > Apparel > Maternity,27,27,Apparel & Accessories > Apparel > Maternity
4,26 james jeans maternity skinny external mater...,100,Apparel & Accessories > Apparel > Maternity,27,27,Apparel & Accessories > Apparel > Maternity


In [26]:
# Write the frame (including the prediction columns added above) to CSV in the
# current working directory, without the index column.
train_df.to_csv('truth_and_predictions.csv',index=False)

In [103]:
# TEST
# Side-by-side check of a previously saved model and the model trained in this
# notebook on a short list of hand-picked item descriptions.

## load old model
model_file = "category_927_nets__traindata5000_vectrain5000_model.h5"
aux_file = "category_927_nets__traindata5000_vectrain5000_aux.pkl"
old_best_model_ = load_model(model_file)
# get_model_file_aux is defined outside this section of the notebook.
old_best_model_aux_ = get_model_file_aux(aux_file)
old_tok_ = old_best_model_aux_['Tokenizer']
old_word_index_ = old_best_model_aux_['Tokenizer'].word_index

## use fresh model
best_model_ = best_model
best_model_aux_ = best_model_aux
tok_ = tok
word_index_ = word_index

# This value is overwritten by the loop below before it is ever used.
item_d = 'NieR: Automata™ DEMO 120161128 (Playable Demo)'

# screwdrivers check
scrw_items = [
"tekton 2655 flare nut wrench set metric 6piece"
,"tekton 2780 10slot screwdriver holder and organizer"
,"titan 17237 insulated electrical screwdriver set  7 piece"
,"tool sorter screwdriver organizer red"
,"torin sdh15rt magnetic screwdriver holder"  #wrong predict
,"wera 05020013001 joker combination wrenchset 11 pieces"
,"wera kk vde 60i62i68i18 insulated pouch set with interchangeable blades 18piece" # tricky
,"wiha 28103 magnetic 14 bit holder stubby 57mm pliers screwdriver" # tricky, wrong predict
]

# For every item print the old model's class and probability, then the fresh
# model's class, separated by a ruler line carrying the item index.
for n, i in enumerate(scrw_items):
    item_d = i
    
    print(str(n) + ' ' + '='*100)
    
    print('Old model prediction:')
    print('item:',item_d)
    print('Seq max len:', old_best_model_aux_['Max length'])
    print(predict(item_d, old_tok_, old_best_model_, old_best_model_aux_['Max length'], '927'))
    print(predict_proba(item_d, old_tok_, old_best_model_, old_best_model_aux_['Max length']))


    print('\nFresh model prediction:')
    print('item:',item_d)
    print('Seq max len:', best_model_aux_['Max length'])
    # NOTE(review): predict_2 decodes through the notebook-level `le`; confirm
    # it is the encoder that belongs to the fresh model.
    print(predict_2(item_d, tok_, best_model_, best_model_aux_['Max length'], '927'))
    #print(predict_proba(item_d, tok_, best_model_, best_model_aux_['Max length']))

    print()

    #tt = train_df.loc[0:10,['description_mod1']]
    #tt['pred'] = tt['description_mod1'].apply(lambda x: predict(x, best_model_aux_['Tokenizer'], best_model, best_model_aux_['Max length'], best_model_aux_['Category ID']))
    #tt['prob'] = tt['description_mod1'].apply(lambda x: predict_proba(x, best_model_aux_['Tokenizer'], best_model, best_model_aux_['Max length']))

Old model prediction:
item: tekton 2655 flare nut wrench set metric 6piece
Seq max len: 30
not 927
4.17323e-08

Fresh model prediction:
item: tekton 2655 flare nut wrench set metric 6piece
Seq max len: 30
not 927

Old model prediction:
item: tekton 2780 10slot screwdriver holder and organizer
Seq max len: 30
not 927
0.121584

Fresh model prediction:
item: tekton 2780 10slot screwdriver holder and organizer
Seq max len: 30
927

Old model prediction:
item: titan 17237 insulated electrical screwdriver set  7 piece
Seq max len: 30
927
0.998989

Fresh model prediction:
item: titan 17237 insulated electrical screwdriver set  7 piece
Seq max len: 30
927

Old model prediction:
item: tool sorter screwdriver organizer red
Seq max len: 30
not 927
0.405264

Fresh model prediction:
item: tool sorter screwdriver organizer red
Seq max len: 30
927

Old model prediction:
item: torin sdh15rt magnetic screwdriver holder
Seq max len: 30
927
0.994613

Fresh model prediction:
item: torin sdh15rt magnetic sc