In [1]:
# Keras 
# Classifier: LSTM 
# Classification type: multi-class (404 classes)
# Output nodes: #of classes with softmax

# word2vec model: 
# word2vect_class_specififc__vec64_win1__dict_sample_5000

In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd
import pickle
import time

from keras.datasets import reuters
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.wrappers import TimeDistributed
from keras.models import load_model, Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Flatten, Lambda
from keras.layers.recurrent import SimpleRNN
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D
from keras.layers.noise import GaussianNoise
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer, one_hot, text_to_word_sequence
from keras.preprocessing import sequence
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializations
from keras import regularizers
from keras import constraints
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping


from keras.utils.visualize_util import plot
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os    
os.environ['THEANO_FLAGS'] = "device=gpu0" 
import theano
#theano.config.device = 'gpu0'
theano.config.floatX = 'float32'


# import custom code
import os
import sys
pardir = os.path.abspath(os.path.join(os.getcwd(), '../'))
script_path = pardir + "/2_common_aux_script"
print('Importing process_string.py \nfrom ' + script_path + " ...\n")
sys.path.append(script_path)
from process_string import process_string
sys.path.remove(script_path)


print(sys.version)

Using Theano backend.


Importing process_string.py 
from /Users/altay.amanbay/Desktop/new node booster/experiments/3a.1 - Nets train/5 train model - keras (unigrams)/2_common_aux_script ...

3.5.2 |Anaconda 4.2.0 (x86_64)| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]


In [3]:
import keras
print(theano.__version__)

0.8.2


In [3]:
# Functions

def NGramGenerator_wordwise_interval(phrase, min_ngram, max_ngram):
    all_ngram_lists = []

    printable_ = 'abcdefghijklmnopqrstuvwxyz0123456789 '
    s_split = "".join((char if char in printable_ else "") for char in phrase).split()
    #phrase_processed = process_string(phrase)
    #s_split = phrase_processed.split()
    
    for n in range(max_ngram, min_ngram - 1, -1):
        n_gram = [s_split[i:i+n] for i in range(len(s_split)-n+1)]
        all_ngram_lists.extend(n_gram)
        
    all_ngrams = []
    for n_gram in all_ngram_lists:
        all_ngrams.extend([' '.join(n_gram)])
    
    return all_ngrams

def get_word2index(texts_ls_):
    word2index_ = {}

    c = 1
    for text_str in texts_ls_:
        text_tokens_ls = text_str.lower().split()
        for token in text_tokens_ls:
            if(token not in word2index_):
                word2index_[token] = c
                c = c + 1
                
    return word2index_

def train_df_preprocess(top_words_, texts_ls_, max_pad_length_):
    # texts_ls_: list of texts strings
    
    tok = Tokenizer(top_words_)
    tok.fit_on_texts(texts_ls_)

    words = []
    for iter in range(top_words):
        words += [key for key,value in tok.word_index.items() if value==iter+1]

    #Class for vectorizing texts, or/and turning texts into sequences 
    #(=list of word indexes, where the word of rank i in the dataset (starting at 1) has index i).
    texts_vec_ls = tok.texts_to_sequences(texts_ls_)#turns text to sequence, stating which word comes in what place
    texts_vec_mtx = sequence.pad_sequences(texts_vec_ls, maxlen=max_pad_length_)#pad sequence, essentially padding it with 0's at the end
    
    return texts_vec_mtx

def text_2_vec(text_str, word2index_):
    # text_str: text string
    
    text_tokens_ls = text_str.lower().split()
    
    text_vec = []
    for token in text_tokens_ls:
        if token in word2index_:
            text_vec.append(word2index_[token])
        else:
            text_vec.append(0)
            
    return text_vec

def train_df_preprocess_2(texts_ls_, word2index_, max_pad_length_):
    # texts_ls_: list of texts strings
    
    texts_vec_ls = []
    for text_ in texts_ls_:
        #print(text_)
        #print(type(text_))
        text_vec = text_2_vec(text_, word2index_)
        texts_vec_ls.append(text_vec)
    
    texts_vec_ary = sequence.pad_sequences(texts_vec_ls, maxlen=max_pad_length_)
    
    return texts_vec_ary

def texts_to_sequences_custom(texts_ls, word_index_, new_words_to_zero = False):
    texts_seq = []
    
    for text in texts_ls:
        #text_split = text.lower().split()
        text_split = NGramGenerator_wordwise_interval(text,1,1)
        seq = []
        for token in text_split:
            if(token in word_index_):
                seq.append(word_index_[token])
            elif(new_words_to_zero):
                seq.append(0)
                
        texts_seq.append(seq)
#         for k,v in word_index_.items():
#             if(v == 395):
#                 print(k,v)
    return texts_seq


def get_model_file_aux(model_file_aux_name):
    with open(model_file_aux_name, 'rb') as pickle_file:
        model_file_aux = pickle.load(pickle_file)
    return model_file_aux

def clone_tokens(text_str, cloning_factor = 1):
    token_clones_ls = []
    for token in text_str.split():
        token_clones_ls.extend([token]*cloning_factor)
    return ' '.join(token_clones_ls)

In [5]:
# Read sampled descriptionary

path = pardir+'/1_data/'
file_name = 'sampled_descriptionary_sample_size_5000.csv'
file_name = 'scorecards_for_fasttext.csv'
samples_df = pd.read_csv(path + file_name)

# Rename columns
samples_df.rename(columns={
                           #'description': 'description_mod1', 
                           '\ufeff"description"': 'description_mod1',
                           'category_id': 'category_id_mod1',
                           'category_path': 'category_full_path_mod1'}, inplace=True)

# Drop rows with NaN in any column
samples_df.dropna()

# Process description_mod1 strings by process_string function
samples_df['description_mod1'] = samples_df['description_mod1'].apply(lambda x: process_string(x))

# Drop rows where token count less than 1 in description_mod1 column
selected_indices = samples_df['description_mod1'].apply(lambda x: len(str(x).split()) > 1)
samples_df = samples_df[selected_indices]

# Drop duplicates
samples_df.drop_duplicates(subset=['description_mod1','category_full_path_mod1'], inplace = True, keep='first')
samples_df.drop_duplicates(subset=['description_mod1'], inplace = True, keep=False)
samples_df.reset_index(drop=True, inplace=True) 

# Clone tokens
samples_df['description_cloned'] = samples_df['description_mod1'].apply(lambda x: clone_tokens(x, cloning_factor=5))

# Drop 'screwdrivers' from descriptionary
#samples_df = samples_df.loc[samples_df.category_id_mod1 != 927,:]

# Drop index column
#samples_df.drop(labels=['index'], axis=1, inplace=True)

print('samples data shape:',samples_df.shape)
samples_df.head()

samples data shape: (587983, 4)


Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,description_cloned
0,aveeno baby wash shampoo lightly scented 8 ou...,206,Baby Products > Bathing & Skin Care > Soaps & ...,aveeno aveeno aveeno aveeno aveeno baby baby b...
1,earths best chlorine free diapers size 4 120 c...,213,Baby Products > Diapering > Disposable Diapers,earths earths earths earths earths best best b...
2,doa5lr costume catalog lr10 addon content,320,Electronics & Accessories > Video Games > Other,doa5lr doa5lr doa5lr doa5lr doa5lr costume cos...
3,lightweight oxford l in slim and white,152,Apparel & Accessories > Apparel > Tops & Tees ...,lightweight lightweight lightweight lightweigh...
4,now foods nutritional yeast flakes 10ounce,525,Health & Beauty > Vitamins & Dietary Supplements,now now now now now foods foods foods foods fo...


### Tweak unbiased data (drop classes)

In [6]:
## Check for class count

stats_sr = samples_df["category_full_path_mod1"].value_counts()
stats_sr[0:10]

Photos & Paper Products                                  50400
Books                                                    33416
Other                                                    29343
Health & Beauty > Makeup                                 24738
Online Services                                          22120
Health & Beauty > Vitamins & Dietary Supplements         20874
Jewelry & Watches > Earrings > Women                     18535
Apparel & Accessories > Apparel > Tops & Tees > Women    13440
Pet Supplies > Dogs & Cats > Dog Food & Treats           11791
Grocery & Gourmet Food > Restaurant & Takeout            10333
Name: category_full_path_mod1, dtype: int64

In [7]:
# Delete certain categories

samples_df = samples_df.loc[samples_df['category_full_path_mod1'].str.contains("Other")==False,:]
samples_df = samples_df.loc[samples_df.category_full_path_mod1 != "Books",:]
#samples_df = samples_df.loc[samples_df.category_full_path_mod1 != "Photos & Paper Products",:]

class_1 = 'Pet Supplies > Dogs & Cats > Dog Food & Treats'
class_1 = 'Photos & Paper Products'
class_2 = 'Apparel & Accessories > Apparel > Tops & Tees > Women'
class_2 = 'Apparel & Accessories > Accessories > Luggage, Backpacks & Laptop Bags'
classes_selected = class_1+'|'+class_2
#samples_df = samples_df.loc[samples_df['category_full_path_mod1'].str.contains(classes_selected)==True,:]
print('Pruned samples data shape:',samples_df.shape)

Pruned samples data shape: (484432, 4)


### Tweak unbiased data (equalize class numbers)

In [7]:
## Map class counts

samples_df['count'] = samples_df['category_full_path_mod1'].map(stats_sr)
samples_df.head()

Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,description_cloned,count
0,aveeno baby wash shampoo lightly scented 8 ou...,206,Baby Products > Bathing & Skin Care > Soaps & ...,aveeno aveeno aveeno aveeno aveeno baby baby b...,297
1,earths best chlorine free diapers size 4 120 c...,213,Baby Products > Diapering > Disposable Diapers,earths earths earths earths earths best best b...,2456
3,lightweight oxford l in slim and white,152,Apparel & Accessories > Apparel > Tops & Tees ...,lightweight lightweight lightweight lightweigh...,13440
4,now foods nutritional yeast flakes 10ounce,525,Health & Beauty > Vitamins & Dietary Supplements,now now now now now foods foods foods foods fo...,20874
5,navitas naturals organic goji berries 8 ounce ...,407,Grocery & Gourmet Food > Snack Foods > Dried F...,navitas navitas navitas navitas navitas natura...,182


In [8]:
## samples_df is unbiased by classes (category_full_path_mod1)
## fix by picking N samples from each class

#N=samples_df.category_full_path_mod1.value_counts(normalize=True).iloc[0] * samples_df.shape[0]
samples_df = samples_df.sample(frac=1).groupby('category_full_path_mod1', sort=False).head(1000)
print('samples data shape after picking max N samples from each class:',samples_df.shape)

samples data shape after picking max N samples from each class: (203321, 5)


### Concat sample_df into train_df

In [8]:
## Concat original train set and sampled descriptionary

#train_df = pd.concat([train_df, samples_df], axis=0)
train_df = samples_df
#train_df.reset_index(drop=True, inplace=True)

# description into chars
#train_df['description_mod1'] = train_df['description_mod1'].apply(lambda x: process_string(x))

# deduplicate
#train_df.drop_duplicates(subset=['description_mod1'], inplace = True, keep=False)
#print('train data shape (deduplicate):',train_df.shape)
    
# Encode target feature
le = LabelEncoder()
le.fit(train_df['category_full_path_mod1'])
train_df['target_le'] = le.transform(train_df['category_full_path_mod1'])


train_df.head(2)

Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,description_cloned,target_le
0,aveeno baby wash shampoo lightly scented 8 ou...,206,Baby Products > Bathing & Skin Care > Soaps & ...,aveeno aveeno aveeno aveeno aveeno baby baby b...,169
1,earths best chlorine free diapers size 4 120 c...,213,Baby Products > Diapering > Disposable Diapers,earths earths earths earths earths best best b...,175


In [8]:
string_w_max_tokens_1 = train_df['description_mod1'].map(lambda x: len(str(x).split())).max()
string_w_max_tokens_2 = train_df['description_cloned'].map(lambda x: len(str(x).split())).max()
print('string_w_max_tokens_1:',string_w_max_tokens_1)
print('string_w_max_tokens_2:',string_w_max_tokens_2)

string_w_max_tokens_1: 81
string_w_max_tokens_2: 405


### Create input matrix for HIERARCHICAL LSTM

In [8]:
## Create input matrix for HIERARCHICAL LSTM

top_words = None
tok = Tokenizer(nb_words = top_words)
tok.fit_on_texts(train_df.description_mod1)
word_index = tok.word_index
print('word_index size:',len(word_index))

ngram_len = 1
max_description_length = 30-ngram_len+1

train_texts_vec_mtx_ = np.zeros((len(train_df), max_description_length, ngram_len), dtype='int32')
for i, sentence_str in enumerate(train_df.description_mod1):
    ngrams_ls = NGramGenerator_wordwise_interval(sentence_str, ngram_len, ngram_len)
    for j, ngram_str in enumerate(ngrams_ls):
        if j< max_description_length:
            tokens_ls = NGramGenerator_wordwise_interval(ngram_str,1,1)
            k=0
            for token in tokens_ls:
                if k<ngram_len:
                    train_texts_vec_mtx_[i,j,k] = word_index[token]
                    k=k+1  
                    
print('train_texts_vec_mtx_ shape:', train_texts_vec_mtx_.shape)

y_ary = np.array(list(train_df['target_le']))
y_ary_cat = np_utils.to_categorical(train_df['target_le'])
print('y_ary_cat shape:',y_ary_cat.shape)

word_index size: 141483
train_texts_vec_mtx_ shape: (484432, 30, 1)
y_ary_cat shape: (484432, 861)


### Create input matrix for normal LSTM

In [9]:
# Create input matrix for normal LSTM

# Split into train and test
X_ls = np.array(list(train_df['description_mod1']))
#X_ls = np.array(list(train_df['description_cloned']))
y_ary = np.array(list(train_df['target_le']))
y_ary_cat = np_utils.to_categorical(train_df['target_le'])

#X_train_ls, X_test_ls, y_train_ary, y_test_ary = train_test_split(X_ls, y_ary, test_size = 0.3)
#X_train_ls, X_test_ls, y_train_ary, y_test_ary = train_test_split(X_ls, y_ary_cat, test_size = 0.3)
print(len(X_ls))
print(y_ary_cat.shape)


top_words = None
max_description_length = 30 #string_w_max_tokens_1

tok = Tokenizer(nb_words = top_words)
tok.fit_on_texts(X_ls)
word_index = tok.word_index
print('word_index size:',len(word_index))

#train_texts_vec_ls = tok.texts_to_sequences(X_train_ls)
train_texts_vec_ls = texts_to_sequences_custom(X_ls, word_index, new_words_to_zero = False)
train_texts_vec_mtx = sequence.pad_sequences(train_texts_vec_ls, maxlen = max_description_length)

print('train_texts_vec_mtx shape:',train_texts_vec_mtx.shape)
list(word_index)[0:5]

# Delete objects
X_ls = None
y_ary = None
tok = None
train_texts_vec_ls = None

484432
(484432, 861)
word_index size: 141483
train_texts_vec_mtx shape: (484432, 30)


In [12]:
# test check
i = 100
print(train_texts_vec_mtx[i])
len(word_index)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0  44 147 886 107]


141483

### Create random embeddings

In [13]:
# Create RANDOM embedding vectors for each word in word index (lower cell code preferable)
# fix random seed for reproducibility
np.random.seed(12345) #8
embeddings_source = 'random'

embedding_vecor_length = 32
uniq_token_count = len(word_index)
print('word index size:', uniq_token_count)

is_random_embeddings = True
embedding_matrix = np.zeros((uniq_token_count + 1, embedding_vecor_length))
if(is_random_embeddings == True):
    for word, i in word_index.items():
        #embedding_vector = np.random.uniform(.1, size=(1, embedding_vecor_length))
        embedding_vector = np.random.uniform(-0.5, 0.5, embedding_vecor_length)
        embedding_matrix[i] = embedding_vector
else:
    c = 0
    for word, i in word_index.items():
        embedding_vector = np.random.uniform(c, c+100, size=(1, embedding_vecor_length))
        embedding_matrix[i] = embedding_vector
        c=c+6000
    scaler = int('1'+'0'*len(str(c)))
    embedding_matrix=embedding_matrix/scaler

        
print('embedding matrix shape:',embedding_matrix.shape)
print(embedding_matrix[0:2])


word index size: 141483
embedding matrix shape: (141484, 32)
[[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [-0.29038349 -0.3967836   0.46887872  0.41284853  0.15527183 -0.20647202
  -0.05619827  0.37759045 -0.30625056  0.29131078  0.40400027 -0.1658854
   0.06171373 -0.18156729  0.31655114 -0.32507217 -0.16188482  0.28201956
   0.44501744  0.2954131  -0.21233307  0.08122893 -0.05593474 -0.28424983
   0.12955629 -0.35590723  0.04077065 -0.42050908  0.20095938  0.45070885
   0.18986001 -0.33806296]]


### Create pre-trained embeddings

In [None]:
## Create word embeddings from trained Word2Vec model
from gensim.models import word2vec, Phrases
embeddings_source = 'word2vec'

# Load model
file_path_1_1 = pardir+"/3_model_word2vec_vec64_win1__dict_sample_5000/word2vect_class_specific_unigrams__vec64_win1__dict_sample_5000"
file_path_1_2 = pardir+"/3_model_word2vec_vec64_win1__dict_sample_5000/word2vect_class_specific_unigrams__vec128_win1__dict_sample_5000"
file_path_1_3 = pardir+"/3_model_word2vec_vec64_win1__dict_sample_5000/word2vect_vec_64_win30__dict_sample_5000"
file_path_1_4 = pardir+"/3_model_word2vec_vec64_win1__dict_sample_5000/word2vect_unigrams_interrelations__vec64_win1__dict_sample_5000"
file_path_1_5 = '/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_vec64_win1_sample10_iter100__dict_sample_5000'
file_path_1_6 = '/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_class_specififc_unigrams__vec64_win1_sample0_iter1000__dict_sample_5000'
file_path_1_7 = '/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_class_specififc_unigrams__vec64_win1_sample0_iter100__dict_sample_5000'
file_path_1_8 = '/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_class_specififc_unigrams__vec64_win1_sample10_iter100__dict_sample_5000'

file_path_1_9 ='/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_class_specififc_unigrams__vec64_win1_sample0.001_iter1000__dict_sample_5000'
file_path_1_10 ='/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/'
#model = word2vec.Word2Vec.load(file_path_1_4)

file_path_2_1 = pardir+"/3_model_fasttext/fasttext__vec64_win5__dict_sampled_5000.vec"
file_path_2_2 = pardir+"/3_model_fasttext/fasttext__vec64_win1__dict_sampled_5000.vec"
file_path_2_3 = pardir+"/3_model_fasttext/fasttext__vec128_win1__dict_sampled_5000.vec"
file_path_2_4 = pardir+"/3_model_fasttext/fasttext__vec400_win2__scorecards.vec"
model = word2vec.Word2Vec.load_word2vec_format(file_path_2_4) # for fasttext model

#print(model.vocab.keys())
#sys.exit()

# word vector embeddings from model into dictionary
word2vec_dict={}
for word in model.vocab.keys():
    try:
        word2vec_dict[word]=model[word]
    except:    
        pass
print('Loaded %s word vectors.' % len(word2vec_dict))
    
embedding_vecor_length = len(model[word])
print('embedding_vecor_length:',embedding_vecor_length)


embedding_matrix = np.zeros((len(word_index) + 1, embedding_vecor_length))
for word, i in word_index.items():
    embedding_vector = word2vec_dict.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
print('\nembedding matrix shape:',embedding_matrix.shape)
print(embedding_matrix[0]) # first cell should be all zeros
print(embedding_matrix[1])

In [10]:
# Best model result holder
best_model_aux = {}
best_model_aux['Max length'] = max_description_length
best_model_aux['Best score'] = 0
best_model_aux['texts_to_sequences'] = texts_to_sequences_custom
best_model_aux['word_index'] = word_index
best_model_aux['Label encoder'] = le


best_model = None

In [14]:
# Optional
# Load previous model (if needs to be compared in the following training)
#best_model = load_model('category_927_nets_1000_model.h5')
#best_model_aux = get_model_file_aux('category_927_nets_1000_model_aux.pkl')

In [11]:
# prediction nodes count
nb_classes = train_df['category_full_path_mod1'].unique()
print('Classes count:', len(nb_classes))

# keep the length of original (e.g. unboosted) train set
orig_dim = train_texts_vec_mtx.shape[0]

train_df = None
samples_df = None

Classes count: 861


In [26]:
model = None

### Model: Hierarchical LSTM

In [13]:
from keras.layers import Input
from keras.models import Model

# TRAIN MODEL 1
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

embedding_layer = Embedding(top_words,
                            embedding_vecor_length,
                            weights=[embedding_matrix],
                            input_length=ngram_len,
                            trainable=False)

sentence_input = Input(shape=(ngram_len,), dtype='int32') 
embedded_sequences = embedding_layer(sentence_input)
LSTM_ngram = LSTM(16)(embedded_sequences)   # Bidirectional(LSTM(16))(embedded_sequences)
ngram_Encoder = Model(sentence_input, LSTM_ngram)

sentence_input = Input(shape=(max_description_length, ngram_len), dtype='int32')
sentence_encoder = TimeDistributed(ngram_Encoder)(sentence_input)
LSTM_sentence = LSTM(16)(sentence_encoder)   # Bidirectional(LSTM(16))(sentence_encoder)
preds = Dense(len(nb_classes), activation='softmax')(LSTM_sentence)
model = Model(sentence_input, preds)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(1):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx_, y_ary_cat, nb_epoch=10, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx_, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',scores[1])
    #    break
    print()
 

print("Training took %g s" % (time.time() - start))


# 1  ================================================================================
# Condition 1:
# train data: scorecards_for_fasttext.csv
# trainable=False
# samples_df['category_full_path_mod1'].str.contains("Other")==False
# samples_df['category_full_path_mod1'] != 'Books'
# ngram_len = 1, classes = 861

# Accuracy progress:
# loss: 4.6068 - acc: 0.1441  Valid accuracy:  0.209977870991
# loss: 3.9868 - acc: 0.2639  Valid accuracy:  0.292889404498
# loss: 3.6656 - acc: 0.3083  Valid accuracy:  0.322813108961
# loss: 3.4465 - acc: 0.3395  Valid accuracy:  0.35384119959
# .
# .
# 10th
# loss: 2.8205 - acc: 0.4448  Valid accuracy:  0.448137199855

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 30, 1)         0                                            
____________________________________________________________________________________________________
timedistributed_1 (TimeDistribut (None, 30, 16)        4530624     input_2[0][0]                    
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 16)            2112        timedistributed_1[0][0]          
____________________________________________________________________________________________________
dense_1 (Dense)                  (None, 861)           14637       lstm_2[0][0]                     
Total params: 4,547,373
Trainable params: 19,885
Non-trainable params: 4,527,488
__________

### Model: Hierarchical LSTM with Attention layer

In [13]:
from keras.layers import Input
from keras.models import Model

# TRAIN MODEL 1
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

embedding_layer = Embedding(top_words
                            ,embedding_vecor_length
                            ,weights=[embedding_matrix]
                            ,input_length=ngram_len
                            ,trainable=True
                            ,init='glorot_uniform'
                            )

sentence_input = Input(shape=(ngram_len,), dtype='int32') 
embedded_sequences = embedding_layer(sentence_input)
LSTM_ngram = LSTM(16, return_sequences=True)(embedded_sequences)    # Bidirectional(LSTM(16))(embedded_sequences)
l_dense = TimeDistributed(Dense(16))(LSTM_ngram)
l_att = AttLayer()(l_dense)
#l_att = AttentionWithContext()(l_dense)
ngram_Encoder = Model(sentence_input, l_att)

sentence_input = Input(shape=(max_description_length, ngram_len), dtype='int32')
sentence_encoder = TimeDistributed(ngram_Encoder)(sentence_input)
LSTM_sentence = LSTM(16, return_sequences=True)(sentence_encoder)   # Bidirectional(LSTM(16))(sentence_encoder)
l_dense_sent = TimeDistributed(Dense(16))(LSTM_sentence)
l_att_sent = AttLayer()(l_dense_sent)
#l_att_sent = AttentionWithContext()(l_dense_sent)
preds = Dense(len(nb_classes), activation='softmax')(l_att_sent)
model = Model(sentence_input, preds)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
early_stopping = EarlyStopping(monitor='val_loss', patience=2) #stop training when loss didn't improve for 2 epochs
for ep in range(10):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx_, y_ary_cat, nb_epoch=1, batch_size=batch_size_, shuffle=True) #, callbacks=[early_stopping]
    
    scores = model.evaluate(train_texts_vec_mtx_, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',scores[1])
    #    break
    print()
 

print("Training took %g s" % (time.time() - start))

# 1  ================================================================================
# Condition 1:
# train dat: scorecards_for_fasttext.csv
# trainable=False
# samples_df['category_full_path_mod1'].str.contains("Other")==False
# samples_df['category_full_path_mod1'] != 'Books'
# ngram_len = 1, classes = 861 # no difference for ngram_len = 2
# no difference for AttLayer(), AttentionWithContext()

# Accuracy progress:
# loss: 4.1684 - acc: 0.2170  Valid accuracy:  0.302552267398
# loss: 3.3941 - acc: 0.3548  Valid accuracy:  0.383362370776
# loss: 3.0805 - acc: 0.4096  Valid accuracy:  0.428786702778
# loss: 2.8864 - acc: 0.4410  Valid accuracy:  0.455839003204
# .
# .
# .
# 10th
# loss: 2.4279 - acc: 0.5177  Valid accuracy:  0.521408577468

# 2  ================================================================================
# Condition 2:
# train data: scorecards_for_fasttext.csv
# trainable=True, init='glorot_uniform'
# samples_df['category_full_path_mod1'].str.contains("Other")==False
# samples_df['category_full_path_mod1'] != 'Books'
# ngram_len = 1, classes = 861
# no difference for AttLayer(), AttentionWithContext()

# Accuracy progress:
# loss: 3.3483 - acc: 0.3130  Valid accuracy:  0.499686230472
# loss: 1.9544 - acc: 0.5774  Valid accuracy:  0.643729150841
# loss: 1.4911 - acc: 0.6736  Valid accuracy:  0.718173035638
# loss: 1.2525 - acc: 0.7242  Valid accuracy:  0.755014119629
# loss: 1.0931 - acc: 0.7578  Valid accuracy:  0.788760858077
# loss: 0.9756 - acc: 0.7821  Valid accuracy:  0.805784919246
# loss: 0.8856 - acc: 0.8008  Valid accuracy:  0.823702810714
# loss: 0.8143 - acc: 0.8154  Valid accuracy:  0.837917396043
# loss: 0.7564 - acc: 0.8283  Valid accuracy:  0.845518050005
# loss: 0.7087 - acc: 0.8383  Valid accuracy:  0.856855038478
# .
# .
# 21th
# loss: 0.4684 - acc: 0.8889  Valid accuracy:  0.901172920038

# with AttLayer()
# Accuracy progress:

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 30, 1)         0                                            
____________________________________________________________________________________________________
timedistributed_2 (TimeDistribut (None, 30, 16)        4530912     input_2[0][0]                    
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 30, 16)        2112        timedistributed_2[0][0]          
____________________________________________________________________________________________________
timedistributed_3 (TimeDistribut (None, 30, 16)        272         lstm_2[0][0]                     
___________________________________________________________________________________________

### Fasttext

In [12]:
def means(x):
    return K.mean(x, axis=1)

def max_(x):
    return K.max(x, axis=1)

In [23]:
# TRAIN MODEL 1
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
print('Embeddings source:',embeddings_source)
embedding_layer = Embedding(top_words
                            ,embedding_vecor_length
                            ,weights=[embedding_matrix]
                            ,input_length = max_description_length
                            ,trainable=True
                            ,init='glorot_uniform')
model.add(embedding_layer)
model.add(Lambda(means, output_shape=(embedding_vecor_length,)))

## OUTPUT: classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(4):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',scores[1])
    #    break
    print()
 

print("Training took %g s" % (time.time() - start))


# 1  ================================================================================
# Condition 1:
# train data: scorecards_for_fasttext.csv
# trainable=True, init='glorot_uniform'
# samples_df['category_full_path_mod1'].str.contains("Other")==False
# samples_df['category_full_path_mod1'] != 'Books'
# ngram_len = 1, classes = 861
# trainable=True, init='glorot_uniform'

# Accuracy progress:
# loss: 3.4516 - acc: 0.3761  Valid accuracy:  0.578107969746
# loss: 1.8670 - acc: 0.6388  Valid accuracy:  0.687543349737
# loss: 1.3975 - acc: 0.7148  Valid accuracy:  0.742787429402
# loss: 1.1563 - acc: 0.7562  Valid accuracy:  0.77322926644
# 1000 sec

Embeddings source: random
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, 30, 32)        4527488     embedding_input_6[0][0]          
____________________________________________________________________________________________________
lambda_4 (Lambda)                (None, 32)            0           embedding_6[0][0]                
____________________________________________________________________________________________________
dense_4 (Dense)                  (None, 861)           28413       lambda_4[0][0]                   
Total params: 4,555,901
Trainable params: 4,555,901
Non-trainable params: 0
____________________________________________________________________________________________________
None
Epoch iter #1
Epoch 1/1
Captured improved model
Valid accuracy:  0.578107969746

Epoc

### Model 1

In [18]:
# TRAIN MODEL 1
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
#print('Embeddings source:',embeddings_source)
embedding_layer = Embedding(top_words
                            ,embedding_vecor_length
                            #,weights=[embedding_matrix]
                            ,input_length = max_description_length
                            ,init='glorot_uniform'
                            ,trainable=True)
model.add(embedding_layer)
#model.add(BatchNormalization()) #axis=1

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(32, return_sequences=False, activation='linear')
model.add(LSTM_1)
#LSTM_1 = SimpleRNN(128,return_sequences=False)
#model.add(LSTM(8,return_sequences=False, activation='linear'))
#model.add(LSTM(64,return_sequences=False, activation='linear'))
#LSTM_1 = LSTM(128,return_sequences=False, dropout_W = 0.3, dropout_U = 0.3)


## Dense 1
## ======================================================================================
Dense_1 = Dense(200,activation='sigmoid')
#model.add(Dense_1)


## OUTPUT: classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(10):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx[0:orig_dim], y_ary_cat[0:orig_dim], verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',best_model_aux['Best score'])
    #    break
    print()
 

print("Training took %g s" % (time.time() - start))

# word2vect_vec64_win1_sample0_iter100__dict_sample_5000
# Accuracy: loss: start: 0.6854, end: 0.8602

# word2vect_vec64_win1_sample0_iter1000__dict_sample_5000
# Accuracy: loss: start: 0.6958

# word2vect_vec64_win1_sample10_iter100__dict_sample_5000
# Accuracy: loss: start: 0.7053

# word2vect_class_specififc_unigrams__vec64_win1_sample0_iter1000__dict_sample_5000
# Accuracy: loss: start: 0.7368, 0.8493

# word2vect_class_specififc_unigrams__vec64_win1_sample0_iter100__dict_sample_5000
# Accuracy: loss: start: 0.7198, 

# word2vect_class_specififc_unigrams__vec64_win1_sample10_iter100__dict_sample_5000
# Accuracy: loss: start: 0.7388


# 1  ================================================================================
# Condition 1:
# Embedding + LSTM
# train dat: scorecards_for_fasttext.csv
# trainable=False
# hidden units: 16
# samples_df['category_full_path_mod1'].str.contains("Other")==False
# samples_df['category_full_path_mod1'] != 'Books'

# Accuracy progress:
# loss: 3.9365 - acc: 0.2722  Valid accuracy:  0.339447022492
# loss: 3.3892 - acc: 0.3639  Valid accuracy:  0.381209333818
# loss: 3.2052 - acc: 0.3929  Valid accuracy:  0.401532103577
# loss: 3.0906 - acc: 0.4100  Valid accuracy:  0.416644647752
# loss: 3.0094 - acc: 0.4218  Valid accuracy:  0.427795851637
# .
# .
# 20th
# loss: 2.5908 - acc: 0.4885  Valid accuracy:  0.490392872477

# 2  ================================================================================
# Condition 2:
# Embedding + LSTM
# train dat: scorecards_for_fasttext.csv
# trainable=True, init='glorot_uniform'
# hidden units: 16, 32 (almost same results)
# samples_df['category_full_path_mod1'].str.contains("Other")==False
# samples_df['category_full_path_mod1'] != 'Books'

# Accuracy progress:
# loss: 2.2884 - acc: 0.5328  Valid accuracy:  0.656050797635
# loss: 1.3082 - acc: 0.7069  Valid accuracy:  0.758475905803
# loss: 1.0187 - acc: 0.7653  Valid accuracy:  0.805570234832
# loss: 0.8438 - acc: 0.8029  Valid accuracy:  0.840631915315
# loss: 0.7238 - acc: 0.8281  Valid accuracy:  0.85519742709
# .
# 13th
# loss: 0.3682 - acc: 0.9069  Valid accuracy:  0.924053324306

# for hidden units 32:
# 14th
# loss: 0.2676 - acc: 0.9291  Valid accuracy:  0.947082356244

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 30, 1)         141484      embedding_input_5[0][0]          
____________________________________________________________________________________________________
lstm_5 (LSTM)                    (None, 32)            4352        embedding_5[0][0]                
____________________________________________________________________________________________________
dense_10 (Dense)                 (None, 861)           28413       lstm_5[0][0]                     
Total params: 174,249
Trainable params: 174,249
Non-trainable params: 0
____________________________________________________________________________________________________
None
Epoch iter #1
Epoch 1/1

KeyboardInterrupt: 

In [18]:
best_model_aux['Best score']

0.36778330085543481

In [28]:
for ep in range(5):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx[0:orig_dim], y_ary_cat[0:orig_dim], verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',best_model_aux['Best score'])
    #    break
    print()

Epoch iter #1
Epoch 1/1
Captured improved model
Valid accuracy:  0.934331340622

Epoch iter #2
Epoch 1/1
Captured improved model
Valid accuracy:  0.938292680913

Epoch iter #3
Epoch 1/1
Captured improved model
Valid accuracy:  0.94294555273

Epoch iter #4
Epoch 1/1
Captured improved model
Valid accuracy:  0.947082356244

Epoch iter #5
Epoch 1/1
Model not improved
Valid accuracy:  0.947082356244



### Train set boosting

In [17]:
# Read sampled descriptionary

path = pardir+'/1_data/'
file_name = 'sampled_descriptionary_sample_size_5000.csv'
file_name = 'scorecards_for_fasttext.csv'
train_df = pd.read_csv(path + file_name)

# Rename columns
train_df.rename(columns={
                           #'description': 'description_mod1', 
                           '\ufeff"description"': 'description_mod1',
                           'category_id': 'category_id_mod1',
                           'category_path': 'category_full_path_mod1'}, inplace=True)

# Drop rows with NaN in any column
train_df.dropna()

# Process description_mod1 strings by process_string function
train_df['description_mod1'] = train_df['description_mod1'].apply(lambda x: process_string(x))

# Drop rows where token count less than 1 in description_mod1 column
selected_indices = train_df['description_mod1'].apply(lambda x: len(str(x).split()) > 1)
train_df = train_df[selected_indices]

# Drop duplicates
train_df.drop_duplicates(subset=['description_mod1','category_full_path_mod1'], inplace = True, keep='first')
train_df.drop_duplicates(subset=['description_mod1'], inplace = True, keep=False)
train_df.reset_index(drop=True, inplace=True) 

train_df['target_le'] = best_model_aux['Label encoder'].transform(train_df['category_full_path_mod1'])

print('samples data shape:',train_df.shape)
train_df.head()

samples data shape: (587983, 4)


Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,target_le
0,aveeno baby wash shampoo lightly scented 8 ou...,206,Baby Products > Bathing & Skin Care > Soaps & ...,206
1,earths best chlorine free diapers size 4 120 c...,213,Baby Products > Diapering > Disposable Diapers,212
2,doa5lr costume catalog lr10 addon content,320,Electronics & Accessories > Video Games > Other,298
3,lightweight oxford l in slim and white,152,Apparel & Accessories > Apparel > Tops & Tees ...,90
4,now foods nutritional yeast flakes 10ounce,525,Health & Beauty > Vitamins & Dietary Supplements,504


In [18]:
# 1 
# Get predictions
X_ls = np.array(list(train_df['description_mod1']))
seq_ = texts_to_sequences_custom(X_ls, best_model_aux['word_index'], new_words_to_zero = False)
seq_pad = sequence.pad_sequences(seq_, maxlen = 30)

predictions = best_model.predict_classes(seq_pad)

X_ls = None
seq_ = None
seq_pad = None
old_best_model_ = None



In [19]:
# 2
# Append predictions to df
train_df['Predictions_le'] = list(predictions)
train_df['Predictions'] = train_df['Predictions_le'].apply(lambda x: best_model_aux['Label encoder'].inverse_transform(x))
train_df.head()

Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,target_le,Predictions_le,Predictions
0,aveeno baby wash shampoo lightly scented 8 ou...,206,Baby Products > Bathing & Skin Care > Soaps & ...,206,312,Grocery & Gourmet Food > Beverages > Juices
1,earths best chlorine free diapers size 4 120 c...,213,Baby Products > Diapering > Disposable Diapers,212,212,Baby Products > Diapering > Disposable Diapers
2,doa5lr costume catalog lr10 addon content,320,Electronics & Accessories > Video Games > Other,298,298,Electronics & Accessories > Video Games > Other
3,lightweight oxford l in slim and white,152,Apparel & Accessories > Apparel > Tops & Tees ...,90,90,Apparel & Accessories > Apparel > Tops & Tees ...
4,now foods nutritional yeast flakes 10ounce,525,Health & Beauty > Vitamins & Dietary Supplements,504,240,Books


In [20]:
# 3
# Get FP items to be boosted
X_ls_boost = np.array(list(train_df.loc[train_df.target_le != train_df.Predictions_le,'description_mod1']))
print(X_ls_boost.shape)

train_texts_vec_ls_boost = texts_to_sequences_custom(X_ls_boost, best_model_aux['word_index'], new_words_to_zero = False)
train_texts_vec_mtx_boost = sequence.pad_sequences(train_texts_vec_ls_boost, maxlen = best_model_aux['Max length'])
print('train_texts_vec_mtx_boost shape:',train_texts_vec_mtx_boost.shape)

ind = train_df.loc[train_df.target_le != train_df.Predictions_le,'description_mod1'].index.values
y_ary_cat_boost = y_ary_cat[ind]
print('y_ary_cat_boost:',y_ary_cat_boost.shape)

X_ls_boost = None
train_texts_vec_ls_boost = None

(358126,)
train_texts_vec_mtx_boost shape: (358126, 30)
y_ary_cat_boost: (358126, 1021)


In [24]:
# 4
# Boost train set
#print('Original train_texts_vec_mtx:',train_texts_vec_mtx.shape)
#print('Original y_ary_cat:',y_ary_cat.shape)

#train_texts_vec_mtx = np.concatenate([train_texts_vec_mtx,train_texts_vec_mtx_boost], axis=0)
#print('Boosted train_texts_vec_mtx:',train_texts_vec_mtx.shape)

#y_ary_cat = np.concatenate([y_ary_cat,y_ary_cat_boost], axis=0)
#print('Boosted y_ary_cat:',y_ary_cat.shape)

#train_texts_vec_mtx_boost = None
#y_ary_cat_boost = None

Original train_texts_vec_mtx: (587983, 30)
Original y_ary_cat: (587983, 1021)
Boosted train_texts_vec_mtx: (978219, 30)
Boosted y_ary_cat: (978219, 1021)


In [44]:
# print('Original train_texts_vec_mtx:',train_texts_vec_mtx.shape)
# print('Original y_ary_cat:',y_ary_cat.shape)

# train_texts_vec_mtx = train_texts_vec_mtx_boost
# print('Boosted train_texts_vec_mtx:',train_texts_vec_mtx.shape)

# y_ary_cat = y_ary_cat_boost
# print('Boosted y_ary_cat:',y_ary_cat.shape)

# train_texts_vec_mtx_boost = None
# y_ary_cat_boost = None

Original train_texts_vec_mtx: (978219, 30)
Original y_ary_cat: (978219, 1021)
Boosted train_texts_vec_mtx: (342347, 30)
Boosted y_ary_cat: (342347, 1021)


In [21]:
# Continue training with boosted train set
print(model.summary())

# start training multiple times with epoch=1
for ep in range(3):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx_boost, y_ary_cat_boost, nb_epoch=1, batch_size=batch_size_)
    
    #scores = model.evaluate(train_texts_vec_mtx[0:orig_dim], y_ary_cat[0:orig_dim], verbose=0)
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',scores[1])
    #    break
    print()
 

print("Training took %g s" % (time.time() - start))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 30, 32)        5317632     embedding_input_1[0][0]          
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 16)            3136        embedding_1[0][0]                
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 1021)          17357       lstm_1[0][0]                     
Total params: 5,338,125
Trainable params: 20,493
Non-trainable params: 5,317,632
____________________________________________________________________________________________________
None
Epoch iter #1
Epoch 1/1
Model not improved
Valid accuracy:  0.201453103236

Epoch iter #2
Epoch 1/1
Model 

In [37]:
best_model_aux['Best score']

0.41776037742603001

### Model 2 (CNN)

In [20]:
# TRAIN MODEL 2 (CNN)
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
print('Embeddings source:',embeddings_source)
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)
#model.add(BatchNormalization()) #axis=1

nb_feature_maps = 8
n_gram = 1
#model.add(Reshape(1, max_description_length, embedding_vecor_length))
#model.add(Convolution2D(nb_feature_maps, 1, n_gram, embedding_vecor_length))
#model.add(MaxPooling2D(poolsize=(max_description_length - n_gram + 1, 1)))
#model.add(Flatten())

# Conv 1
nb_filter_1 = 64
conv1D_1 = Conv1D(nb_filter=nb_filter_1   # feature maps
                  ,filter_length=3        # kernel size
                  ,subsample_length=1     # strides
                  ,border_mode='valid'    # padding: same, valid
                  ,activation='relu'
                 )
model.add(conv1D_1)
model.add(MaxPooling1D(pool_length = 3, stride = 1)) #model.output_shape[1]

# Conv 2
nb_filter_2 = 64
conv1D_2 = Conv1D(nb_filter=nb_filter_2,filter_length=2, subsample_length=1,border_mode='same',activation='relu')
model.add(conv1D_2)
model.add(MaxPooling1D(pool_length = 3, stride = 1))

# Conv 3
nb_filter_3 = 128
conv1D_3 = Conv1D(nb_filter=nb_filter_3,filter_length=4, subsample_length=2,border_mode='same',activation='relu')
model.add(conv1D_3)
model.add(MaxPooling1D(pool_length = 3, stride = 1))

# Conv 4
nb_filter_4 = 128
conv1D_4 = Conv1D(nb_filter=nb_filter_4,filter_length=4, subsample_length=2,border_mode='same',activation='relu')
model.add(conv1D_4)
model.add(MaxPooling1D(pool_length = 3, stride = 1))



model.add(Flatten())


## Dense 1
## ======================================================================================
Dense_1 = Dense(600,activation='relu')
model.add(Dense_1)


## OUTPUT: classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    #else:
    #    break
    print()
 

print("Training took %g s" % (time.time() - start))

Embeddings source: random
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_8 (Embedding)          (None, 30, 64)        10635264    embedding_input_8[0][0]          
____________________________________________________________________________________________________
convolution1d_29 (Convolution1D) (None, 28, 64)        12352       embedding_8[0][0]                
____________________________________________________________________________________________________
maxpooling1d_29 (MaxPooling1D)   (None, 26, 64)        0           convolution1d_29[0][0]           
____________________________________________________________________________________________________
convolution1d_30 (Convolution1D) (None, 26, 64)        8256        maxpooling1d_29[0][0]            
_________________________________________________________________

KeyboardInterrupt: 

### Model 2

In [12]:
# FUNCTION FOR TRAIN MODEL 2

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializations

class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializations.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        super(AttLayer, self).__init__(** kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        #self.W = self.init((input_shape[-1],1))
        self.W = self.init((input_shape[-1],))
        #self.input_spec = [InputSpec(shape=input_shape)]
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))

        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')

        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def get_output_shape_for(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [17]:
# TRAIN MODEL 2
# Attention-based
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(32,return_sequences=True)
model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(32))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
model.add(LSTM(32))
#model.add(AttLayer())

## Dense 1
## ======================================================================================
Dense_1 = Dense(128,activation='sigmoid')
#model.add(Dense_1)

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        break
    print()
 

print("Training took %g s" % (time.time() - start))

# Accuracy: loss: 0.2722 - acc: 0.9140

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_2 (Embedding)          (None, 30, 32)        4527488     embedding_input_2[0][0]          
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 30, 32)        8320        embedding_2[0][0]                
____________________________________________________________________________________________________
timedistributed_1 (TimeDistribut (None, 30, 32)        1056        lstm_2[0][0]                     
____________________________________________________________________________________________________
lstm_3 (LSTM)                    (None, 32)            8320        timedistributed_1[0][0]          
___________________________________________________________________________________________

KeyboardInterrupt: 

In [None]:
best_model_aux['Best score']

In [None]:
# Plot Nets design
#from keras.utils import plot_model
import matplotlib.pyplot as plt
from keras.utils.visualize_util import model_to_dot

plot(model, to_file='/Users/altay.amanbay/Desktop/model.png')
SVG(model_to_dot(model).create(prog='dot', format='svg'))


### Model 3

In [12]:
# FUNCTION FOR TRAIN MODEL 3

class AttentionWithContext(Layer):
    """
        Attention operation, with a context/query vector, for temporal data.
        Supports Masking.
        Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
        "Hierarchical Attention Networks for Document Classification"
        by using a context vector to assist the attention
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            model.add(LSTM(64, return_sequences=True))
            model.add(AttentionWithContext())
        """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        self.init = initializations.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = K.dot(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = K.dot(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def get_output_shape_for(self, input_shape):
        return input_shape[0], input_shape[-1]

In [None]:
# TRAIN MODEL 3
# Attention-based
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=True)
# model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
#model.add(AttentionWithContext())

LSTM_2 = LSTM(128,return_sequences=False)
model.add(LSTM_2)

## Dense 1
## ======================================================================================
Dense_1 = Dense(128,activation='sigmoid')
#model.add(Dense_1)

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        break
    print()
 

print("Training took %g s" % (time.time() - start))

Accuracy: loss: 0.2677 - acc: 0.9152
Valid accuracy:  0.923365750258

In [None]:
best_model_aux['Best score']

In [None]:
# Plot Nets design
#from keras.utils import plot_model
import matplotlib.pyplot as plt
from keras.utils.visualize_util import model_to_dot

plot(model, to_file='/Users/altay.amanbay/Desktop/model.png')
SVG(model_to_dot(model).create(prog='dot', format='svg'))


### Model 4

In [None]:
# TRAIN MODEL 4
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=True, activation='softmax')
model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
#model.add(AttentionWithContext())

LSTM_2 = LSTM(128,return_sequences=False)
model.add(LSTM_2)

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        break
    print()
 

print("Training took %g s" % (time.time() - start))

# Fails with acc: 0.0636

### Model 5

In [None]:
# TRAIN MODEL 5
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=True)
#model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
#model.add(AttLayer())
#model.add(AttentionWithContext())

LSTM_2 = LSTM(128,return_sequences=True)
model.add(LSTM_2)
model.add(AttentionWithContext())
## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        break
    print()
 

print("Training took %g s" % (time.time() - start))

# Fails with acc: 

### Model 6

In [None]:
# TRAIN MODEL 6 
# with word vect doc2vect_vec_64_win30__dict_sample_5000
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=True)
model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
model.add(AttLayer())

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        break
    print()
 

print("Training took %g s" % (time.time() - start))

# Fails
# ____________________________________________________________________________________________________
# Layer (type)                     Output Shape          Param #     Connected to                     
# ====================================================================================================
# embedding_4 (Embedding)          (None, 30, 64)        15036864    embedding_input_4[0][0]          
# ____________________________________________________________________________________________________
# lstm_4 (LSTM)                    (None, 30, 128)       98816       embedding_4[0][0]                
# ____________________________________________________________________________________________________
# timedistributed_4 (TimeDistribut (None, 30, 200)       25800       lstm_4[0][0]                     
# ____________________________________________________________________________________________________
# attlayer_1 (AttLayer)            (None, 200)           200         timedistributed_4[0][0]          
# ____________________________________________________________________________________________________
# dense_7 (Dense)                  (None, 398)           79998       attlayer_1[0][0]                 
# ====================================================================================================
# Total params: 15,241,678
# Trainable params: 204,814
# Non-trainable params: 15,036,864
# ____________________________________________________________________________________________________
# None
# Epoch iter #1
# Epoch 1/1
# 938810/938810 [==============================] - 1057s - loss: 1.0193 - acc: 0.7235  
# Captured improved model
# Valid accuracy:  0.801597767386

### Model 7

In [None]:
# TRAIN MODEL 6 
# with word word2vect_unigrams_interrelations__vec64_win1__dict_sample_5000
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(128,return_sequences=False, dropout_W = 0.3, dropout_U = 0.3)
model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
#model.add(TimeDist_1)
#model.add(AttLayer())

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        break
    print()
 

print("Training took %g s" % (time.time() - start))

### Model 8

In [None]:
from keras import backend as K
from keras.layers import LSTM

class AttentionLSTM(LSTM):
    def __init__(self, output_dim, attention_vec, **kwargs):
        self.attention_vec = attention_vec
        super(AttentionLSTM, self).__init__(output_dim, **kwargs)

    def build(self, input_shape):
        super(AttentionLSTM, self).build(input_shape)

        assert hasattr(self.attention_vec, '_keras_shape')
        attention_dim = self.attention_vec._keras_shape[1]

        self.U_a = self.inner_init((self.output_dim, self.output_dim),
                                   name='{}_U_a'.format(self.name))
        self.b_a = K.zeros((self.output_dim,), name='{}_b_a'.format(self.name))

        self.U_m = self.inner_init((attention_dim, self.output_dim),
                                   name='{}_U_m'.format(self.name))
        self.b_m = K.zeros((self.output_dim,), name='{}_b_m'.format(self.name))

        self.U_s = self.inner_init((self.output_dim, self.output_dim),
                                   name='{}_U_s'.format(self.name))
        self.b_s = K.zeros((self.output_dim,), name='{}_b_s'.format(self.name))

        self.trainable_weights += [self.U_a, self.U_m, self.U_s,
                                   self.b_a, self.b_m, self.b_s]

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def step(self, x, states):
        h, [h, c] = super(AttentionLSTM, self).step(x, states)
        attention = states[4]

        m = K.tanh(K.dot(h, self.U_a) + attention + self.b_a)
        s = K.exp(K.dot(m, self.U_s) + self.b_s)
        h = h * s

        return h, [h, c]

    def get_constants(self, x):
        constants = super(AttentionLSTM, self).get_constants(x)
        constants.append(K.dot(self.attention_vec, self.U_m) + self.b_m)
        return constants

In [None]:
# TRAIN MODEL 8
# embeddings source: word2vect_class_specific_unigrams__vec64_win1__dict_sample_5000
from keras.layers import Lambda, Input
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)
model.add(BatchNormalization(axis=1))
LSTM_1 = LSTM(128,return_sequences=False)
model.add(LSTM_1)
#model.add(Flatten())
#model.add(Dense(1920))
#model.add(Dense(960))
#model.add(Dense(480))
#model.add(Dense(240))
#model.add(Dense(120))
#model.add(Dense(240))

#model.add(Lambda(lambda x: 2 * x))
#model.add(MultiplicationLayer())
#model.add(AttentionWithContext())
#model.add(AttLayer())

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(64,return_sequences=False, dropout_W = 0.3, dropout_U = 0.3)
#model.add(LSTM_1)

## Attention 1
## ======================================================================================
TimeDist_1 = TimeDistributed(Dense(200))  #, input_shape=(max_description_length, embedding_vecor_length)
#model.add(TimeDist_1)

## Output classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        break
    print()
 

print("Training took %g s" % (time.time() - start))

### Save model

In [29]:
# Save model and aux file

best_model.save('keras_model__LSTM_32__val_accuracy_94.h5')

best_model_aux_name = 'keras_model__LSTM_32__val_accuracy_94.pkl'
with open(best_model_aux_name, 'wb') as pickle_file:
    pickle.dump(best_model_aux, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Evaluation (optional)

In [None]:
# Final evaluation of the model
start = time.time()

scores = model.evaluate(train_texts_vec_mtx, y_train_ary, verbose=0)
print("Accuracy on train set: %.2f%%" % (scores[1]*100))
scores = model.evaluate(test_texts_vec_mtx, y_test_ary, verbose=0)
print("Accuracy on test set: %.2f%%" % (scores[1]*100))

print("\nEvaluation took %g s" % (time.time() - start))

In [None]:
# Get predictions
start = time.time()

predictions = model.predict_classes(test_texts_vec_mtx)
#predictions_rnd = np.round_(predictions, decimals=0, out=None)
predictions_probs = model.predict(test_texts_vec_mtx)

print('%-20s' % "predictions[0]",':', predictions[0])
#print('%-20s' % "predictions_rnd[0]:",':',predictions_rnd[0])
#print('%-20s' % "predictions_probs[0]",':', predictions_probs[0])
print("\nPrediction took %g s" % (time.time() - start))

In [None]:
# Manual check which nodes are not distinguishable for nets
class_ = 392
print(le.inverse_transform(class_))
print(X_test_ls[np_utils.categorical_probas_to_classes(y_test_ary)==class_],'\n')

class_ = 390
print(le.inverse_transform(class_))
print(X_test_ls[np_utils.categorical_probas_to_classes(y_test_ary)==class_],'\n')

class_ = 389
print(le.inverse_transform(class_))
print(X_test_ls[np_utils.categorical_probas_to_classes(y_test_ary)==class_],'\n')


In [None]:
#pd.crosstab(pd.Series(y_test_ary.ravel()), pd.Series(predictions_rnd.ravel()), rownames=['True'], colnames=['Predicted'], margins=True)
pd.crosstab(pd.Series(np_utils.categorical_probas_to_classes(y_test_ary)), pd.Series(predictions), rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
pd.crosstab(pd.Series(np_utils.categorical_probas_to_classes(y_test_ary)), pd.Series(predictions.ravel()), rownames=['True'], colnames=['Predicted'], margins=True)

### Testing

In [13]:
#model.get_config()
def prediction_to_str(clf_prediction, category_id):
    if(clf_prediction > 0.5):
        return str(category_id)
    else:
        return 'not ' + str(category_id)

def predict(description_str, word_index, clf_, max_length_, category_id_):
    #seq_ = tok_.texts_to_sequences([description_str])
    seq_ = texts_to_sequences_custom([description_str.lower()], word_index)
    seq_pad = sequence.pad_sequences(seq_, maxlen = max_length_)
    #clf_prediction = clf_.predict(seq_pad)
    clf_prediction = old_best_model_.predict_classes(seq_pad, verbose=0)
    
    #print(seq_)
    #print(seq_pad)
    
    # Prediction to string
    #clf_prediction_str = prediction_to_str(clf_prediction[0][0], category_id_)
    clf_prediction_str = clf_prediction
    
    return clf_prediction_str
    #return clf_prediction[0][0]

def predict_2(description_str, word_index, clf_, max_length_, category_id_):
    #seq_ = tok_.texts_to_sequences([description_str])
    seq_ = texts_to_sequences_custom([description_str.lower()], word_index)
    seq_pad = sequence.pad_sequences(seq_, maxlen = max_length_)
    #clf_prediction = clf_.predict(seq_pad)
    clf_prediction = model.predict_classes(seq_pad)
    
    #print(seq_)
    #print(seq_pad)
    
    # Prediction to string
    #clf_prediction_str = prediction_to_str(clf_prediction[0][0], category_id_)
    clf_prediction = le.inverse_transform(clf_prediction)
    
    if(clf_prediction == ['Positive']):
        return str(category_id_)
    else:
        return 'not ' + str(category_id_)
    
    
def predict_proba(description_str, word_index, clf_, max_length_):
    #seq_ = tok_.texts_to_sequences([description_str])
    seq_ = texts_to_sequences_custom([description_str], word_index)
    seq_pad = sequence.pad_sequences(seq_, maxlen = max_length_)
    clf_prediction_proba = clf_.predict_proba(seq_pad, verbose=0)
    
    return clf_prediction_proba[0][0]


# id_ = 'table Setr'
# p = predict(id_, best_model_aux['Tokenizer'], best_model, best_model_aux['Max length'], best_model_aux['Category ID'])
# pp = predict_proba(id_, best_model_aux['Tokenizer'], best_model, best_model_aux['Max length'])
# print(p)
# print(pp)

In [None]:
model_file = "whole set train model/nets_model__word2vec_class_specific_unigrams.h5"
aux_file = "whole set train model/nets_model__word2vec_class_specific_unigrams_aux.pkl"
old_best_model_ = load_model(model_file)
old_best_model_aux_ = get_model_file_aux(aux_file)

old_word_index_ = old_best_model_aux_['word_index']
le = old_best_model_aux_['Label encoder']

In [None]:
predictions = old_best_model_.predict_classes(train_texts_vec_mtx)

In [None]:
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

# Compute confusion matrix
cnf_matrix = confusion_matrix(np_utils.categorical_probas_to_classes(y_ary_cat), predictions)
np.set_printoptions(precision=2)

#class_names = [ix_to_class[i] for i in range(101)]
class_names = predictions

plt.figure()
fig = plt.gcf()
fig.set_size_inches(32, 32)
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization',
                      cmap=plt.cm.cool)
plt.show()

pd.crosstab(pd.Series(np_utils.categorical_probas_to_classes(y_ary_cat)), pd.Series(predictions), rownames=['True'], colnames=['Predicted'], margins=True)


In [None]:
X_ls = np.array(list(train_df['description_mod1']))
seq_ = texts_to_sequences_custom(X_ls, old_word_index_)
seq_pad = sequence.pad_sequences(seq_, maxlen = 30)

old_best_model_ = best_model
predictions = old_best_model_.predict_classes(seq_pad)

train_df['Predictions_le'] = list(predictions)
train_df['Predictions'] = train_df['Predictions_le'].apply(lambda x: le.inverse_transform(x))
train_df.head()

In [None]:
train_df.to_csv('Predictions__unigrams_interrelations.csv',index=False)

In [37]:
print(samples_df.loc[0,'description_mod1'])
print(samples_df.loc[0,'category_full_path_mod1'])

samples_df.loc[samples_df.category_full_path_mod1.str.contains('Hand Tools'),['description_mod1']].head()

aveeno baby wash  shampoo lightly scented 8 ounce pack of 2
Baby Products > Bathing & Skin Care > Soaps & Cleansers


Unnamed: 0,description_mod1
991,bosch dcb724b3 714inch by 24t framing circular...
1273,heavy metal bluray disc
2444,chemical guys bufhexkits8p hexlogic buffing p...
4259,greenworks 20672 gmax 40v liion 8inch cordless...
4310,cd 5 pan wip tc


In [38]:
# TEST

## load old model
model_file = "LSTM_16 w val_accuracy 92/keras_model__LSTM_16__val_acc_92.h5"
aux_file = "LSTM_16 w val_accuracy 92/keras_model__LSTM_16__val_acc_92.pkl"
old_best_model_ = load_model(model_file)
old_best_model_aux_ = get_model_file_aux(aux_file)
print(old_best_model_aux_.keys())
#old_tok_ = old_best_model_aux_['Tokenizer']
#old_word_index_ = old_best_model_aux_['Tokenizer'].word_index
old_word_index_ = old_best_model_aux_['word_index']
texts_to_sequences_custom = old_best_model_aux_['texts_to_sequences']


## use fresh model
# best_model_ = best_model
# best_model_aux_ = best_model_aux
# tok_ = tok
# word_index_ = word_index

item_d = 'NieR: Automata™ DEMO 120161128 (Playable Demo)'

# screwdrivers check
scrw_items = [
"aveeno baby wash  shampoo lightly scented 8 ounce pack of 2"
#,"tekton 2655 flare nut wrench set metric 6piece"
#,"tekton 2780 10slot screwdriver holder and organizer"
#,"titan 17237 insulated electrical screwdriver set  7 piece"
#,"tool sorter screwdriver organizer red"
#,"torin sdh15rt magnetic screwdriver holder"  #wrong predict
#,"wera 05020013001 joker combination wrenchset 11 pieces"
#,"wera kk vde 60i62i68i18 insulated pouch set with interchangeable blades 18piece" # tricky
#,"wiha 28103 magnetic 14 bit holder stubby 57mm pliers screwdriver" # tricky, wrong predict
]

for n, i in enumerate(scrw_items):
    item_d = i
    
    print(str(n) + ' ' + '='*100)
    
    print('Old model prediction:')
    print('item:',item_d)
    print('Seq max len:', old_best_model_aux_['Max length'])
    pred = predict(item_d, old_word_index_, old_best_model_, old_best_model_aux_['Max length'], '927')
    print(pred)
    print(old_best_model_aux_['Label encoder'].inverse_transform([pred])[0])
    print(predict_proba(item_d, old_word_index_, old_best_model_, old_best_model_aux_['Max length']))


#     print('\nFresh model prediction:')
#     print('item:',item_d)
#     print('Seq max len:', best_model_aux_['Max length'])
#     print(predict_2(item_d, tok_, best_model_, best_model_aux_['Max length'], '927'))
#     #print(predict_proba(item_d, tok_, best_model_, best_model_aux_['Max length']))

    print()

    #tt = train_df.loc[0:10,['description_mod1']]
    #tt['pred'] = tt['description_mod1'].apply(lambda x: predict(x, best_model_aux_['Tokenizer'], best_model, best_model_aux_['Max length'], best_model_aux_['Category ID']))
    #tt['prob'] = tt['description_mod1'].apply(lambda x: predict_proba(x, best_model_aux_['Tokenizer'], best_model, best_model_aux_['Max length']))

dict_keys(['Label encoder', 'Max length', 'texts_to_sequences', 'word_index', 'Best score'])
Old model prediction:
item: aveeno baby wash  shampoo lightly scented 8 ounce pack of 2
Seq max len: 30
[169]
['Baby Products > Bathing & Skin Care > Soaps & Cleansers']
8.36832e-28



In [62]:
w = old_best_model_.layers[1].get_weights()
len(w)


12