In [1]:
# Keras 
# Classifier: LSTM 
# Classification type: multi-class (404 classes)
# Output nodes: one per class, with softmax activation

# word2vec model: 
# word2vect_class_specififc__vec64_win1__dict_sample_5000

In [18]:
from __future__ import print_function

# Theano reads THEANO_FLAGS only once, at its first import.  When Keras runs on
# the Theano backend, importing any `keras` module imports Theano as well, so
# the device flag has to be exported BEFORE the first keras/theano import;
# setting it afterwards has no effect.
import os
os.environ['THEANO_FLAGS'] = "device=gpu0"
import theano
#theano.config.device = 'gpu0'
theano.config.floatX = 'float32'

import pickle
import time

import numpy as np
import pandas as pd

from keras.datasets import reuters
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.wrappers import TimeDistributed
from keras.models import load_model, Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Dropout, Input, Activation, LSTM, Bidirectional, Flatten, Lambda, RepeatVector
from keras.layers.recurrent import SimpleRNN
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D
from keras.layers.noise import GaussianNoise
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer, one_hot, text_to_word_sequence
from keras.preprocessing import sequence
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializations
from keras import regularizers
from keras import constraints
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping

from keras.utils.visualize_util import plot
from keras.utils.visualize_util import model_to_dot
from IPython.display import SVG

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# import custom code
# `process_string` lives in a sibling directory of the notebook's working
# directory, so that directory is put on sys.path only for the duration of
# the import and removed again right after.
import os
import sys
# parent of the current working directory; reused by later cells to build data/model paths
pardir = os.path.abspath(os.path.join(os.getcwd(), '../'))
script_path = pardir + "/2_common_aux_script"
print('Importing process_string.py \nfrom ' + script_path + " ...\n")
sys.path.append(script_path)
from process_string import process_string
sys.path.remove(script_path)


print(sys.version)

Importing process_string.py 
from /Users/altay.amanbay/Desktop/new node booster/experiments/3a.1 - Nets train/5 train model - keras (unigrams)/2_common_aux_script ...

3.5.2 |Anaconda 4.2.0 (x86_64)| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]


In [3]:
import keras
print(theano.__version__)

0.8.2


In [4]:
# Functions

def NGramGenerator_wordwise_interval(phrase, min_ngram, max_ngram):
    """Return all word-level n-grams of `phrase` for n in [min_ngram, max_ngram].

    The phrase is first passed through `process_string` and split on
    whitespace.  N-grams are produced longest first (n = max_ngram down to
    min_ngram); within one size they keep their order of appearance.

    Returns:
        list of str: each n-gram as a single space-joined string.
    """
    tokens = process_string(phrase).split()

    ngrams = []
    for size in range(max_ngram, min_ngram - 1, -1):
        # number of windows of this size that fit into the token list
        window_count = len(tokens) - size + 1
        ngrams.extend(' '.join(tokens[start:start + size]) for start in range(window_count))

    return ngrams

def get_word2index(texts_ls_):
    """Build a token -> integer id mapping from a list of text strings.

    Every text is lower-cased and split on whitespace.  Ids are assigned in
    order of first appearance, starting at 1 (0 is never handed out).
    """
    vocab = {}
    for text in texts_ls_:
        for tok in text.lower().split():
            # the next free id is always one past the current vocabulary size
            vocab.setdefault(tok, len(vocab) + 1)
    return vocab

def train_df_preprocess(top_words_, texts_ls_, max_pad_length_):
    """Vectorise texts with a freshly fitted Keras `Tokenizer` and pad them.

    Args:
        top_words_: vocabulary size limit, passed as the first positional
            argument of `Tokenizer`.
        texts_ls_: list of text strings.
        max_pad_length_: `maxlen` handed to `sequence.pad_sequences`.

    Returns:
        The padded matrix of word indexes returned by `sequence.pad_sequences`.
    """
    tok = Tokenizer(top_words_)
    tok.fit_on_texts(texts_ls_)

    # Turn every text into a list of word indexes, where the word of rank i in
    # the dataset (starting at 1) has index i.
    texts_vec_ls = tok.texts_to_sequences(texts_ls_)
    # pad_sequences is used with its defaults, i.e. shorter sequences are
    # padded with 0's at the FRONT (padding='pre').
    texts_vec_mtx = sequence.pad_sequences(texts_vec_ls, maxlen=max_pad_length_)

    return texts_vec_mtx

def text_2_vec(text_str, word2index_):
    """Convert a text string into a list of word ids.

    The text is lower-cased and split on whitespace; tokens missing from
    `word2index_` are encoded as 0.
    """
    return [word2index_.get(tok, 0) for tok in text_str.lower().split()]

def train_df_preprocess_2(texts_ls_, word2index_, max_pad_length_):
    """Encode texts with `text_2_vec` and pad them to a common length.

    Args:
        texts_ls_: list of text strings.
        word2index_: token -> id mapping used by `text_2_vec`.
        max_pad_length_: `maxlen` handed to `sequence.pad_sequences`.

    Returns:
        The padded array returned by `sequence.pad_sequences`.
    """
    encoded_texts = [text_2_vec(single_text, word2index_) for single_text in texts_ls_]
    return sequence.pad_sequences(encoded_texts, maxlen=max_pad_length_)

def texts_to_sequences_custom(texts_ls, word_index_, new_words_to_zero = False):
    """Turn texts into lists of word indexes using a given word index.

    Each text is tokenised into unigrams with
    `NGramGenerator_wordwise_interval(text, 1, 1)`.

    Args:
        texts_ls: iterable of text strings.
        word_index_: token -> index mapping.
        new_words_to_zero: if truthy, tokens missing from `word_index_` are
            encoded as 0; otherwise they are left out of the sequence.

    Returns:
        list of lists of indexes, one inner list per input text.
    """
    sequences = []
    for text in texts_ls:
        unigrams = NGramGenerator_wordwise_interval(text, 1, 1)
        if new_words_to_zero:
            ids = [word_index_[tok] if tok in word_index_ else 0 for tok in unigrams]
        else:
            ids = [word_index_[tok] for tok in unigrams if tok in word_index_]
        sequences.append(ids)
    return sequences


def get_model_file_aux(model_file_aux_name):
    """Load and return the pickled object stored at `model_file_aux_name`.

    Unpickling can execute arbitrary code, so only use this on files that
    were produced by a trusted source.
    """
    with open(model_file_aux_name, mode='rb') as fh:
        return pickle.load(fh)

def clone_tokens(text_str, cloning_factor = 1):
    """Repeat every whitespace-separated token `cloning_factor` times in place.

    Example: clone_tokens('a b', 2) -> 'a a b b'.  The result is joined with
    single spaces.
    """
    return ' '.join(tok for tok in text_str.split() for _ in range(cloning_factor))

In [5]:
# Read sampled descriptionary

path = pardir+'/1_data/'
# Alternative input file: 'sampled_descriptionary_sample_size_5000.csv'
file_name = 'scorecards_for_fasttext.csv'
samples_df = pd.read_csv(path + file_name)

# Rename columns (the description header is matched including its BOM and
# literal double quotes, exactly as it is read from the file)
samples_df = samples_df.rename(columns={
    #'description': 'description_mod1',
    '\ufeff"description"': 'description_mod1',
    'category_id': 'category_id_mod1',
    'category_path': 'category_full_path_mod1'})

# Drop rows with NaN in any column.  dropna() returns a new frame, so the
# result has to be assigned back for the rows to actually be removed.
samples_df = samples_df.dropna()

# Process description_mod1 strings by process_string function
samples_df['description_mod1'] = samples_df['description_mod1'].apply(process_string)

# Keep only descriptions that contain at least two tokens
selected_indices = samples_df['description_mod1'].apply(lambda x: len(str(x).split()) > 1)
samples_df = samples_df[selected_indices]

# Drop duplicates:
#   1) collapse identical (description, category) pairs to their first row
#   2) remove every description that still occurs more than once (keep=False
#      drops all of its rows)
samples_df = samples_df.drop_duplicates(subset=['description_mod1','category_full_path_mod1'], keep='first')
samples_df = samples_df.drop_duplicates(subset=['description_mod1'], keep=False)
samples_df = samples_df.reset_index(drop=True)

# Clone tokens
samples_df['description_cloned'] = samples_df['description_mod1'].apply(lambda x: clone_tokens(x, cloning_factor=5))

# Drop 'screwdrivers' from descriptionary
#samples_df = samples_df.loc[samples_df.category_id_mod1 != 927,:]

# Drop index column
#samples_df.drop(labels=['index'], axis=1, inplace=True)

print('samples data shape:',samples_df.shape)
samples_df.head()

samples data shape: (587983, 4)


Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,description_cloned
0,aveeno baby wash shampoo lightly scented 8 ou...,206,Baby Products > Bathing & Skin Care > Soaps & ...,aveeno aveeno aveeno aveeno aveeno baby baby b...
1,earths best chlorine free diapers size 4 120 c...,213,Baby Products > Diapering > Disposable Diapers,earths earths earths earths earths best best b...
2,doa5lr costume catalog lr10 addon content,320,Electronics & Accessories > Video Games > Other,doa5lr doa5lr doa5lr doa5lr doa5lr costume cos...
3,lightweight oxford l in slim and white,152,Apparel & Accessories > Apparel > Tops & Tees ...,lightweight lightweight lightweight lightweigh...
4,now foods nutritional yeast flakes 10ounce,525,Health & Beauty > Vitamins & Dietary Supplements,now now now now now foods foods foods foods fo...


### Tweak imbalanced data (drop classes)

In [6]:
## Check for class count

# Number of rows per category path, most frequent first; the ten largest
# classes are displayed.
stats_sr = samples_df["category_full_path_mod1"].value_counts()
stats_sr[0:10]

Photos & Paper Products                                  50400
Books                                                    33416
Other                                                    29343
Health & Beauty > Makeup                                 24738
Online Services                                          22120
Health & Beauty > Vitamins & Dietary Supplements         20874
Jewelry & Watches > Earrings > Women                     18535
Apparel & Accessories > Apparel > Tops & Tees > Women    13440
Pet Supplies > Dogs & Cats > Dog Food & Treats           11791
Grocery & Gourmet Food > Restaurant & Takeout            10333
Name: category_full_path_mod1, dtype: int64

In [7]:
# Delete certain categories

# Remove every class whose path contains the substring "Other" anywhere
# (not only the class named exactly "Other"), then the "Books" class.
samples_df = samples_df.loc[samples_df['category_full_path_mod1'].str.contains("Other")==False,:]
samples_df = samples_df.loc[samples_df.category_full_path_mod1 != "Books",:]
#samples_df = samples_df.loc[samples_df.category_full_path_mod1 != "Photos & Paper Products",:]

# Optional two-class experiment: class_1 / class_2 are each assigned twice and
# only the last value is kept.  `classes_selected` is only consumed by the
# commented-out filter below, so these lines currently have no effect on
# samples_df.
class_1 = 'Pet Supplies > Dogs & Cats > Dog Food & Treats'
class_1 = 'Photos & Paper Products'
class_2 = 'Apparel & Accessories > Apparel > Tops & Tees > Women'
class_2 = 'Apparel & Accessories > Accessories > Luggage, Backpacks & Laptop Bags'
classes_selected = class_1+'|'+class_2
#samples_df = samples_df.loc[samples_df['category_full_path_mod1'].str.contains(classes_selected)==True,:]
print('Pruned samples data shape:',samples_df.shape)

Pruned samples data shape: (484432, 4)


### Tweak imbalanced data (equalize class counts)

In [None]:
## Map class counts

# Attach to every row the size of its class, looked up in `stats_sr`
# (the per-class value counts computed in the class-count cell).
samples_df['count'] = samples_df['category_full_path_mod1'].map(stats_sr)
samples_df.head()

In [None]:
## samples_df is imbalanced across classes (category_full_path_mod1)
## fix by picking at most N (= 1000) randomly chosen samples from each class

#N=samples_df.category_full_path_mod1.value_counts(normalize=True).iloc[0] * samples_df.shape[0]
# Shuffle all rows first so that head(1000) takes a random subset of each
# class; the shuffle is seeded so the selected subset is reproducible.
samples_df = samples_df.sample(frac=1, random_state=12345).groupby('category_full_path_mod1', sort=False).head(1000)
print('samples data shape after picking max N samples from each class:',samples_df.shape)

### Concat sample_df into train_df

In [8]:
## Concat original train set and sampled descriptionary

# No separate train set is concatenated at the moment: train_df is simply
# another name for samples_df (same object, no copy).
#train_df = pd.concat([train_df, samples_df], axis=0)
train_df = samples_df
#train_df.reset_index(drop=True, inplace=True)

# description into chars
#train_df['description_mod1'] = train_df['description_mod1'].apply(lambda x: process_string(x))

# deduplicate
#train_df.drop_duplicates(subset=['description_mod1'], inplace = True, keep=False)
#print('train data shape (deduplicate):',train_df.shape)

# Encode target feature: fit the label encoder on the category paths and
# store the resulting integer labels in one step
le = LabelEncoder()
train_df['target_le'] = le.fit_transform(train_df['category_full_path_mod1'])


train_df.head(2)

Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,description_cloned,target_le
0,aveeno baby wash shampoo lightly scented 8 ou...,206,Baby Products > Bathing & Skin Care > Soaps & ...,aveeno aveeno aveeno aveeno aveeno baby baby b...,169
1,earths best chlorine free diapers size 4 120 c...,213,Baby Products > Diapering > Disposable Diapers,earths earths earths earths earths best best b...,175


In [None]:
# Largest whitespace-token count over all descriptions, for the plain and the
# token-cloned text columns respectively.
string_w_max_tokens_1 = train_df['description_mod1'].map(lambda x: len(str(x).split())).max()
string_w_max_tokens_2 = train_df['description_cloned'].map(lambda x: len(str(x).split())).max()
print('string_w_max_tokens_1:',string_w_max_tokens_1)
print('string_w_max_tokens_2:',string_w_max_tokens_2)

In [None]:
# test check
# NOTE: `train_texts_vec_mtx` and `word_index` are only created in the
# "Create input matrix for normal LSTM" cell further down, so this cell fails
# on a fresh kernel unless that cell has been run first.
i = 100
print(train_texts_vec_mtx[i])
len(word_index)

### Create input matrix for normal LSTM

In [13]:
# Create input matrix for normal LSTM

# Build the text and label arrays from the full train_df
# (the train/test split below is commented out, so no split is performed)
X_ls = np.array(list(train_df['description_mod1']))
#X_ls = np.array(list(train_df['description_cloned']))
y_ary = np.array(list(train_df['target_le']))
# one-hot encoded labels, used as the training target by the classifier cells
y_ary_cat = np_utils.to_categorical(train_df['target_le'])

#X_train_ls, X_test_ls, y_train_ary, y_test_ary = train_test_split(X_ls, y_ary, test_size = 0.3)
#X_train_ls, X_test_ls, y_train_ary, y_test_ary = train_test_split(X_ls, y_ary_cat, test_size = 0.3)
print(len(X_ls))
print(y_ary_cat.shape)


# top_words = None is passed to the tokenizer as its vocabulary limit
top_words = None
max_description_length = 30 #string_w_max_tokens_1

# The Keras tokenizer is only used to build the word index ...
tok = Tokenizer(nb_words = top_words)
tok.fit_on_texts(X_ls)
word_index = tok.word_index
print('word_index size:',len(word_index))

# ... the sequences themselves are produced by the custom encoder, which
# tokenises via process_string and (new_words_to_zero = False) leaves out
# tokens that are missing from word_index.
# NOTE(review): the tokenizer and process_string may split text differently;
# confirm that no wanted tokens are silently dropped here.
#train_texts_vec_ls = tok.texts_to_sequences(X_train_ls)
train_texts_vec_ls = texts_to_sequences_custom(X_ls, word_index, new_words_to_zero = False)
train_texts_vec_mtx = sequence.pad_sequences(train_texts_vec_ls, maxlen = max_description_length)

print('train_texts_vec_mtx shape:',train_texts_vec_mtx.shape)
# not the last expression of the cell, so this value is not displayed
list(word_index)[0:5]

# Delete objects (drop references to the large intermediates)
X_ls = None
y_ary = None
tok = None
train_texts_vec_ls = None

484432
(484432, 861)
word_index size: 141483
train_texts_vec_mtx shape: (484432, 30)


### Create random embeddings

In [14]:
# Create RANDOM embedding vectors for each word in word index (lower cell code preferable)
# fix random seed for reproducibility
np.random.seed(12345) #8
# label printed by the model cells to show which embeddings are in use
embeddings_source = 'random'

embedding_vecor_length = 32
uniq_token_count = len(word_index)
print('word index size:', uniq_token_count)

is_random_embeddings = True
# One row per word index plus row 0, which stays all-zero
# (word_index values are used directly as row numbers below).
embedding_matrix = np.zeros((uniq_token_count + 1, embedding_vecor_length))
if(is_random_embeddings == True):
    # independent uniform vector in [-0.5, 0.5) for every word
    for word, i in word_index.items():
        #embedding_vector = np.random.uniform(.1, size=(1, embedding_vecor_length))
        embedding_vector = np.random.uniform(-0.5, 0.5, embedding_vecor_length)
        embedding_matrix[i] = embedding_vector
else:
    # Alternative scheme (not taken while is_random_embeddings is True): each
    # word draws from its own interval [c, c+100), with c advancing by 6000
    # per word; the whole matrix is then divided by a power of ten that has
    # one more digit than the final c.
    c = 0
    for word, i in word_index.items():
        embedding_vector = np.random.uniform(c, c+100, size=(1, embedding_vecor_length))
        embedding_matrix[i] = embedding_vector
        c=c+6000
    scaler = int('1'+'0'*len(str(c)))
    embedding_matrix=embedding_matrix/scaler

        
print('embedding matrix shape:',embedding_matrix.shape)
print(embedding_matrix[0:2])


word index size: 141483
embedding matrix shape: (141484, 32)
[[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.49490242  0.23302929 -0.38785738 -0.39527143  0.44383371 -0.24519056
   0.25464731 -0.03190993  0.32747988 -0.19694983  0.10502732  0.29460503
  -0.08434479 -0.1714146   0.00165524 -0.24145589 -0.1944489   0.144096
   0.21336866  0.11472475 -0.4821466  -0.43704531 -0.31667933  0.00870557
   0.26015829  0.30997441 -0.27700669  0.44542248 -0.32381271 -0.34026166
  -0.44787171  0.30137245]]


### Create pre-trained embeddings

In [None]:
## Create word embeddings from trained Word2Vec model
from gensim.models import word2vec, Phrases
embeddings_source = 'word2vec'

# Load model
file_path_1_1 = pardir+"/3_model_word2vec_vec64_win1__dict_sample_5000/word2vect_class_specific_unigrams__vec64_win1__dict_sample_5000"
file_path_1_2 = pardir+"/3_model_word2vec_vec64_win1__dict_sample_5000/word2vect_class_specific_unigrams__vec128_win1__dict_sample_5000"
file_path_1_3 = pardir+"/3_model_word2vec_vec64_win1__dict_sample_5000/word2vect_vec_64_win30__dict_sample_5000"
file_path_1_4 = pardir+"/3_model_word2vec_vec64_win1__dict_sample_5000/word2vect_unigrams_interrelations__vec64_win1__dict_sample_5000"
file_path_1_5 = '/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_vec64_win1_sample10_iter100__dict_sample_5000'
file_path_1_6 = '/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_class_specififc_unigrams__vec64_win1_sample0_iter1000__dict_sample_5000'
file_path_1_7 = '/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_class_specififc_unigrams__vec64_win1_sample0_iter100__dict_sample_5000'
file_path_1_8 = '/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_class_specififc_unigrams__vec64_win1_sample10_iter100__dict_sample_5000'

file_path_1_9 ='/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/word2vect_class_specififc_unigrams__vec64_win1_sample0.001_iter1000__dict_sample_5000'
file_path_1_10 ='/Users/altay.amanbay/Desktop/word2vec_new_trial/3_word2vec_modeling/'
#model = word2vec.Word2Vec.load(file_path_1_4)

file_path_2_1 = pardir+"/3_model_fasttext/fasttext__vec64_win5__dict_sampled_5000.vec"
file_path_2_2 = pardir+"/3_model_fasttext/fasttext__vec64_win1__dict_sampled_5000.vec"
file_path_2_3 = pardir+"/3_model_fasttext/fasttext__vec128_win1__dict_sampled_5000.vec"
file_path_2_4 = pardir+"/3_model_fasttext/fasttext__vec400_win2__scorecards.vec"
model = word2vec.Word2Vec.load_word2vec_format(file_path_2_4) # for fasttext model

#print(model.vocab.keys())
#sys.exit()

# word vector embeddings from model into dictionary
word2vec_dict={}
for word in model.vocab.keys():
    try:
        word2vec_dict[word]=model[word]
    except KeyError:
        # skip only words the model cannot resolve; any other error should surface
        pass
print('Loaded %s word vectors.' % len(word2vec_dict))

# Take the vector length from a stored vector instead of relying on the loop
# variable `word` still being bound after the loop.
if not word2vec_dict:
    raise ValueError('No word vectors were loaded from the word2vec model.')
embedding_vecor_length = len(next(iter(word2vec_dict.values())))
print('embedding_vecor_length:',embedding_vecor_length)


# One row per word index plus row 0; rows of words without a pre-trained
# vector stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_vecor_length))
for word, i in word_index.items():
    embedding_vector = word2vec_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print('\nembedding matrix shape:',embedding_matrix.shape)
print(embedding_matrix[0]) # first cell should be all zeros
print(embedding_matrix[1])

In [16]:
# Best model result holder: everything needed next to the trained model
# (padding length, best score so far, text encoder, word index, label encoder)
best_model_aux = {
    'Max length': max_description_length,
    'Best score': 0,
    'texts_to_sequences': texts_to_sequences_custom,
    'word_index': word_index,
    'Label encoder': le,
}


best_model = None

In [None]:
# Optional
# Load previous model (if needs to be compared in the following training)
#best_model = load_model('category_927_nets_1000_model.h5')
#best_model_aux = get_model_file_aux('category_927_nets_1000_model_aux.pkl')

In [None]:
# prediction nodes count
# NOTE: despite its name, nb_classes holds the array of distinct class names;
# the number of classes is len(nb_classes), which is what the model cells use.
nb_classes = train_df['category_full_path_mod1'].unique()
print('Classes count:', len(nb_classes))

# keep the length of original (e.g. unboosted) train set
# (currently the full number of rows of train_texts_vec_mtx)
orig_dim = train_texts_vec_mtx.shape[0]

# Drop the references to the data frames; any cell that reads train_df or
# samples_df must be run before this one.
train_df = None
samples_df = None

In [None]:
model = None

### Experimental Model 0.1 (search for significant words using softmax)

In [None]:
# TRAIN MODEL 1
# Experimental model 0.1: every word is embedded into a single scalar, the
# per-position scalars are flattened and fed straight into a softmax layer.
start = time.time()

# define/initialize model
# vocabulary size for the embedding: all word indexes plus index 0
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
# Trainable embedding initialised with glorot_uniform; the precomputed
# embedding_matrix is not used here (weights argument commented out).
print('Embeddings source:',embeddings_source)
embedding_layer = Embedding(top_words
                            ,1  # embedding vecor length
                            #,weights=[embedding_matrix]
                            ,input_length = max_description_length
                            ,init='glorot_uniform'
                            ,trainable=True)
model.add(embedding_layer)
model.add(Flatten())

## OUTPUT: classes layer
## ======================================================================================
# one softmax output node per distinct class
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1, scoring the model after each epoch.
# `best_model = model` only stores a reference to the same, still-training
# object, so the weights of the best epoch are snapshotted separately and put
# back into the model once the loop has finished.
best_weights = None
for ep in range(10):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)

    # NOTE: the score is computed on the first orig_dim rows of the training
    # matrix itself, not on a held-out set.
    scores = model.evaluate(train_texts_vec_mtx[0:orig_dim], y_ary_cat[0:orig_dim], verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        best_weights = model.get_weights()
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',best_model_aux['Best score'])
    #    break
    print()

if best_weights is not None:
    # roll the model (and therefore best_model) back to its best-scoring epoch
    model.set_weights(best_weights)

print("Training took %g s" % (time.time() - start))

In [None]:
# Encoder model (gives assigned embeddings for words from above model)
# Reuses the `embedding_layer` object itself, so it reads the weights of
# whichever model cell last assigned `embedding_layer`.
model_enc = Sequential()
model_enc.add(embedding_layer)
model_enc.add(Flatten())

In [None]:
j = 33                                      # row number of description from train set
t = train_texts_vec_mtx[j]                  # get description from train set (word indexes)
t_batch = t.reshape(1, -1)                  # batch of one row; no hard-coded padded length
print(t_batch)                              # print it
print(model_enc.predict(t_batch))           # encode the description to get embedding vectors


p = model.predict_classes(t_batch, verbose = False)          # get the class number of the description
print(le.inverse_transform(p)[0])                            # convert the class number into class name

# convert word indexes from description into words
# (build the reverse lookup once instead of scanning word_index for every token)
index_word = {idx: word for word, idx in word_index.items()}
for i in t:
    if(i != 0):
        print(index_word[i], end=' ')

In [25]:
# Load the MNIST images (labels are discarded).
# NOTE(review): x_train / x_test are not referenced by any later cell shown in
# this notebook — looks like a leftover experiment; confirm before removing.
from keras.datasets import mnist
import numpy as np
(x_train, _), (x_test, _) = mnist.load_data()
print(x_train.shape)
print(x_test.shape)

(60000, 28, 28)
(10000, 28, 28)


In [26]:
# Convert to float32, divide by 255 and flatten every image into one vector
x_train = (x_train.astype('float32') / 255.).reshape(len(x_train), -1)
x_test = (x_test.astype('float32') / 255.).reshape(len(x_test), -1)
print(x_train.shape)
print(x_test.shape)

(60000, 784)
(10000, 784)


### Experimental 0.4 (encoder for clustering)

In [46]:
# CREATE INPUT MATRIX FOR AUTOENCODER
# A model that consists of the frozen embedding layer only is used to turn the
# padded index matrix into a tensor of embedding vectors, one per position.
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
# weights come from `embedding_matrix` and are not trainable, so predict()
# below is a plain lookup of the precomputed vectors
print('Embeddings source:',embeddings_source)
embedding_layer = Embedding(top_words
                            ,embedding_vecor_length
                            ,weights=[embedding_matrix]
                            ,input_length = max_description_length
                            #,init='glorot_uniform'
                            ,trainable=False)
model.add(embedding_layer)
# the model is never trained; optimizer and loss are only given to compile()
model.compile('rmsprop', 'mse')
# NOTE: this materialises the embeddings of every training row in memory
input_array = model.predict(train_texts_vec_mtx)

Embeddings source: random


In [59]:
# TRAIN AUTOENCODER
# Sequence autoencoder: an LSTM compresses the embedded description into a
# latent vector, which is repeated for every time step and decoded by a second
# LSTM back into a sequence of embedding-sized vectors.
from keras.models import Model

start = time.time()

# define/initialize model
batch_size_ = 64   # 64

inputs = Input(shape=(max_description_length, embedding_vecor_length))
latent_dim = 10
encoded = LSTM(latent_dim)(inputs)
#td_1 = TimeDistributed(Dense(32))(inputs)
#encoded = Lambda(max_, output_shape=(embedding_vecor_length,))(td_1)

decoded = RepeatVector(max_description_length)(encoded)
decoded = LSTM(embedding_vecor_length, return_sequences=True)(decoded)

# Only the full autoencoder is built; the encoder-only model (which would
# output the latent vectors) is commented out.
#model_encoder = Model(inputs, encoded)
model_autoencoder = Model(inputs, decoded)


## OUTPUT: classes layer
## ======================================================================================
#model.add(Dense(len(nb_classes), activation='softmax'))
# NOTE(review): binary_crossentropy is applied to embedding values as targets;
# the random embeddings created in this notebook lie in [-0.5, 0.5) — confirm
# that this loss (rather than e.g. 'mse') and the accuracy metric are intended.
model_autoencoder.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model_autoencoder.summary())

# start training multiple times with epoch=1, scoring the model after each epoch
# and stopping at the first epoch that does not improve the best score.
# `best_model = model_autoencoder` only stores a reference to the same,
# still-training object, so the weights of the best epoch are snapshotted
# separately and restored after the loop (the epoch that triggers the break
# has already changed the weights).
best_weights = None
for ep in range(10):
    print('Epoch iter #' + str(ep+1))
    model_autoencoder.fit(input_array, input_array, nb_epoch=1, batch_size=batch_size_)

    # NOTE: scored on the training data itself
    scores = model_autoencoder.evaluate(input_array, input_array, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model_autoencoder
        best_weights = model_autoencoder.get_weights()
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',best_model_aux['Best score'])
        break
    print()

if best_weights is not None:
    # roll the autoencoder (and therefore best_model) back to its best epoch
    model_autoencoder.set_weights(best_weights)

print("Training took %g s" % (time.time() - start))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_19 (InputLayer)            (None, 30, 32)        0                                            
____________________________________________________________________________________________________
timedistributed_8 (TimeDistribut (None, 30, 32)        1056        input_19[0][0]                   
____________________________________________________________________________________________________
lambda_6 (Lambda)                (None, 32)            0           timedistributed_8[0][0]          
____________________________________________________________________________________________________
repeatvector_14 (RepeatVector)   (None, 30, 32)        0           lambda_6[0][0]                   
___________________________________________________________________________________________

### Cluster encodings

In [77]:
# NOTE(review): best_model is whichever model last won the best-score
# comparison.  If that is the full autoencoder, predict() returns the decoded
# sequences rather than the latent vectors — confirm this is what should be
# clustered.
encodings_mtx = best_model.predict(input_array)
# Flatten everything after the sample axis into one feature vector per row,
# without hard-coding the number of samples or the sequence/feature sizes.
encodings_mtx_resh = encodings_mtx.reshape(encodings_mtx.shape[0], -1)

In [105]:
# Cluster the flattened model outputs with k-means
# (850 clusters, at most 10 iterations per run, all CPU cores, fixed seed).
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=850, random_state=0, max_iter=10, n_jobs=-1)

start = time.time()
k_means.fit(encodings_mtx_resh)
#np.array([[[1,1,1],[2,2,2]],[[1,1,1],[2,2,2]]]).reshape(2,6)
print("Training took %g s" % (time.time() - start))


Training took 3861.63 s


In [88]:
print(encodings_mtx[0])
#print(input_array[0][-1])

[[ 0.00678394  0.00799087 -0.02210714 -0.01669397  0.00652681  0.01796212
   0.02027131 -0.02707245  0.02296207 -0.06715409  0.00207059 -0.03023864
  -0.02352969 -0.08128262 -0.01729795 -0.09831153 -0.04109302 -0.00228851
   0.03313054 -0.04783145 -0.08433102  0.00091587 -0.03782459  0.020688
  -0.04172391 -0.04488586  0.0157574  -0.01861063 -0.02058901  0.01759877
  -0.01189658  0.01106455]
 [ 0.01125409  0.00853695 -0.02553343 -0.01259037  0.00618082  0.02462783
   0.02399347 -0.04864526  0.0336272  -0.10120874  0.00530424 -0.03515679
  -0.02759841 -0.12664749 -0.02308232 -0.17102937 -0.09735309 -0.00255409
   0.04413624 -0.05858684 -0.14689499 -0.00384735 -0.04937905  0.02309752
  -0.05952082 -0.05680497  0.02181251 -0.02119935 -0.06000166  0.01550376
  -0.01037072  0.01127833]
 [ 0.01302237  0.00808915 -0.02376049 -0.00806668  0.00530856  0.02442732
   0.02235194 -0.05994024  0.03609109 -0.11077517  0.00583272 -0.03300028
  -0.02487346 -0.14628698 -0.02412327 -0.2203858  -0.1491804

### Experimental Model 0.2

In [54]:
def max_(x):
    """Return the backend maximum of `x` taken along axis 1."""
    pooled = K.max(x, axis=1)
    return pooled

In [None]:
# TRAIN MODEL 1
# Experimental model 0.2: trainable embedding -> per-time-step Dense ->
# max over the time axis -> softmax.
start = time.time()

# define/initialize model
# vocabulary size for the embedding: all word indexes plus index 0
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
print('Embeddings source:',embeddings_source)
embedding_layer = Embedding(top_words
                            ,embedding_vecor_length
                            #,weights=[embedding_matrix]
                            ,input_length = max_description_length
                            ,init='glorot_uniform'
                            ,trainable=True)
model.add(embedding_layer)

## Time Distributed
## ======================================================================================
td_units = 32   # width of the per-time-step Dense layer
TimeDist_1 = TimeDistributed(Dense(td_units))  #, input_shape=(max_description_length, embedding_vecor_length)
model.add(TimeDist_1)
# max_ reduces over axis 1 (time), so the Lambda output has td_units features;
# its declared shape must follow the Dense width, not the embedding length.
# No Flatten is needed afterwards: the tensor is already 2-D (batch, td_units).
model.add(Lambda(max_, output_shape=(td_units,)))

## OUTPUT: classes layer
## ======================================================================================
# one softmax output node per distinct class
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1, scoring the model after each epoch.
# `best_model = model` only stores a reference to the same, still-training
# object, so the weights of the best epoch are snapshotted separately and put
# back into the model once the loop has finished.
best_weights = None
for ep in range(10):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)

    # NOTE: the score is computed on the first orig_dim rows of the training
    # matrix itself, not on a held-out set.
    scores = model.evaluate(train_texts_vec_mtx[0:orig_dim], y_ary_cat[0:orig_dim], verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        best_weights = model.get_weights()
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',best_model_aux['Best score'])
    #    break
    print()

if best_weights is not None:
    # roll the model (and therefore best_model) back to its best-scoring epoch
    model.set_weights(best_weights)

print("Training took %g s" % (time.time() - start))

### Experimental model 0.3

In [None]:
# TRAIN MODEL 1
# Experimental model 0.3: scalar trainable embedding -> Conv1D -> max pooling
# -> LSTM -> softmax.
start = time.time()

# define/initialize model
# vocabulary size for the embedding: all word indexes plus index 0
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
print('Embeddings source:',embeddings_source)
embedding_layer = Embedding(top_words
                            ,1 #embedding_vecor_length
                            #,weights=[embedding_matrix]
                            ,input_length = max_description_length
                            ,init='glorot_uniform'
                            ,trainable=True)
model.add(embedding_layer)
#model.add(BatchNormalization()) #axis=1

## LSTM 1
## ======================================================================================
# LSTM_1 is created but never added to the model (the add call is commented out)
LSTM_1 = LSTM(32, return_sequences=True, activation='linear')
#model.add(LSTM_1)
#LSTM_1 = SimpleRNN(128,return_sequences=False)
#model.add(LSTM(8,return_sequences=False, activation='linear'))
#model.add(LSTM(64,return_sequences=False, activation='linear'))
#LSTM_1 = LSTM(128,return_sequences=False, dropout_W = 0.3, dropout_U = 0.3)


## Gaussian Noise (optional)
## ======================================================================================
# noise_layer is likewise created but not added to the model
percent_noise = 0.1
noise = 1.0 * percent_noise
noise_layer = GaussianNoise(noise)
#model.add(noise_layer)

# Conv 1
nb_filter_1 = 64
conv1D_1 = Conv1D(nb_filter=nb_filter_1   # feature maps
                  ,filter_length=3        # kernel size
                  ,subsample_length=1     # strides
                  ,border_mode='valid'    # padding: same, valid
                  ,activation='linear'
                 )
model.add(conv1D_1)
model.add(MaxPooling1D(pool_length = 3, stride = 1))
# the LSTM returns only its last output, which feeds the softmax layer
model.add(LSTM(64,return_sequences=False, activation='linear'))

## OUTPUT: classes layer
## ======================================================================================
# one softmax output node per distinct class
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1, scoring the model after each epoch.
# `best_model = model` only stores a reference to the same, still-training
# object, so the weights of the best epoch are snapshotted separately and put
# back into the model once the loop has finished.
best_weights = None
for ep in range(10):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)

    # NOTE: the score is computed on the first orig_dim rows of the training
    # matrix itself, not on a held-out set.
    scores = model.evaluate(train_texts_vec_mtx[0:orig_dim], y_ary_cat[0:orig_dim], verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        best_weights = model.get_weights()
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    else:
        print('Model not improved')
        print('Valid accuracy: ',best_model_aux['Best score'])
    #    break
    print()

if best_weights is not None:
    # roll the model (and therefore best_model) back to its best-scoring epoch
    model.set_weights(best_weights)

print("Training took %g s" % (time.time() - start))

In [None]:
# FUNCTION: Attention Layer

from keras import backend as K
from keras.engine.topology import Layer
from keras import initializations

class AttLayer(Layer):
    """Attention-style pooling layer over axis 1 of a 3-D input.

    A single learned vector W (one weight per feature of the last axis)
    scores every position; the scores are normalised over axis 1 and used to
    form a weighted sum of the input along that axis.

    NOTE: `call` uses `dimshuffle` and `.sum` directly on the tensors, which
    are Theano tensor methods, so the layer is tied to the Theano backend.
    """

    def __init__(self, **kwargs):
        # initializer used for W in build()
        self.init = initializations.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        super(AttLayer, self).__init__(** kwargs)

    def build(self, input_shape):
        # only 3-D inputs are supported
        # (assumes (batch, timesteps, features) — TODO confirm against callers)
        assert len(input_shape)==3
        #self.W = self.init((input_shape[-1],1))
        # one weight per feature of the last input axis
        self.W = self.init((input_shape[-1],))
        #self.input_spec = [InputSpec(shape=input_shape)]
        self.trainable_weights = [self.W]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        # `mask` is accepted but not used.
        # score per position: tanh of the dot product with W
        eij = K.tanh(K.dot(x, self.W))

        # normalise the exponentiated scores over axis 1 (softmax-like)
        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')

        # broadcast the weights over the feature axis and sum over axis 1
        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def get_output_shape_for(self, input_shape):
        # axis 1 is summed out: (first axis, last axis) of the input shape remain
        return (input_shape[0], input_shape[-1])

In [None]:
# Plot Nets design
# Writes a PNG of the current `model` graph and renders it inline as SVG.
# NOTE: the output path is an absolute, user-specific location; adjust it
# before running on another machine.
#from keras.utils import plot_model
import matplotlib.pyplot as plt
from keras.utils.visualize_util import model_to_dot

plot(model, to_file='/Users/altay.amanbay/Desktop/model.png')
SVG(model_to_dot(model).create(prog='dot', format='svg'))
