In [6]:
# Keras 
# Classifier: LSTM 
# Classification type: multi-class (404 classes)
# Output nodes: #of classes with softmax

# word2vec model: 
# word2vect_class_specififc__vec64_win1__dict_sample_5000

In [7]:
from __future__ import print_function
import numpy as np
import pandas as pd
import pickle
import time

from keras.datasets import reuters
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers.wrappers import TimeDistributed
from keras.models import load_model, Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Flatten
from keras.layers.recurrent import SimpleRNN
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPooling1D
from keras.layers.noise import GaussianNoise
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer, one_hot, text_to_word_sequence
from keras.preprocessing import sequence
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializations
from keras import regularizers
from keras import constraints
from keras.layers.normalization import BatchNormalization


from keras.utils.visualize_util import plot
from IPython.display import SVG
from keras.utils.visualize_util import model_to_dot

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os    
os.environ['THEANO_FLAGS'] = "device=gpu0" 
import theano
#theano.config.device = 'gpu0'
theano.config.floatX = 'float32'


# import custom code
import os
import sys
pardir = os.path.abspath(os.path.join(os.getcwd(), '../'))
script_path = pardir + "/2_common_aux_script"
print('Importing process_string.py \nfrom ' + script_path + " ...\n")
sys.path.append(script_path)
from process_string import process_string
sys.path.remove(script_path)


print(sys.version)

Importing process_string.py 
from /Users/altay.amanbay/Desktop/new node booster/experiments/3a.1 - Nets train/5 train model - keras (unigrams)/2_common_aux_script ...

3.5.2 |Anaconda 4.2.0 (x86_64)| (default, Jul  2 2016, 17:52:12) 
[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]


In [8]:
# Functions

def NGramGenerator_wordwise_interval(phrase, min_ngram, max_ngram):
    all_ngram_lists = []

    #printable_ = 'abcdefghijklmnopqrstuvwxyz0123456789 '
    #s_split = "".join((char if char in printable_ else "") for char in phrase).split()
    phrase_processed = process_string(phrase)
    s_split = phrase_processed.split()
    
    for n in range(max_ngram, min_ngram - 1, -1):
        n_gram = [s_split[i:i+n] for i in range(len(s_split)-n+1)]
        all_ngram_lists.extend(n_gram)
        
    all_ngrams = []
    for n_gram in all_ngram_lists:
        all_ngrams.extend([' '.join(n_gram)])
    
    return all_ngrams

def get_word2index(texts_ls_):
    word2index_ = {}

    c = 1
    for text_str in texts_ls_:
        text_tokens_ls = text_str.lower().split()
        for token in text_tokens_ls:
            if(token not in word2index_):
                word2index_[token] = c
                c = c + 1
                
    return word2index_

def train_df_preprocess(top_words_, texts_ls_, max_pad_length_):
    # texts_ls_: list of texts strings
    
    tok = Tokenizer(top_words_)
    tok.fit_on_texts(texts_ls_)

    words = []
    for iter in range(top_words):
        words += [key for key,value in tok.word_index.items() if value==iter+1]

    #Class for vectorizing texts, or/and turning texts into sequences 
    #(=list of word indexes, where the word of rank i in the dataset (starting at 1) has index i).
    texts_vec_ls = tok.texts_to_sequences(texts_ls_)#turns text to sequence, stating which word comes in what place
    texts_vec_mtx = sequence.pad_sequences(texts_vec_ls, maxlen=max_pad_length_)#pad sequence, essentially padding it with 0's at the end
    
    return texts_vec_mtx

def text_2_vec(text_str, word2index_):
    # text_str: text string
    
    text_tokens_ls = text_str.lower().split()
    
    text_vec = []
    for token in text_tokens_ls:
        if token in word2index_:
            text_vec.append(word2index_[token])
        else:
            text_vec.append(0)
            
    return text_vec

def train_df_preprocess_2(texts_ls_, word2index_, max_pad_length_):
    # texts_ls_: list of texts strings
    
    texts_vec_ls = []
    for text_ in texts_ls_:
        #print(text_)
        #print(type(text_))
        text_vec = text_2_vec(text_, word2index_)
        texts_vec_ls.append(text_vec)
    
    texts_vec_ary = sequence.pad_sequences(texts_vec_ls, maxlen=max_pad_length_)
    
    return texts_vec_ary

def texts_to_sequences_custom(texts_ls, word_index_, new_words_to_zero = False):
    texts_seq = []
    
    for text in texts_ls:
        #text_split = text.lower().split()
        text_split = NGramGenerator_wordwise_interval(text,1,1)
        seq = []
        for token in text_split:
            if(token in word_index_):
                seq.append(word_index_[token])
            elif(new_words_to_zero):
                seq.append(0)
                
        texts_seq.append(seq)
#         for k,v in word_index_.items():
#             if(v == 395):
#                 print(k,v)
    return texts_seq

def text_chars_to_sequences_custom(text_chars_ls, char_index_):
    texts_seq = []
    
    for text in text_chars_ls:
        text_split = text.lower().split()
        seq = []
        for token in text_split:
            if(token in char_index_):
                seq.append(char_index_[token])
            else:
                seq.append(0)
                
        texts_seq.append(seq)
#         for k,v in word_index_.items():
#             if(v == 395):
#                 print(k,v)
    return texts_seq

def get_model_file_aux(model_file_aux_name):
    with open(model_file_aux_name, 'rb') as pickle_file:
        model_file_aux = pickle.load(pickle_file)
    return model_file_aux

def modif_1(phrase):
    #phrase_mod =  " ".join((char if char.isalnum() else "") for char in phrase)  # checks for alnum chars
    phrase_mod =  " ".join(char for char in phrase)
    phrase_mod = ' '.join(phrase_mod.split())
    return phrase_mod

In [9]:
# Read sampled descriptionary

path = pardir+'/1_data/'
file_name = 'sampled_descriptionary_sample_size_5000.csv'
file_name = 'scorecards_for_fasttext.csv'
train_df = pd.read_csv(path + file_name)

# Rename columns
train_df.rename(columns={
                           #'description': 'description_mod1', 
                           '\ufeff"description"': 'description_mod1',
                           'category_id': 'category_id_mod1',
                           'category_path': 'category_full_path_mod1'}, inplace=True)

# Drop rows with NaN in any column
train_df.dropna()

# Process description_mod1 strings by process_string function
train_df['description_mod1'] = train_df['description_mod1'].apply(lambda x: process_string(x))

# Drop rows where token count less than 1 in description_mod1 column
selected_indices = train_df['description_mod1'].apply(lambda x: len(str(x).split()) > 1)
train_df = train_df[selected_indices]

# Drop duplicates
train_df.drop_duplicates(subset=['description_mod1','category_full_path_mod1'], inplace = True, keep='first')
train_df.drop_duplicates(subset=['description_mod1'], inplace = True, keep=False)
train_df.reset_index(drop=True, inplace=True) 

# Description into chars
train_df['description_mod1_chars'] = train_df['description_mod1'].apply(lambda x: modif_1(x))

# Drop 'screwdrivers' from descriptionary
#samples_df = samples_df.loc[samples_df.category_id_mod1 != 927,:]

# Drop index column
#samples_df.drop(labels=['index'], axis=1, inplace=True)

print('samples data shape:',train_df.shape)
train_df.head()

samples data shape: (587983, 4)


Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,description_mod1_chars
0,aveeno baby wash shampoo lightly scented 8 ou...,206,Baby Products > Bathing & Skin Care > Soaps & ...,a v e e n o b a b y w a s h s h a m p o o l i ...
1,earths best chlorine free diapers size 4 120 c...,213,Baby Products > Diapering > Disposable Diapers,e a r t h s b e s t c h l o r i n e f r e e d ...
2,doa5lr costume catalog lr10 addon content,320,Electronics & Accessories > Video Games > Other,d o a 5 l r c o s t u m e c a t a l o g l r 1 ...
3,lightweight oxford l in slim and white,152,Apparel & Accessories > Apparel > Tops & Tees ...,l i g h t w e i g h t o x f o r d l i n s l i ...
4,now foods nutritional yeast flakes 10ounce,525,Health & Beauty > Vitamins & Dietary Supplements,n o w f o o d s n u t r i t i o n a l y e a s ...


In [10]:
# Encode target feature
le = LabelEncoder()
le.fit(train_df['category_full_path_mod1'])
train_df['target_le'] = le.transform(train_df['category_full_path_mod1'])

train_df.head(2)

Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,description_mod1_chars,target_le
0,aveeno baby wash shampoo lightly scented 8 ou...,206,Baby Products > Bathing & Skin Care > Soaps & ...,a v e e n o b a b y w a s h s h a m p o o l i ...,206
1,earths best chlorine free diapers size 4 120 c...,213,Baby Products > Diapering > Disposable Diapers,e a r t h s b e s t c h l o r i n e f r e e d ...,212


In [11]:
# Split into train and test
#X = train_df.loc[:,['description_mod1']]
X_ls = np.array(list(train_df['description_mod1_chars']))
y_ary = np.array(list(train_df['target_le']))
y_ary_cat = np_utils.to_categorical(train_df['target_le'])
# X_ls = train_df[['description_mod1']]
# y_ary = train_df[['target_le']]

#print(type(X_ls))
#print(type(y_ary))
#print(type(y_ary_cat))

#X_train_ls, X_test_ls, y_train_ary, y_test_ary = train_test_split(X_ls, y_ary, test_size = 0.3)
#X_train_ls, X_test_ls, y_train_ary, y_test_ary = train_test_split(X_ls, y_ary_cat, test_size = 0.3)

print(len(X_ls))
print(y_ary_cat.shape)

587983
(587983, 1021)


In [12]:
# Convert train set into sequences for nets

max_description_length = 100

ls = '0123456789abcdefghijklmnopqrstuvwxyz'
word_index = {}
for i, c in enumerate(ls):
    word_index[c] = i+1 #ord(c)
print('word_index size:',len(word_index))

# train_texts_vec_ls = tok.texts_to_sequences(X_train_ls)
# train_texts_vec_ls = texts_to_sequences_custom(X_train_ls, word_index)
train_texts_vec_ls = text_chars_to_sequences_custom(X_ls, word_index)
train_texts_vec_mtx = sequence.pad_sequences(train_texts_vec_ls, maxlen = max_description_length)

print('train_texts_vec_mtx shape:',train_texts_vec_mtx.shape)

# Delete objects
X_ls = None
y_ary = None
tok = None
train_texts_vec_ls = None

word_index size: 36
train_texts_vec_mtx shape: (587983, 100)


In [13]:
# test check
i = 100
print(train_texts_vec_mtx[i])
len(word_index)

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0 25 34 16 25 28 14 12 31 30 30 25 24 14 25 33 24 22 19 24 18
 15 11 30 18 15 28 13 18 11 28 13 25 11 22 11 24 14 29 30 11 24 14 11 28 14]


36

In [14]:
## Bag-of-chars embedding

# fix random seed for reproducibility
np.random.seed(7)

one_hot = {}
for i, l in enumerate(ls):
    bits = list(np.zeros(len(ls)))
    bits[i] = 1
    one_hot[l] = bits

embedding_vecor_length = len(word_index)
uniq_token_count = len(word_index)
print('word index size:', uniq_token_count)

embedding_matrix = np.zeros((uniq_token_count + 1, embedding_vecor_length))
for word, i in word_index.items():
    embedding_vector = one_hot[word]
    #embedding_vector = np.random.uniform(-0.5, 0.5, embedding_vecor_length)     # different option
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('embedding matrix shape:',embedding_matrix.shape)
print(embedding_matrix[:3])

word index size: 36
embedding matrix shape: (37, 36)
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]


In [59]:
## Random number embedding

# fix random seed for reproducibility
np.random.seed(7)

embedding_vecor_length = 64
uniq_token_count = len(word_index)
print('word index size:', uniq_token_count)

embedding_matrix = np.zeros((uniq_token_count + 1, embedding_vecor_length))
for word, i in word_index.items():
    embedding_vector = np.random.uniform(.1, size=(1, embedding_vecor_length))
    #embedding_vector = np.random.uniform(-0.5, 0.5, embedding_vecor_length)     # different option
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('embedding matrix shape:',embedding_matrix.shape)
print(embedding_matrix[:3])

word index size: 36
embedding matrix shape: (37, 64)
[[ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.        ]
 [ 0.30230748  0.32373194  0.25703765  0.47064589  0.96386909  0.45156333
   0.9333742   0.26312559  0.315438    0.55624092  0.2359669   0.20198507
   0.70300764  0.40815342  0.72889207  0.30452215  0.36982918  0.90355525
   0.621

In [16]:
# Best model result holder
best_model_aux = {}
best_model_aux['Max length'] = max_description_length
best_model_aux['Best score'] = 0
best_model_aux['texts_to_sequences'] = texts_to_sequences_custom
best_model_aux['word_index'] = word_index
best_model_aux['Label encoder'] = le


best_model = None

In [14]:
# Optional
# Load previous model (if needs to be compared in the following training)
#best_model = load_model('category_927_nets_1000_model.h5')
#best_model_aux = get_model_file_aux('category_927_nets_1000_model_aux.pkl')

In [17]:
# prediction nodes count
nb_classes = train_df['category_full_path_mod1'].unique()
print('Classes count:', len(nb_classes))

train_df = None
samples_df = None

Classes count: 1021


In [16]:
model = None

### Model 1 (CNN - LSTM)

In [None]:
# TRAIN MODEL 1
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)
#model.add(BatchNormalization()) #axis=1


## CNN 1
## ======================================================================================
nb_feature_maps = 8
n_gram = 1
#model.add(Reshape(1, max_description_length, embedding_vecor_length))
#model.add(Convolution2D(nb_feature_maps, 1, n_gram, embedding_vecor_length))
#model.add(MaxPooling2D(poolsize=(max_description_length - n_gram + 1, 1)))
#model.add(Flatten())

# Conv 1
nb_filter_1 = 36
conv1D_1 = Conv1D(nb_filter=nb_filter_1   # feature maps
                  ,filter_length=4        # kernel size
                  #,kernel_size=2
                  ,subsample_length=1     # strides
                  #,strides=1
                  ,border_mode='valid'
                  #,padding='valid'  # same, valid
                  ,activation='sigmoid'
                 )
model.add(conv1D_1)
model.add(MaxPooling1D(pool_length = 2, stride = 1)) #model.output_shape[1]


# Conv 2
nb_filter_2 = 36
conv1D_2 = Conv1D(nb_filter=nb_filter_2
                  ,filter_length=5,subsample_length=2,border_mode='valid',activation='sigmoid')
model.add(conv1D_2)
model.add(MaxPooling1D(pool_length = 2, stride = 1))

# Conv 3
nb_filter_3 = 36
conv1D_3 = Conv1D(nb_filter=nb_filter_3   # feature maps
                  ,filter_length=5,subsample_length=2,border_mode='valid',activation='sigmoid')
model.add(conv1D_3)
model.add(MaxPooling1D(pool_length = 2, stride = 1))

# Conv 4
nb_filter_4 = 20
conv1D_4 = Conv1D(nb_filter=nb_filter_4   # feature maps
                  ,filter_length=5,subsample_length=2,border_mode='valid',activation='sigmoid')
model.add(conv1D_4)
model.add(MaxPooling1D(pool_length = 2, stride = 1))


LSTM_1 = LSTM(36, return_sequences=False, activation='sigmoid')
model.add(LSTM_1)

#model.add(Flatten())

## Dense 1
## ======================================================================================
Dense_1 = Dense(200,activation='sigmoid')
#model.add(Dense_1)


## OUTPUT: classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    #else:
    #    break
    print()
 

print("Training took %g s" % (time.time() - start))

In [108]:
best_model_aux['Best score']

0

### Model 2

In [34]:
# TRAIN MODEL 2
# TRAIN MODEL 1
start = time.time()

# define/initialize model
top_words = len(word_index) + 1
batch_size_ = 64   # 64

model = Sequential()
# --------------------------------------------------------------------------------------
# ---- Embedding layer -----------------------------------------------------------------
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=False)
model.add(embedding_layer)
#model.add(BatchNormalization()) #axis=1


## CNN 1
## ======================================================================================
nb_feature_maps = 8
n_gram = 1
#model.add(Reshape(1, max_description_length, embedding_vecor_length))
#model.add(Convolution2D(nb_feature_maps, 1, n_gram, embedding_vecor_length))
#model.add(MaxPooling2D(poolsize=(max_description_length - n_gram + 1, 1)))
#model.add(Flatten())

# Conv 1
nb_filter_1 = 36
conv1D_1 = Conv1D(nb_filter=nb_filter_1   # feature maps
                  ,filter_length=90        # kernel size
                  #,kernel_size=2
                  ,subsample_length=1     # strides
                  #,strides=1
                  ,border_mode='valid'
                  #,padding='valid'  # same, valid
                  ,activation='relu'
                 )
#model.add(conv1D_1)
#model.add(MaxPooling1D(pool_length = 2, stride = 1)) #model.output_shape[1]

## LSTM 1
## ======================================================================================
LSTM_1 = LSTM(36, return_sequences=False, activation='relu')
model.add(LSTM_1)
#model.add(LSTM(36, return_sequences=False, activation='linear'))
#LSTM_1 = LSTM(128,return_sequences=False, dropout_W = 0.3, dropout_U = 0.3)


#model.add(Flatten())

## Dense 1
## ======================================================================================
Dense_1 = Dense(200,activation='sigmoid')
#model.add(Dense_1)


## OUTPUT: classes layer
## ======================================================================================
model.add(Dense(len(nb_classes), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])  # try loss=categorical_crossentropy
print(model.summary())

# start training multiple times with epoch=1
for ep in range(20):
    print('Epoch iter #' + str(ep+1))
    model.fit(train_texts_vec_mtx, y_ary_cat, nb_epoch=1, batch_size=batch_size_)
    
    scores = model.evaluate(train_texts_vec_mtx, y_ary_cat, verbose=0)
    if(best_model_aux['Best score'] < scores[1]):
        best_model_aux['Best score'] = scores[1]
        best_model = model
        print('Captured improved model')
        print('Valid accuracy: ',best_model_aux['Best score'])
        #print("Accuracy on test set: %.2f%%" % (scores[1]*100))
    #else:
    #    break
    print()
 

print("Training took %g s" % (time.time() - start))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_17 (Embedding)         (None, 100, 36)       1332        embedding_input_17[0][0]         
____________________________________________________________________________________________________
lstm_18 (LSTM)                   (None, 36)            10512       embedding_17[0][0]               
____________________________________________________________________________________________________
dense_32 (Dense)                 (None, 1021)          37777       lstm_18[0][0]                    
Total params: 49,621
Trainable params: 48,289
Non-trainable params: 1,332
____________________________________________________________________________________________________
None
Epoch iter #1
Epoch 1/1
 49600/587983 [=>............................] - ETA: 384s - loss: nan - acc: 0.0000e+00

KeyboardInterrupt: 

In [None]:
best_model_aux['Best score']

In [None]:
# Plot Nets design
#from keras.utils import plot_model
import matplotlib.pyplot as plt
from keras.utils.visualize_util import model_to_dot

plot(model, to_file='/Users/altay.amanbay/Desktop/model.png')
SVG(model_to_dot(model).create(prog='dot', format='svg'))


### Save model

In [None]:
# Save model and aux file

best_model.save('nets_model__word2vec_unigrams_interrelations.h5')

best_model_aux_name = 'nets_model__word2vec_unigrams_interrelations_aux.pkl'
with open(best_model_aux_name, 'wb') as pickle_file:
    pickle.dump(best_model_aux, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Evaluation (optional)

In [None]:
# Final evaluation of the model
start = time.time()

scores = model.evaluate(train_texts_vec_mtx, y_train_ary, verbose=0)
print("Accuracy on train set: %.2f%%" % (scores[1]*100))
scores = model.evaluate(test_texts_vec_mtx, y_test_ary, verbose=0)
print("Accuracy on test set: %.2f%%" % (scores[1]*100))

print("\nEvaluation took %g s" % (time.time() - start))

In [None]:
# Get predictions
start = time.time()

predictions = model.predict_classes(test_texts_vec_mtx)
#predictions_rnd = np.round_(predictions, decimals=0, out=None)
predictions_probs = model.predict(test_texts_vec_mtx)

print('%-20s' % "predictions[0]",':', predictions[0])
#print('%-20s' % "predictions_rnd[0]:",':',predictions_rnd[0])
#print('%-20s' % "predictions_probs[0]",':', predictions_probs[0])
print("\nPrediction took %g s" % (time.time() - start))

In [None]:
# Manual check which nodes are not distinguishable for nets
class_ = 392
print(le.inverse_transform(class_))
print(X_test_ls[np_utils.categorical_probas_to_classes(y_test_ary)==class_],'\n')

class_ = 390
print(le.inverse_transform(class_))
print(X_test_ls[np_utils.categorical_probas_to_classes(y_test_ary)==class_],'\n')

class_ = 389
print(le.inverse_transform(class_))
print(X_test_ls[np_utils.categorical_probas_to_classes(y_test_ary)==class_],'\n')


In [None]:
#pd.crosstab(pd.Series(y_test_ary.ravel()), pd.Series(predictions_rnd.ravel()), rownames=['True'], colnames=['Predicted'], margins=True)
pd.crosstab(pd.Series(np_utils.categorical_probas_to_classes(y_test_ary)), pd.Series(predictions), rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
pd.crosstab(pd.Series(np_utils.categorical_probas_to_classes(y_test_ary)), pd.Series(predictions.ravel()), rownames=['True'], colnames=['Predicted'], margins=True)

### Testing

In [None]:
#model.get_config()
def prediction_to_str(clf_prediction, category_id):
    if(clf_prediction > 0.5):
        return str(category_id)
    else:
        return 'not ' + str(category_id)

def predict(description_str, tok_, clf_, max_length_, category_id_):
    #seq_ = tok_.texts_to_sequences([description_str])
    seq_ = texts_to_sequences_custom([description_str.lower()], tok_.word_index)
    seq_pad = sequence.pad_sequences(seq_, maxlen = max_length_)
    #clf_prediction = clf_.predict(seq_pad)
    clf_prediction = old_best_model_.predict_classes(seq_pad, verbose=0)
    
    #print(seq_)
    #print(seq_pad)
    
    # Prediction to string
    #clf_prediction_str = prediction_to_str(clf_prediction[0][0], category_id_)
    clf_prediction_str = clf_prediction
    
    return clf_prediction_str
    #return clf_prediction[0][0]

def predict_2(description_str, tok_, clf_, max_length_, category_id_):
    #seq_ = tok_.texts_to_sequences([description_str])
    seq_ = texts_to_sequences_custom([description_str.lower()], tok_.word_index)
    seq_pad = sequence.pad_sequences(seq_, maxlen = max_length_)
    #clf_prediction = clf_.predict(seq_pad)
    clf_prediction = model.predict_classes(seq_pad)
    
    #print(seq_)
    #print(seq_pad)
    
    # Prediction to string
    #clf_prediction_str = prediction_to_str(clf_prediction[0][0], category_id_)
    clf_prediction = le.inverse_transform(clf_prediction)
    
    if(clf_prediction == ['Positive']):
        return str(category_id_)
    else:
        return 'not ' + str(category_id_)
    
    
def predict_proba(description_str, tok_, clf_, max_length_):
    #seq_ = tok_.texts_to_sequences([description_str])
    seq_ = texts_to_sequences_custom([description_str], tok_.word_index)
    seq_pad = sequence.pad_sequences(seq_, maxlen = max_length_)
    clf_prediction_proba = clf_.predict_proba(seq_pad, verbose=0)
    
    return clf_prediction_proba[0][0]


# id_ = 'table Setr'
# p = predict(id_, best_model_aux['Tokenizer'], best_model, best_model_aux['Max length'], best_model_aux['Category ID'])
# pp = predict_proba(id_, best_model_aux['Tokenizer'], best_model, best_model_aux['Max length'])
# print(p)
# print(pp)

In [None]:
model_file = "whole set train model/nets_model__word2vec_class_specific_unigrams.h5"
aux_file = "whole set train model/nets_model__word2vec_class_specific_unigrams_aux.pkl"
old_best_model_ = load_model(model_file)
old_best_model_aux_ = get_model_file_aux(aux_file)

old_word_index_ = old_best_model_aux_['word_index']
le = old_best_model_aux_['Label encoder']

In [None]:
predictions = old_best_model_.predict_classes(train_texts_vec_mtx)

In [None]:
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    

# Compute confusion matrix
cnf_matrix = confusion_matrix(np_utils.categorical_probas_to_classes(y_ary_cat), predictions)
np.set_printoptions(precision=2)

#class_names = [ix_to_class[i] for i in range(101)]
class_names = predictions

plt.figure()
fig = plt.gcf()
fig.set_size_inches(32, 32)
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization',
                      cmap=plt.cm.cool)
plt.show()

pd.crosstab(pd.Series(np_utils.categorical_probas_to_classes(y_ary_cat)), pd.Series(predictions), rownames=['True'], colnames=['Predicted'], margins=True)


In [None]:
X_ls = np.array(list(train_df['description_mod1']))
seq_ = texts_to_sequences_custom(X_ls, old_word_index_)
seq_pad = sequence.pad_sequences(seq_, maxlen = 30)

In [None]:
old_best_model_ = best_model
predictions = old_best_model_.predict_classes(seq_pad)

In [None]:
train_df['Predictions_le'] = list(predictions)
train_df['Predictions'] = train_df['Predictions_le'].apply(lambda x: le.inverse_transform(x))
train_df.head()

In [None]:
train_df.to_csv('Predictions__unigrams_interrelations.csv',index=False)

In [None]:
# TEST

## load old model
model_file = "category_927_nets__traindata5000_vectrain5000_model.h5"
aux_file = "category_927_nets__traindata5000_vectrain5000_aux.pkl"
old_best_model_ = load_model(model_file)
old_best_model_aux_ = get_model_file_aux(aux_file)
old_tok_ = old_best_model_aux_['Tokenizer']
old_word_index_ = old_best_model_aux_['Tokenizer'].word_index

## use fresh model
best_model_ = best_model
best_model_aux_ = best_model_aux
tok_ = tok
word_index_ = word_index

item_d = 'NieR: Automata™ DEMO 120161128 (Playable Demo)'

# screwdrivers check
scrw_items = [
"tekton 2655 flare nut wrench set metric 6piece"
,"tekton 2780 10slot screwdriver holder and organizer"
,"titan 17237 insulated electrical screwdriver set  7 piece"
,"tool sorter screwdriver organizer red"
,"torin sdh15rt magnetic screwdriver holder"  #wrong predict
,"wera 05020013001 joker combination wrenchset 11 pieces"
,"wera kk vde 60i62i68i18 insulated pouch set with interchangeable blades 18piece" # tricky
,"wiha 28103 magnetic 14 bit holder stubby 57mm pliers screwdriver" # tricky, wrong predict
]

for n, i in enumerate(scrw_items):
    item_d = i
    
    print(str(n) + ' ' + '='*100)
    
    print('Old model prediction:')
    print('item:',item_d)
    print('Seq max len:', old_best_model_aux_['Max length'])
    print(predict(item_d, old_tok_, old_best_model_, old_best_model_aux_['Max length'], '927'))
    print(predict_proba(item_d, old_tok_, old_best_model_, old_best_model_aux_['Max length']))


    print('\nFresh model prediction:')
    print('item:',item_d)
    print('Seq max len:', best_model_aux_['Max length'])
    print(predict_2(item_d, tok_, best_model_, best_model_aux_['Max length'], '927'))
    #print(predict_proba(item_d, tok_, best_model_, best_model_aux_['Max length']))

    print()

    #tt = train_df.loc[0:10,['description_mod1']]
    #tt['pred'] = tt['description_mod1'].apply(lambda x: predict(x, best_model_aux_['Tokenizer'], best_model, best_model_aux_['Max length'], best_model_aux_['Category ID']))
    #tt['prob'] = tt['description_mod1'].apply(lambda x: predict_proba(x, best_model_aux_['Tokenizer'], best_model, best_model_aux_['Max length']))