In [2]:
from __future__ import print_function
import numpy as np
import pandas as pd

from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import LSTM
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer, one_hot, text_to_word_sequence
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


In [22]:
# Functions

def get_word2index(texts_ls_):
    word2index_ = {}

    c = 1
    for text_str in texts_ls_:
        text_tokens_ls = text_str.lower().split()
        for token in text_tokens_ls:
            if(token not in word2index_):
                word2index_[token] = c
                c = c + 1
                
    return word2index_

def train_df_preprocess(top_words_, texts_ls_, max_pad_length_):
    # texts_ls_: list of texts strings
    
    tok = Tokenizer(top_words_)
    tok.fit_on_texts(texts_ls_)

    words = []
    for iter in range(top_words):
        words += [key for key,value in tok.word_index.items() if value==iter+1]

    #Class for vectorizing texts, or/and turning texts into sequences 
    #(=list of word indexes, where the word of rank i in the dataset (starting at 1) has index i).
    texts_vec_ls = tok.texts_to_sequences(texts_ls_)#turns text to sequence, stating which word comes in what place
    texts_vec_mtx = sequence.pad_sequences(texts_vec_ls, maxlen=max_pad_length_)#pad sequence, essentially padding it with 0's at the end
    
    return texts_vec_mtx

def text_2_vec(text_str, word2index_):
    # text_str: text string
    
    text_tokens_ls = text_str.lower().split()
    
    text_vec = []
    for token in text_tokens_ls:
        if token in word2index_:
            text_vec.append(word2index_[token])
        else:
            text_vec.append(0)
            
    return text_vec

def train_df_preprocess_2(texts_ls_, word2index_, max_pad_length_):
    # texts_ls_: list of texts strings
    
    texts_vec_ls = []
    for text_ in texts_ls_:
        #print(text_)
        #print(type(text_))
        text_vec = text_2_vec(text_, word2index_)
        texts_vec_ls.append(text_vec)
    
    texts_vec_ary = sequence.pad_sequences(texts_vec_ls, maxlen=max_pad_length_)
    
    return texts_vec_ary

In [53]:
# Read train set (screwdrivers)
pkl_file = '/Users/altay.amanbay/Desktop/new node booster/experiments/2.1/train_data.pkl'
train_df = pd.read_pickle(pkl_file)

# Create target feature
train_df['target'] = train_df['type'].apply(lambda x: 'False' if x == 'False Positive' else 'Positive')

# Drop index column
train_df.drop(labels=['type'], axis=1, inplace=True)

# Encode target feature
#le = LabelEncoder()
#le.fit(train_df['target'])
#train_df['target_le'] = le.transform(train_df['target'])


# cat = 'Tools & Home Improvement > Power & Hand Tools > Hand Tools > Screwdrivers'
# positives, negatives = get_positives_negatives(train_df, cat)
# X_train = input_text = pd.concat([positives, negatives])
# y_train = [1] * len(positives) + [0] * len(negatives)

print('train data shape:',train_df.shape)
train_df.head(2)

train data shape: (6822, 4)


Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,target
0,RION TECH 5 point 6 point 3 Piece Tool Kit Pen...,927,Tools & Home Improvement > Power & Hand Tools ...,Positive
1,Stalwart 25-piece 4.8V Cordless Screwdriver Set,927,Tools & Home Improvement > Power & Hand Tools ...,False


In [40]:
# Read sampled descriptionary

path = '/Users/altay.amanbay/Desktop/new node booster/experiments/Sampling nodes from descriptionary/3 - Picking samples from each node/sampled descriptionary/'
file_name = 'sampled_descriptionary_sample_size_30.csv'
samples_df = pd.read_csv(path + file_name)

# Rename columns
samples_df.rename(columns={'description': 'description_mod1', 
                           'category_id': 'category_id_mod1',
                           'category_path': 'category_full_path_mod1'}, inplace=True)

# Drop 'screwdrivers' from descriptionary
samples_df = samples_df.loc[samples_df.category_id_mod1 != 927,:]

# Drop index column
samples_df.drop(labels=['index'], axis=1, inplace=True)

# Add target column and make all false as all items are not screwdrivers
samples_df['target'] = 'False'

print('samples data shape:',samples_df.shape)
samples_df.head()

samples data shape: (9722, 4)


Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,target
0,!iT Jeans Maternity Skinny Jeans Dark Wash M,100,Apparel & Accessories > Apparel > Maternity,False
1,Citizens of Humanity Avedon Skinny Maternity A...,100,Apparel & Accessories > Apparel > Maternity,False
2,DL1961 Maternity Angel Jeans - Riker-30,100,Apparel & Accessories > Apparel > Maternity,False
3,James Jeans - Twiggy Maternity Legging in Dark...,100,Apparel & Accessories > Apparel > Maternity,False
4,James Jeans Twiggy Maternity Under Belly Pull ...,100,Apparel & Accessories > Apparel > Maternity,False


In [54]:
# Concat original train set and sampled descriptionary
train_df = pd.concat([train_df,samples_df], axis=0)
train_df.reset_index(drop=True, inplace=True)

# Encode target feature
le = LabelEncoder()
le.fit(train_df['target'])
train_df['target_le'] = le.transform(train_df['target'])


print('train data shape:',train_df.shape)
train_df.head(2)

train data shape: (16544, 5)


Unnamed: 0,description_mod1,category_id_mod1,category_full_path_mod1,target,target_le
0,RION TECH 5 point 6 point 3 Piece Tool Kit Pen...,927,Tools & Home Improvement > Power & Hand Tools ...,Positive,1
1,Stalwart 25-piece 4.8V Cordless Screwdriver Set,927,Tools & Home Improvement > Power & Hand Tools ...,False,0


In [55]:
# Split into train and test
#X = train_df.loc[:,['description_mod1']]
X_ls = list(train_df['description_mod1'])
y_ary = np.array(list(train_df['target_le']))
print(type(X_ls))
print(type(y_ary))

X_train_ls, X_test_ls, y_train_ary, y_test_ary = train_test_split(X_ls, y_ary, test_size = 0.3)

# print(X_train_df.shape)
# print(X_test_df.shape)
print(len(X_train_ls))
print(len(X_test_ls))
print(y_train_ary.shape)
print(y_test_ary.shape)

<class 'list'>
<class 'numpy.ndarray'>
11580
4964
(11580,)
(4964,)


In [57]:
# Convert train set into sequences for nets

top_words = 20000
max_description_length = 30

tok = Tokenizer(nb_words = top_words)
tok.fit_on_texts(X_train_ls)
word_index = tok.word_index

train_texts_vec_ls = tok.texts_to_sequences(X_train_ls)
train_texts_vec_mtx = sequence.pad_sequences(train_texts_vec_ls, maxlen = max_description_length)

print('train_texts_vec_mtx shape:',train_texts_vec_mtx.shape)
list(tok.word_index)[0:5]

train_texts_vec_mtx shape: (11580, 30)


['drivers', 'tr', 'sconce', 'frappe', 'dinner']

In [8]:
# Convert test set into sequences for nets

test_texts_vec_ls = tok.texts_to_sequences(X_test_ls)
test_texts_vec_mtx = sequence.pad_sequences(test_texts_vec_ls, maxlen = max_description_length)

In [10]:
# Create embedding vectors for each word in word index
# fix random seed for reproducibility
np.random.seed(7)

embedding_vecor_length = 32
uniq_token_count = len(tok.word_index)
print('word index size:', uniq_token_count)

embedding_matrix = np.zeros((len(word_index) + 1, embedding_vecor_length))
for word, i in word_index.items():
    embedding_vector = np.random.uniform(.1, size=(1, 32))
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('embedding matrix shape:',embedding_matrix.shape)
print(embedding_matrix[0])
print(embedding_matrix[1])

word index size: 5500
embedding matrix shape: (5501, 32)
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.43000128  0.86506031  0.83779409  0.24633669  0.57024406  0.82124156
  0.72401312  0.65143898  0.46307937  0.53842921  0.5821674   0.56437874
  0.51761817  0.21353142  0.55181363  0.58230838  0.13797185  0.81335379
  0.74359554  0.9659438   0.4794416   0.85213847  0.16980623  0.45701813
  0.67093386  0.94342833  0.57794982  0.2044182   0.62356217  0.61968723
  0.38667406  0.24876632]


In [24]:
# train the model
#embedding_vecor_length = 32
top_words = len(word_index) + 1
batch_size_ = 25 #64

model = Sequential()
#model.add(Embedding(top_words, embedding_vecor_length, input_length = max_description_length))
embedding_layer = Embedding(top_words, 
                            embedding_vecor_length, 
                            weights=[embedding_matrix], 
                            input_length = max_description_length,
                            trainable=True)
model.add(embedding_layer)
model.add(LSTM(10))
model.add(Dense(1, activation='sigmoid'))
#model.layers[0].trainable = False
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
#model.fit(X_train_ary, y_train_ary, validation_data=(X_train_ary, y_train_ary), nb_epoch=5, batch_size=64)
model.fit(train_texts_vec_mtx, y_train_ary, validation_data=(test_texts_vec_mtx, y_test_ary), nb_epoch=10, batch_size=batch_size_)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_5 (Embedding)          (None, 30, 32)        176032      embedding_input_5[0][0]          
____________________________________________________________________________________________________
lstm_5 (LSTM)                    (None, 10)            1720        embedding_5[0][0]                
____________________________________________________________________________________________________
dense_5 (Dense)                  (None, 1)             11          lstm_5[0][0]                     
Total params: 177,763
Trainable params: 177,763
Non-trainable params: 0
____________________________________________________________________________________________________
None
Train on 4775 samples, validate on 2047 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
E

<keras.callbacks.History at 0x12a21a710>

In [16]:
model.get_config()

[{'class_name': 'Embedding',
  'config': {'W_constraint': None,
   'W_regularizer': None,
   'activity_regularizer': None,
   'batch_input_shape': (None, 30),
   'dropout': 0.0,
   'init': 'uniform',
   'input_dim': 5501,
   'input_dtype': 'int32',
   'input_length': 30,
   'mask_zero': False,
   'name': 'embedding_2',
   'output_dim': 32,
   'trainable': False}},
 {'class_name': 'LSTM',
  'config': {'U_regularizer': None,
   'W_regularizer': None,
   'activation': 'tanh',
   'b_regularizer': None,
   'consume_less': 'cpu',
   'dropout_U': 0.0,
   'dropout_W': 0.0,
   'forget_bias_init': 'one',
   'go_backwards': False,
   'init': 'glorot_uniform',
   'inner_activation': 'hard_sigmoid',
   'inner_init': 'orthogonal',
   'input_dim': 32,
   'input_length': None,
   'name': 'lstm_2',
   'output_dim': 100,
   'return_sequences': False,
   'stateful': False,
   'trainable': True,
   'unroll': False}},
 {'class_name': 'Dense',
  'config': {'W_constraint': None,
   'W_regularizer': None,
   

In [27]:
# Final evaluation of the model
scores = model.evaluate(train_texts_vec_mtx, y_train_ary, verbose=0)
print("Accuracy on train set: %.2f%%" % (scores[1]*100))

scores = model.evaluate(test_texts_vec_mtx, y_test_ary, verbose=0)
print("Accuracy on test set: %.2f%%" % (scores[1]*100))

Accuracy on train set: 99.56%
Accuracy on test set: 92.43%


In [25]:
predictions = model.predict(test_texts_vec_mtx)
predictions_rnd = np.round_(predictions, decimals=0, out=None)
predictions_rnd[0]

array([ 1.], dtype=float32)

In [48]:
# misc check
# for i in range(predictions.shape[0]):
#     if(np.round_(predictions[i], decimals=0, out=None) == 1):
#         print(predictions[i])
#         np.round_(predictions[i], decimals=0, out=None)



array([ 0.], dtype=float32)

In [28]:
pd.crosstab(pd.Series(y_train_ary.ravel()), pd.Series(predictions_rnd.ravel()), rownames=['True'], colnames=['Predicted'], margins=True)
#pd.crosstab(pd.Series(y_test_ary.ravel()), pd.Series(predictions_rnd.ravel()), rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,1.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,88,368,456
1,297,1294,1591
All,385,1662,2047


In [56]:
pd.crosstab(pd.Series(y_test_ary.ravel()), pd.Series(predictions_rnd.ravel()), rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0.0,1.0,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,398,80,478
1,63,1506,1569
All,461,1586,2047
