In [1]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
from random import randint
from numpy import array, argmax, asarray, zeros
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding

Using TensorFlow backend.


## Import data

In [3]:
import pandas as pd

df = pd.read_csv("vacation_NN.csv")
df.head(3)


Unnamed: 0.2,Unnamed: 0,key_0,Unnamed: 0.1,product_id,final_list,product_color_id,attribute_name,attribute_value,coldweather,daytonight,nightout,vacation,weekend,work,workout
0,5972,44436,51471,01E607BHRQAJDZ76MJFN7RPRK1,"['rost', 'belted', 'short', 'cinched', 'natura...",01E607BHSBCWH034NX5TMJH3Z7,occasion,vacation,0,0,0,1,0,0,0
1,15069,104366,123805,01E6079QFKH4HPZFQ31T6WDRRX,"['elsa', 'slim', 'jean', 'slim', 'knee', 'hem'...",01E6079QG5GTP5JYRCWGRWY4ZC,occasion,weekend,0,0,0,0,1,0,0
2,13435,93250,110404,01E6079DG58YW9K78D57C6J2Y1,"['ravine', 'cotton', 'midi', 'dress', 'puff', ...",01E6079DGRR4CHKCPKNY96G15P,occasion,vacation,0,0,0,1,0,0,0


In [4]:
df.shape

(3914, 15)

In [5]:
test = df.drop_duplicates(subset=['product_id', 'attribute_value'], keep=False)
test.head(3)


Unnamed: 0.2,Unnamed: 0,key_0,Unnamed: 0.1,product_id,final_list,product_color_id,attribute_name,attribute_value,coldweather,daytonight,nightout,vacation,weekend,work,workout
0,5972,44436,51471,01E607BHRQAJDZ76MJFN7RPRK1,"['rost', 'belted', 'short', 'cinched', 'natura...",01E607BHSBCWH034NX5TMJH3Z7,occasion,vacation,0,0,0,1,0,0,0
1,15069,104366,123805,01E6079QFKH4HPZFQ31T6WDRRX,"['elsa', 'slim', 'jean', 'slim', 'knee', 'hem'...",01E6079QG5GTP5JYRCWGRWY4ZC,occasion,weekend,0,0,0,0,1,0,0
2,13435,93250,110404,01E6079DG58YW9K78D57C6J2Y1,"['ravine', 'cotton', 'midi', 'dress', 'puff', ...",01E6079DGRR4CHKCPKNY96G15P,occasion,vacation,0,0,0,1,0,0,0


In [6]:
test.shape

(3914, 15)

## Get Your Feature Space and Target Labels

In [7]:
labels = test["vacation"]
# docs = test["final"]
docs = test["final_list"]

## Perform Label Categorical Encoding 

In [8]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(labels))

## Preprocessing

In [9]:
import re
def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("/'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text
docs = docs.apply(str)
docs = docs.apply(lambda x: clean_text(x))

In [10]:
import spacy
nlp = spacy.load('en_core_web_sm')

## Tokenize the Text

In [11]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(docs)

## Integer Encode Tokens

In [12]:
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

## Get Max Length Per Token

In [13]:
from typing import List
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))

# get the max length in terms of token length
max_length = get_max_token_length_per_doc(docs)

In [14]:
from keras.preprocessing.sequence import pad_sequences
MAX_SEQUENCE_LENGTH = 300
# integer encode the documents
encoded_docs = integer_encode_documents(docs, tokenizer)
# this is a list of lists, the numbers represent the index position of that word.
# for instance, 33 means the 33rd word in the vocabulary
padded_docs = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [15]:
docs.shape

(3914,)

In [16]:
labels.shape

(3914, 2)

## Split into Train/Test Split

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.1)

## Keras RNN Architecture

In [18]:
VOCAB_SIZE = int(len(tokenizer.word_index) * 1.1)

## Load in Glove Vectors

In [19]:
def load_glove_vectors():
    embeddings_index = {}
    with open('glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index


embeddings_index = load_glove_vectors()

Loaded 400000 word vectors.


## Load in the Embeddings

In [20]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((VOCAB_SIZE, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix[i] = embedding_vector

## Define Model

In [21]:
from keras.layers.recurrent import SimpleRNN
from keras.layers import Flatten, Masking
# define model
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
model.add(SimpleRNN(units=64, input_shape=(1, MAX_SEQUENCE_LENGTH)))
model.add(Dense(32))
model.add(Dense(2, activation='softmax'))

## Compile the Model

In [22]:
from keras.utils.vis_utils import plot_model

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          723500    
_________________________________________________________________
masking_1 (Masking)          (None, 300, 100)          0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 64)                10560     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 736,206
Trainable params: 12,706
Non-trainable params: 723,500
_________________________________________________________________
None


OSError: `pydot` failed to call GraphViz.Please install GraphViz (https://www.graphviz.org/) and ensure that its executables are in the $PATH.

## Fit the model

In [23]:
test["attribute_value"].value_counts(normalize=True)

weekend        0.294584
daytonight     0.266735
vacation       0.165304
work           0.138222
nightout       0.091978
workout        0.025294
coldweather    0.017885
Name: attribute_value, dtype: float64

In [24]:
# fit the model
model.fit(X_train, y_train, epochs=15, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x159e8aed0>

## Evaluate the Model

In [25]:
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 82.908165


In [27]:
test_docs = [
    "Rosie Velvet is a sheer mesh lace, layered with intricate allover embroidery, finished with scalloped eyelash and trimmed with luxurious velvet. Elastic waistband trimmed with velvet. Sheer and unlined. Made in Italy. 55% polyester, 45% cotton. Hand wash. Imported. Cheeky fit. Hits at the top of the thighs."
]

test_docs = list(
    map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), test_docs))

encoded_test_sample = integer_encode_documents(test_docs, tokenizer)

padded_test_docs = pad_sequences(encoded_test_sample, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [28]:
model.predict_classes(padded_test_docs)

array([0])

In [29]:
prediction = model.predict_classes(padded_test_docs)
encoder.inverse_transform(prediction)

array([0])