In [3]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("/Users/rachel/Downloads/blazerscoatsjackets_NN.csv")
df.head()

Unnamed: 0.2,Unnamed: 0,key_0,Unnamed: 0.1,product_id,final_list,product_color_id,attribute_name,attribute_value,accessory,blazerscoatsjackets,bottom,onepiece,shoe,sweater,sweatshirthoodie,top
0,2419,44437,51473,01E607BHRQAJDZ76MJFN7RPRK1,"['rost', 'belted', 'short', 'cinched', 'natura...",01E607BHSBCWH034NX5TMJH3Z7,category,bottom,0,0,1,0,0,0,0,0
1,6498,104365,123804,01E6079QFKH4HPZFQ31T6WDRRX,"['elsa', 'slim', 'jean', 'slim', 'knee', 'hem'...",01E6079QG5GTP5JYRCWGRWY4ZC,category,bottom,0,0,1,0,0,0,0,0
2,5707,93240,110390,01E6079DG58YW9K78D57C6J2Y1,"['ravine', 'cotton', 'midi', 'dress', 'puff', ...",01E6079DGRR4CHKCPKNY96G15P,category,onepiece,0,0,0,1,0,0,0,0
3,2808,50512,58649,01E6078G3GRATF2C96VKYYWSGD,"['high', 'waist', 'moto', 'legging', 'pintucke...",01E6078G42KZPG57NCP4YG1TBB,category,bottom,0,0,1,0,0,0,0,0
4,6284,101553,120143,01E6076GTCE5P3VH76VWJH4MY9,"['krissy', 'espadrille', 'flat', 'woven', 'esp...",01E6076GTY5Z31KAGGW9HSYRDN,category,shoe,0,0,0,0,1,0,0,0


In [5]:
from random import randint
from numpy import array, argmax, asarray, zeros
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding

Using TensorFlow backend.


## Get Your Feature Space and Target Labels

In [6]:
labels = df["blazerscoatsjackets"]
docs = df["final_list"]

## Perform Label Categorical Encoding 

In [7]:
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
labels = to_categorical(encoder.fit_transform(labels))

## Preprocessing

In [8]:
import re
def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("/'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text
docs = docs.apply(str)
docs = docs.apply(lambda x: clean_text(x))

In [9]:
import spacy
nlp = spacy.load('en_core_web_sm')

## Tokenize the Text

In [10]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=10000, oov_token="UNKNOWN_TOKEN")
tokenizer.fit_on_texts(docs)

## Integer Encode Tokens

In [11]:
def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

## Get Max Length Per Token

In [12]:
from typing import List
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))

# get the max length in terms of token length
max_length = get_max_token_length_per_doc(docs)

In [13]:
from keras.preprocessing.sequence import pad_sequences
MAX_SEQUENCE_LENGTH = 300
# integer encode the documents
encoded_docs = integer_encode_documents(docs, tokenizer)
# this is a list of lists, the numbers represent the index position of that word.
# for instance, 33 means the 33rd word in the vocabulary
padded_docs = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [14]:
docs.shape

(3969,)

In [15]:
labels.shape

(3969, 2)

## Split into Train/Test Split

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size=0.1)

## Keras RNN Architecture

In [17]:
VOCAB_SIZE = int(len(tokenizer.word_index) * 1.1)

## Load in Glove Vectors

In [18]:
def load_glove_vectors():
    embeddings_index = {}
    with open('glove.6B.100d.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index


embeddings_index = load_glove_vectors()

Loaded 400000 word vectors.


## Load in the Embeddings

In [19]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((VOCAB_SIZE, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix[i] = embedding_vector

## Define Model

In [20]:
from keras.layers.recurrent import SimpleRNN
from keras.layers import Flatten, Masking
# define model
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 100, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False))
model.add(Masking(mask_value=0.0)) # masking layer, masks any words that don't have an embedding as 0s.
model.add(SimpleRNN(units=64, input_shape=(1, MAX_SEQUENCE_LENGTH)))
model.add(Dense(32))
model.add(Dense(2, activation='softmax'))

## Compile the Model

In [21]:
from keras.utils.vis_utils import plot_model

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 100)          728800    
_________________________________________________________________
masking_1 (Masking)          (None, 300, 100)          0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 64)                10560     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 741,506
Trainable params: 12,706
Non-trainable params: 728,800
_________________________________________________________________
None


OSError: `pydot` failed to call GraphViz.Please install GraphViz (https://www.graphviz.org/) and ensure that its executables are in the $PATH.

## Fit the model

In [22]:
df["blazerscoatsjackets"].value_counts(normalize=True)

0    0.928949
1    0.071051
Name: blazerscoatsjackets, dtype: float64

In [23]:
# fit the model
model.fit(X_train, y_train, epochs=15, verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x12324bb90>

## Evaluate the Model

In [24]:
# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 94.962215


In [25]:
test_docs = [
    "With a flattering twisted front, this fitted sheath dress is made in a plush velvet fabric, engineered with super-stretch for supreme comfort and cut pile-up for a high-shine look. Crew neck. 3/4 sleeves. Invisible zip at back. Twisted detail at the waist. Unlined."
]

test_docs = list(
    map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), test_docs))

encoded_test_sample = integer_encode_documents(test_docs, tokenizer)
padded_test_docs = pad_sequences(encoded_test_sample, maxlen=MAX_SEQUENCE_LENGTH, padding='post')




In [26]:
model.predict_classes(padded_test_docs)

array([0])

In [27]:
prediction = model.predict_classes(padded_test_docs)
encoder.inverse_transform(prediction)

array([0])

## Try get all(test version)

In [28]:
full = pd.read_csv("/Users/rachel/Desktop/full_data.csv")

In [29]:
full.head()

Unnamed: 0,product_id,brand,mpn,product_full_name,description,brand_category,created_at,updated_at,deleted_at,brand_canonical_url,details,labels,bc_product_id
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,514683.0,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,2019-11-11 22:37:15.719107+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,"A modern pump, in a rounded silhouette with an...","{""Needs Review""}",
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,526676.0,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,2019-11-11 22:36:50.682513+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,Dress it down with jeans and sneakers or dress...,"{""Needs Review""}",
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,400100000000.0,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,2019-11-13 17:33:59.581661+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/loewe-52mm-pad...,100% UV protection Case and cleaning cloth inc...,"{""Needs Review""}",
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,400012000000.0,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",2019-11-13 17:05:05.203733+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/converse-babys...,Canvas upper Round toe Lace-up vamp SmartFOAM ...,"{""Needs Review""}",
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,400011000000.0,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,2019-11-13 18:42:30.941321+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/alexander-mcqu...,100% UV protection Gradient lenses Adjustable ...,"{""Needs Review""}",


In [30]:
tt = full.iloc[0:20,]
tt.head()

Unnamed: 0,product_id,brand,mpn,product_full_name,description,brand_category,created_at,updated_at,deleted_at,brand_canonical_url,details,labels,bc_product_id
0,01DSE9TC2DQXDG6GWKW9NMJ416,Banana Republic,514683.0,Ankle-Strap Pump,"A modern pump, in a rounded silhouette with an...",Unknown,2019-11-11 22:37:15.719107+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,"A modern pump, in a rounded silhouette with an...","{""Needs Review""}",
1,01DSE9SKM19XNA6SJP36JZC065,Banana Republic,526676.0,Petite Tie-Neck Top,Dress it down with jeans and sneakers or dress...,Unknown,2019-11-11 22:36:50.682513+00,2019-12-19 20:40:30.786144+00,,https://bananarepublic.gap.com/browse/product....,Dress it down with jeans and sneakers or dress...,"{""Needs Review""}",
2,01DSJX8GD4DSAP76SPR85HRCMN,Loewe,400100000000.0,52MM Padded Leather Round Sunglasses,Padded leather covers classic round sunglasses.,JewelryAccessories/SunglassesReaders/RoundOval...,2019-11-13 17:33:59.581661+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/loewe-52mm-pad...,100% UV protection Case and cleaning cloth inc...,"{""Needs Review""}",
3,01DSJVKJNS6F4KQ1QM6YYK9AW2,Converse,400012000000.0,Baby's & Little Kid's All-Star Two-Tone Mid-To...,The iconic mid-top design gets an added dose o...,"JustKids/Shoes/Baby024Months/BabyGirl,JustKids...",2019-11-13 17:05:05.203733+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/converse-babys...,Canvas upper Round toe Lace-up vamp SmartFOAM ...,"{""Needs Review""}",
4,01DSK15ZD4D5A0QXA8NSD25YXE,Alexander McQueen,400011000000.0,64MM Rimless Sunglasses,Hexagonal shades offer a rimless view with int...,JewelryAccessories/SunglassesReaders/RoundOval,2019-11-13 18:42:30.941321+00,2019-12-19 20:40:30.786144+00,,https://www.saksfifthavenue.com/alexander-mcqu...,100% UV protection Gradient lenses Adjustable ...,"{""Needs Review""}",


In [31]:
new_list = []
import time
test_docs = []
for i in tt["description"]:
    test_docs.append(i)
test_docs = list(map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), test_docs))

encoded_test_sample = integer_encode_documents(test_docs, tokenizer)
padded_test_docs = pad_sequences(encoded_test_sample, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

print(model.predict_classes(padded_test_docs))
'''
for count, i in enumerate(tt["description"]):
    test_docs = i
    print(count)
    start = time.time()
    test_docs = list(" ".join([token.text for token in nlp(test_docs) if not token.is_stop]))
    #test_docs = list(
    #    map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), test_docs))
    #print([char for char in test_docs])
    #end = time.time()
    #print(end - start)
    #print(test_docs)
    encoded_test_sample = integer_encode_documents(test_docs, tokenizer)
    padded_test_docs = pad_sequences(encoded_test_sample, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    #print(padded_test_docs)
    #print(count, "before predict")
    print(model.predict_classes(padded_test_docs))
    #print(count, "after predict")
    prediction = model.predict_classes(padded_test_docs)
    m = encoder.inverse_transform(prediction)
    new_list.append(m)
'''
    

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]


'\nfor count, i in enumerate(tt["description"]):\n    test_docs = i\n    print(count)\n    start = time.time()\n    test_docs = list(" ".join([token.text for token in nlp(test_docs) if not token.is_stop]))\n    #test_docs = list(\n    #    map(lambda doc: " ".join([token.text for token in nlp(doc) if not token.is_stop]), test_docs))\n    #print([char for char in test_docs])\n    #end = time.time()\n    #print(end - start)\n    #print(test_docs)\n    encoded_test_sample = integer_encode_documents(test_docs, tokenizer)\n    padded_test_docs = pad_sequences(encoded_test_sample, maxlen=MAX_SEQUENCE_LENGTH, padding=\'post\')\n    #print(padded_test_docs)\n    #print(count, "before predict")\n    print(model.predict_classes(padded_test_docs))\n    #print(count, "after predict")\n    prediction = model.predict_classes(padded_test_docs)\n    m = encoder.inverse_transform(prediction)\n    new_list.append(m)\n'