# Creating Standard Neural Network for Prediction

In [201]:
#Following the code here: https://github.com/ychennay/dso-560-nlp-and-text-analytics/blob/master/week7/Deep%20Learning%20with%20Word%20Embeddings.ipynb

In [283]:
from pprint import pprint
from typing import List

from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import Masking
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import pandas as pd

In [295]:
#1. Prepare to Iterate through All Tags
#- The goal is to train a neural network to predict for each tag
#- In order to do so, we must read in the CSVs created by the "One Hot Encoding and Duplicate Removal" python notebook

# Table produced by the notebook that gathers and cleans the input variables.
fulldata = pd.read_csv("full_data_joined_attr_processed.csv")

# The five attribute categories our group works with.
chosen_cats = ['style','embellishment','occasion','category','dry_clean_only']

# Gather the unique tags for each of the 5 categories. The unpacking order
# follows the order of `chosen_cats`.
styles, embels, occasi, catego, drycle = (
    fulldata.loc[fulldata['attribute_name'] == category_name, 'attribute_value'].unique()
    for category_name in chosen_cats
)

In [285]:
# Turn each collection of unique values into a plain list so they can be concatenated.
styles, embels, occasi, catego, drycle = (
    list(unique_values)
    for unique_values in (styles, embels, occasi, catego, drycle)
)

# One flat list of every tag, grouped by category in the order
# style, embellishment, occasion, category, dry_clean_only.
tags = [*styles, *embels, *occasi, *catego, *drycle]

In [286]:
#Reading in the documents associated with each tag
# Load the per-tag CSVs; list position k in `docs`/`labels` corresponds to tags[k].
docs = []    # X values for our neural nets: the 'final' column of product documents
labels = []  # Y values for our neural nets: the column named after the tag
for tag in tags:
    # Parse each file once and take both columns from the same frame
    # (previously every CSV was read from disk twice per tag).
    # For example, "classic_NN.csv" holds the documents and labels for the tag "classic".
    tag_frame = pd.read_csv(f"{tag}_NN.csv")
    docs.append(tag_frame['final'])
    labels.append(array(tag_frame[tag]))

In [287]:
# Each neural net will have a different number of documents to train and test on.
# Here we create a list holding, for every tag, the number of documents that go
# into its training set (80%-20% train-test split, rounded down).
training_sets = [int(len(docs[k])*.8) for k in range(len(tags))]
training_sets

[3132,
 3132,
 3132,
 3132,
 3132,
 3132,
 3132,
 3132,
 3132,
 3132,
 3132,
 56,
 56,
 56,
 56,
 56,
 56,
 56,
 56,
 56,
 3131,
 3131,
 3131,
 3131,
 3131,
 3131,
 3131,
 3175,
 3175,
 3175,
 3175,
 3175,
 3175,
 3175,
 3175,
 2220,
 2220]

In [288]:
# Inspect the container types of the first tag's labels and documents.
tuple(type(sample) for sample in (labels[0], docs[0]))

(numpy.ndarray, pandas.core.series.Series)

In [289]:
# Slice every tag's documents and labels at its precomputed split point.
# Position k in each of the four lists corresponds to tags[k].
Xtrain_sets, Ytrain_sets, Xtest_sets, Ytest_sets = [], [], [], []

for idx in range(len(tags)):
    split_at = training_sets[idx]
    tag_docs = docs[idx]
    tag_labels = labels[idx]

    # Training documents stay a pandas slice; test documents become a plain list.
    Xtrain_sets.append(tag_docs.iloc[:split_at])
    Xtest_sets.append(list(tag_docs.iloc[split_at:]))
    Ytrain_sets.append(tag_labels[:split_at])
    Ytest_sets.append(tag_labels[split_at:])

#### Training the Neural Nets Iteratively for Each Tag

In [290]:
#creating some useful functions
def integer_encode_documents(docs: List[str], tokenizer: "Tokenizer") -> List[List[int]]:
    """Convert each document into a list of integer word indices.

    This function used to be defined twice in this cell; the second definition
    silently replaced the first. The first looked every word up directly in
    ``tokenizer.word_index`` and would therefore raise ``KeyError`` for any
    word the tokenizer was not fitted on. Only the effective (second)
    behaviour is kept: the work is delegated to the tokenizer itself.

    Args:
        docs: The documents to encode, one string per document.
        tokenizer: A fitted tokenizer exposing ``texts_to_sequences``.

    Returns:
        Whatever ``tokenizer.texts_to_sequences(docs)`` returns: one list of
        integer indices per input document.
    """
    return tokenizer.texts_to_sequences(docs)

def get_max_token_length_per_doc(docs: List[str]) -> int:
    """Return the largest whitespace-separated token count among ``docs``.

    Args:
        docs: Iterable of document strings (each element must support
            ``.split()``).

    Returns:
        The token count of the longest document, or 0 when ``docs`` is empty
        (previously an empty input raised ``ValueError`` from ``max``).
    """
    return max((len(doc.split()) for doc in docs), default=0)

In [292]:
# The following cell trains and tests one neural network per tag.
EMBEDDING_SIZE = 12  # width of each learned word-embedding vector
set_of_training_accuracies = []  # one "<tag> Accuracy: <percent>" string per tag
set_of_test_accuracies = []      # one held-out accuracy (fraction of correct predictions) per tag
for i in range(len(tags)):
    # Use a dedicated name here: rebinding the notebook-level `docs` list to a
    # single tag's documents would break any re-run of the earlier cells that
    # index into `docs` per tag.
    train_docs = Xtrain_sets[i]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)
    # Size the embedding from the fitted vocabulary instead of a hard-coded 5400.
    # Keras word indices start at 1 and 0 is used for padding, hence the +1.
    vocab_size = len(tokenizer.word_index) + 1
    # integer encode the documents: a list of lists of word indices
    encoded_docs = integer_encode_documents(train_docs, tokenizer)
    # NOTE(review): the length is measured by whitespace splitting, which may differ
    # from the tokenizer's own tokenisation - confirm this is intended.
    max_length = get_max_token_length_per_doc(train_docs)
    # pad every document (with trailing zeros) to the length of the longest training document
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    model = Sequential()
    model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
    model.add(Flatten())  # flatten the (max_length, EMBEDDING_SIZE) embeddings into one vector per document
    model.add(Dense(1, activation='sigmoid'))  # single output node: probability that the tag applies

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])  # compile the model

    model.fit(padded_docs, Ytrain_sets[i], epochs=50, verbose=0)  # fit the model

    loss, accuracy = model.evaluate(padded_docs, Ytrain_sets[i], verbose=0)
    set_of_training_accuracies.append(f'{tags[i]} Accuracy: {accuracy*100:f}')

    # Encode and pad the held-out documents with the tokenizer fitted on the training documents.
    encoded_test_docs = integer_encode_documents(Xtest_sets[i], tokenizer)
    padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')

    # Threshold the predicted probabilities at 0.5 (p > 0.5 -> 1, otherwise 0).
    prediction = (model.predict(padded_test_docs, verbose=0) > 0.5).astype(int).flatten()
    correct = (prediction == Ytest_sets[i])  # True where the predicted label matches
    test_accuracy = np.count_nonzero(correct) / len(correct)
    set_of_test_accuracies.append(test_accuracy)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tenso

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [None]:
VOCAB_SIZE = 5400  # lower bound on the embedding vocabulary size when no pretrained matrix is given
def make_lstm_classification_model(plot=False, tag_index=0, embedding_matrix=None):
    """Build and fit an LSTM classifier for a single tag and return it.

    The original draft of this cell could not run: the ``model.fit`` call had
    an empty argument, it read the leaked loop variable ``i``, and it relied on
    ``embedding_matrix`` / ``MAX_SEQUENCE_LENGTH`` that this notebook never
    defines. Those inputs are now explicit, optional parameters or are derived
    from the training documents.

    Args:
        plot: Currently unused; kept so existing calls keep working.
        tag_index: Position of the tag in ``tags`` (and therefore in
            ``Xtrain_sets`` / ``Ytrain_sets``) to train on.
        embedding_matrix: Optional 2-D array of pretrained word vectors. When
            given, the embedding layer takes its shape from the matrix and is
            frozen. NOTE(review): its rows must line up with the word indices
            of the tokenizer fitted inside this function - confirm how the
            matrix is built before relying on this path. When omitted, a
            100-dimensional embedding is learned from scratch.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    train_docs = Xtrain_sets[tag_index]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)
    # integer encode the documents: a list of lists of word indices
    encoded_docs = integer_encode_documents(train_docs, tokenizer)
    max_length = get_max_token_length_per_doc(train_docs)  # longest training document, in tokens
    # pad every document (with trailing zeros) to that length
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    model = Sequential()
    if embedding_matrix is None:
        # No pretrained vectors: learn the embedding and mask the padding index 0 directly.
        input_dim = max(VOCAB_SIZE, len(tokenizer.word_index) + 1)
        model.add(Embedding(input_dim, 100, input_length=max_length, mask_zero=True))
    else:
        pretrained = np.asarray(embedding_matrix)
        vocab_rows, embedding_dim = pretrained.shape
        model.add(Embedding(vocab_rows, embedding_dim, weights=[pretrained], input_length=max_length, trainable=False))
        model.add(Masking(mask_value=0.0))  # masks timesteps whose embedding vector is all zeros
    # The sequence length is inferred from the embedding output; the draft's
    # input_shape argument on this non-first layer is dropped.
    model.add(LSTM(units=32))
    model.add(Dense(16))
    model.add(Dense(2, activation='softmax'))

    # The labels are used elsewhere in this notebook as single 0/1 values per document
    # (not one-hot pairs), so the sparse variant of the loss is the one that matches
    # the two-unit softmax output.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])  # compile the model

    model.fit(padded_docs, Ytrain_sets[tag_index], epochs=50, verbose=0)  # fit the model
    return model


In [None]:
# Trains and tests one neural network per tag, recording train and test accuracy.
EMBEDDING_SIZE = 12  # width of each learned word-embedding vector
set_of_training_accuracies = []  # one "<tag> Accuracy: <percent>" string per tag
set_of_test_accuracies = []      # one held-out accuracy (fraction of correct predictions) per tag
for i in range(len(tags)):
    # A dedicated name avoids overwriting the notebook-level `docs` list, which
    # earlier cells index per tag and would otherwise fail when re-run.
    train_docs = Xtrain_sets[i]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)
    # Derive the embedding size from the fitted vocabulary rather than a fixed 5400.
    # Keras word indices start at 1 and 0 is the padding value, hence the +1.
    vocab_size = len(tokenizer.word_index) + 1
    # integer encode the documents: a list of lists, the numbers are word indices
    encoded_docs = integer_encode_documents(train_docs, tokenizer)
    # NOTE(review): measured by whitespace splitting, which may differ from the
    # tokenizer's own tokenisation - confirm this is intended.
    max_length = get_max_token_length_per_doc(train_docs)  # get the max length in terms of token length
    # pad documents (with trailing zeros) to the length of the longest training document
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    model = Sequential()
    model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
    model.add(Flatten())  # flatten the (max_length, EMBEDDING_SIZE) embeddings into one vector per document
    model.add(Dense(1, activation='sigmoid'))  # one final output node, a sigmoid that outputs the probability of the tag

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])  # compile the model

    model.fit(padded_docs, Ytrain_sets[i], epochs=50, verbose=0)  # fit the model

    loss, accuracy = model.evaluate(padded_docs, Ytrain_sets[i], verbose=0)
    set_of_training_accuracies.append(f'{tags[i]} Accuracy: {accuracy*100:f}')

    # encode and pad the test documents with the tokenizer fitted on the training documents
    encoded_test_docs = integer_encode_documents(Xtest_sets[i], tokenizer)
    padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')

    # for the given tag, classify each product by its probability: 1 if p > 0.5, otherwise 0,
    # then flatten the array for the comparison below
    prediction = (model.predict(padded_test_docs, verbose=0) > 0.5).astype(int).flatten()
    correct = (prediction == Ytest_sets[i])  # True/False array, True where the tag was predicted correctly
    test_accuracy = np.count_nonzero(correct) / len(correct)  # share of True values
    set_of_test_accuracies.append(test_accuracy)  # append the tag accuracy to the list of accuracies

In [293]:
#Viewing test accuracy for comparison to other models
# One line per tag: the tag name, a fixed 15-space gap, then its test accuracy.
for idx, tag in enumerate(tags):
    tag_accuracy = set_of_test_accuracies[idx]
    print(f'{tag} test accuracy is {"":>15}{tag_accuracy}')

classic test accuracy is                0.5994897959183674
modern test accuracy is                0.5816326530612245
casual test accuracy is                0.7206632653061225
romantic test accuracy is                0.7920918367346939
glam test accuracy is                0.8150510204081632
businesscasual test accuracy is                0.6479591836734694
edgy test accuracy is                0.8252551020408163
retro test accuracy is                0.9017857142857143
androgynous test accuracy is                0.6135204081632653
boho test accuracy is                0.8635204081632653
athleisure test accuracy is                0.923469387755102
studs test accuracy is                0.9285714285714286
embroidery test accuracy is                0.7857142857142857
trim test accuracy is                0.8571428571428571
ruffles test accuracy is                0.7142857142857143
mesh test accuracy is                0.8571428571428571
fringe test accuracy is                1.0
lace test accurac

In [294]:
# Unweighted mean of the per-tag test accuracies (every tag counts equally).
accuracy_total = sum(set_of_test_accuracies)
accuracy_total/len(set_of_test_accuracies)

0.8435369219774127