# Creating Standard Neural Network for Prediction

In [201]:
#Following the code here: https://github.com/ychennay/dso-560-nlp-and-text-analytics/blob/master/week7/Deep%20Learning%20with%20Word%20Embeddings.ipynb

In [12]:
import pandas as pd
import numpy as np
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from typing import List
from pprint import pprint

In [13]:
#1. Prepare to Iterate through All Tags
#- The goal is to train a neural network to predict for each tag
#- In order to do so, we must read in the CSVs created by the "One Hot Encoding and Duplicate Removal" python notebook

# Labelled data produced by the notebook that gathers and cleans the input variables,
# plus the untagged product documents (column 'final') that will be predicted on.
fulldata = pd.read_csv("full_data_joined_attr_processed.csv")
testdata = pd.read_csv("full_data_to_predict.csv")['final'].astype('str').tolist()

# The attribute categories handled by our group (5 in total).
chosen_cats = ['style','embellishment','occasion','category','dry_clean_only']

# Unique tags per category, in order of first appearance in the data.
tags_by_category = {
    cat: list(fulldata.loc[fulldata['attribute_name'] == cat, 'attribute_value'].unique())
    for cat in chosen_cats
}

# Per-category lists, kept under their original names.
styles = tags_by_category['style']
embels = tags_by_category['embellishment']
occasi = tags_by_category['occasion']
catego = tags_by_category['category']
drycle = tags_by_category['dry_clean_only']

# Single flat list of every tag, grouped by category in the order of chosen_cats.
tags = [tag for cat in chosen_cats for tag in tags_by_category[cat]]
tags

In [15]:
# Read the documents and 0/1 labels associated with each tag.
# NOTE: "One Hot Encoding and Duplicate Removal.ipynb" must be run first to create
# the "<tag>_NN.csv" files used in this cell.

docs = [] #for the X values in our neural net
labels = [] #for the Y values in our neural net
for tag in tags:
    # Parse each CSV once and take both columns from the same frame
    # (previously the file was read twice per tag).
    tag_frame = pd.read_csv(f"{tag}_NN.csv")
    docs.append(tag_frame['final']) # 'final' is the column of product documents
    labels.append(array(tag_frame[tag])) # the column named after the tag holds its one-hot label

In [16]:
# The models are only used for prediction, so every document of each per-tag
# dataset goes into training: the training-set size is simply the document count.
training_sets = [len(docs[idx]) for idx in range(len(tags))]
training_sets

[3916,
 3916,
 3916,
 3916,
 3916,
 3916,
 3916,
 3916,
 3916,
 3916,
 3916,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 70,
 3914,
 3914,
 3914,
 3914,
 3914,
 3914,
 3914,
 3969,
 3969,
 3969,
 3969,
 3969,
 3969,
 3969,
 3969,
 2776,
 2776]

In [18]:
# We train on the full data set, so no held-out Xtest_sets / Ytest_sets are built.
# (During validation they were the slices from training_sets[i] onward.)

# X: the first training_sets[i] documents of each tag; Y: the matching labels.
Xtrain_sets = [docs[idx].iloc[:training_sets[idx]] for idx in range(len(tags))]
Ytrain_sets = [labels[idx][:training_sets[idx]] for idx in range(len(tags))]

#### Training the Neural Nets Iteratively for Each Tag

In [19]:
#creating some useful functions used in the next cell
def integer_encode_documents(docs: List[str], tokenizer: "Tokenizer") -> List[List[int]]:
    """Convert each document into the list of integer word indices of its tokens.

    This is the single definition of the helper. An earlier hand-rolled variant
    that looked every word up in ``tokenizer.word_index`` was defined first and
    then silently shadowed; it has been removed because it raised ``KeyError``
    for any word the tokenizer had not been fitted on (e.g. in the documents to
    predict on).

    Args:
        docs: The documents to encode (any iterable of strings that
            ``tokenizer.texts_to_sequences`` accepts).
        tokenizer: A tokenizer that has already been fitted on the training
            documents.

    Returns:
        One list of integer indices per document, as produced by
        ``tokenizer.texts_to_sequences``.
    """
    return tokenizer.texts_to_sequences(docs)

def get_max_token_length_per_doc(docs: List[str]) -> int:
    """Return the largest number of whitespace-separated tokens in any document.

    Args:
        docs: The documents to measure; each one is a plain string.

    Returns:
        The token count of the longest document, or 0 when ``docs`` is empty.
    """
    # default=0 keeps an empty collection from raising ValueError inside max().
    return max((len(doc.split()) for doc in docs), default=0)

In [42]:
# This cell trains one binary classifier per tag on the full labelled data and then
# uses it to predict that tag for every untagged product.
EMBEDDING_SIZE = 12 # kept small to keep dimensions low (the reference example used 8; another example used 100)
set_of_training_accuracies = [] # accuracy of each model on its own training data
set_of_predictions = [] # for recording the predictions on the untagged data

for i in range(0,len(tags)):
    # Use a dedicated name: assigning to `docs` here would overwrite the list of
    # per-tag documents built earlier and break a re-run of the cells that use it.
    train_docs = Xtrain_sets[i] # gather one training set at a time
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)

    # Size the embedding from the fitted vocabulary instead of a fixed guess.
    # Word indices start at 1 (0 is used for padding), hence the + 1.
    vocab_size = len(tokenizer.word_index) + 1

    # integer encode the documents: a list of lists, each number is the index of a word
    encoded_docs = integer_encode_documents(train_docs, tokenizer)
    # Longest training document, counted in whitespace-separated tokens.
    # NOTE(review): the tokenizer may split text differently from str.split();
    # confirm that no encoded document is longer than max_length.
    max_length = get_max_token_length_per_doc(train_docs)

    # pad every document (with trailing zeros) to the length of the longest one
    padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

    model = Sequential() # simple neural network
    model.add(Embedding(vocab_size, EMBEDDING_SIZE, input_length=max_length))
    model.add(Flatten()) # flattens the (max_length, EMBEDDING_SIZE) embeddings into one vector
    model.add(Dense(1, activation='sigmoid')) # single output node: probability that the tag applies

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) # compile the model

    model.fit(padded_docs, Ytrain_sets[i], epochs=50, verbose=0) # fit the model

    # Accuracy on the training data itself (there is no held-out set at this stage).
    loss, accuracy = model.evaluate(padded_docs, Ytrain_sets[i], verbose=0)
    set_of_training_accuracies.append(accuracy)

    # Encode and pad the untagged documents with the tokenizer fitted on the training data.
    encoded_test_docs = integer_encode_documents(testdata, tokenizer)
    padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')

    # Threshold the predicted probabilities at 0.5 into 1.0 / 0.0 and flatten to 1-D.
    prediction = model.predict(padded_test_docs, verbose=0)
    prediction = (prediction > 0.5).astype(prediction.dtype).flatten()
    set_of_predictions.append(prediction)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tenso

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [64]:
#Create the output with predictions on all untagged products (i.e. the test data set)

# One positional column (1, 2, ...) per tag, in the same order as `tags`.
prediction_columns = {idx + 1: set_of_predictions[idx] for idx in range(len(tags))}
output = pd.DataFrame({
    'product_id': pd.read_csv("full_data_to_predict.csv")['product_id'],
    **prediction_columns,
})

In [67]:
# Prepend the identifier column name so that `tags` lines up with the columns of `output`.
# Guarded so that re-running this cell does not insert 'product_id' a second time.
if 'product_id' not in tags:
    tags.insert(0,'product_id')

In [68]:
# Replace the positional column labels with 'product_id' followed by the tag names.
# `tags` must hold exactly one entry per column of `output` at this point.
output.columns = tags

In [69]:
# Display the prediction table: one row per untagged product, one 0/1 column per tag.
output

Unnamed: 0,product_id,classic,modern,casual,romantic,glam,businesscasual,edgy,retro,androgynous,...,top,onepiece,bottom,shoe,sweater,accessory,blazerscoatsjackets,sweatshirthoodie,yes,no
0,01DSRPSZTDW2PGK1YWYXJGKZZ0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,01DSQXJBX0R7DCW7KTAC1SW547,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,01DPGV8TGRAB993PF7Z3YWG2VR,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,01DSR8G3F7DBRTMP8THF97XSQ2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,01DSR8G5GP519DEDCSKBMWQVK5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44135,01E2P33K4RRKYDCGN6WX8X1HCJ,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
44136,01E2P30V4VR23GS7ZT9CTQZBCG,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
44137,01E2P4J0GC34Z496D78AMNV0WC,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
44138,01E5ZW1B7Q3AFDD1RTXC4RTZZN,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


### Important Note/Weakness of This Output

Each model predicts a 1/0 (yes/no) for a single tag, independently of every other tag. This is a good strategy for categories that follow a "Select All That Apply" criterion (e.g. a product can be classified under multiple styles: athleisure, casual, etc.).

But for categories that follow a "Choose 1" criterion (Category and Dry Clean Only), this method fails, because nothing prevents two mutually exclusive tags from both being predicted as 1. See the cell below this one for a demonstration.

Our group knows that a multi-class classification neural network, with a softmax activation function and a categorical cross-entropy loss function, would be the correct starting point for resolving this issue. Frankly, we did not have time to resolve it in the proper manner.

In [96]:
# Number of products predicted as both "top" and "bottom". This should be 0, as a
# product should only be classified as either "top" or "bottom" and not both.
(output['top'] + output['bottom'] >= 2).sum()

17

In [98]:
# Attach the predictions to the original file, matching rows on product_id.
x = pd.read_csv("full_data_to_predict.csv") # the original, untagged file
output2 = pd.merge(x, output, on="product_id") # same join as x.merge(output, on="product_id")

In [108]:
# Columns of the merged frame that are not wanted in the final output.
columns_to_drop = [
    'Unnamed: 0', 'dry_clean_only', 'category', 'combined_data', 'rm_sw',
    'lemmatized', 'final_list',
    'studs_x', 'sequins_x', 'embroidery_x', 'trim_x', 'ruffles_x', 'mesh_x',
    'lace_x', 'fringe_x', 'buckles_x',
    'crystals', 'rhinestone', 'patches', 'epaulets', 'beaded',
    'modern_x', 'romantic_x', 'classic_x', 'casual_x', 'businesscasual_x',
    'glam_x', 'edgy_x', 'retro_x', 'androgynous_x', 'boho_x', 'athleisure_x',
]
output3 = output2.drop(columns=columns_to_drop)

In [112]:
# Write the final predictions to disk.
# NOTE(review): with the default settings the DataFrame index is written as an extra
# unnamed first column; pass index=False if consumers of output.csv do not want it — confirm.
output3.to_csv("output.csv")