In [None]:
import gensim
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Dense, LSTM, GRU, Embedding, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import os

import warnings 
## warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

## Data Loading and Exploratory Data Analysis

In [None]:
## Load the e-commerce data set from a CSV file.
## The input file has no header row: the first column is the label
## (product category) and the second column is the product and its
## description as a single string.
df = pd.read_csv(
    './data/ecommerceDataset.csv',
    header=None,
    names=['product_category', 'raw_text'],
)
df

In [None]:
## Count the occurrences of each product category
category_counts = df['product_category'].value_counts()
category_counts

## Data Cleaning

The data has no missing values, but the product descriptions are free-form text. We plan to use the GloVe pre-trained word vectors to encode all the words we can match from the product descriptions. Before that, the first steps are to convert all the text to lower case, replace all punctuation and non-alphabetic characters (except hyphens) with spaces, drop single-letter tokens, and remove English stop words.

In [None]:
## Matches every run of characters that is neither a lower-case letter a-z nor a hyphen
match_non_alpha = re.compile(r'[^-a-z]+')

def clean_words(text):
    """
    Normalize one raw product description into a string of cleaned words.

    1) Change all text to lower case.
    2) Substitute a space for every run of non-alphabetic characters
       (hyphens are allowed to remain).
    3) Split into word tokens and drop English stop words
       (using ENGLISH_STOP_WORDS from sklearn.feature_extraction.text).
    4) Remove single-letter tokens.
    5) Return a string which joins all remaining words with single spaces.

    Input that is not a string (for example None, or the float NaN that
    pandas uses for a missing cell) returns an empty string instead of
    raising AttributeError on .lower().
    """
    if not isinstance(text, str):
        return ''
    lowered = match_non_alpha.sub(' ', text.lower())
    ## The length test runs first, so empty strings produced by leading or
    ## trailing separators are discarded together with single letters.
    kept = [
        token
        for token in lowered.split(' ')
        if len(token) > 1 and token not in ENGLISH_STOP_WORDS
    ]
    return ' '.join(kept)
        
## Function to count the words in a text
def count_words(text):
    """
    Return the number of whitespace-separated words in text.

    An empty or all-blank string counts as 0 words; splitting on a literal
    ' ' would report 1 for it because ''.split(' ') is [''].
    """
    return len(text.split())

In [None]:
## Add the cleaned description and its word count as new columns
df['cleaned_text'] = df['raw_text'].map(clean_words)
df['word_counts'] = df['cleaned_text'].map(count_words)
df

In [None]:
## Summary statistics of the word counts of the cleaned descriptions
df.loc[:, 'word_counts'].describe()

In [None]:
## It turns out that after removing stop words, punctuation, etc.,
## a few of the cleaned_text values are empty. These are the rows
## we will drop from the data frame.

df.loc[df['cleaned_text'].eq('')]

In [None]:
## Keep only the rows whose cleaned text is not empty
df = df.loc[df['cleaned_text'].ne('')]
df

In [None]:
## Flatten the (cleaned) product descriptions into one list of words
## and identify the 50 most frequently occurring words
corpus = df['cleaned_text'].tolist()

all_words = [word for text in corpus for word in text.split(' ')]

print(f'Total Words in the corpus = {len(all_words)}')

all_word_series = pd.Series(all_words)
display(all_word_series.describe())

word_frequencies = Counter(all_words)
top_50 = pd.DataFrame(word_frequencies.most_common(50), columns=['word', 'frequency'])

top_50

In [None]:
## Bar chart of the top 50 most frequently occurring words
fig, ax = plt.subplots(figsize=(20, 10))

g = sns.barplot(data=top_50, x='word', y='frequency', ax=ax)

## Tilt the word labels so that all 50 fit along the x axis
plt.setp(ax.get_xticklabels(), rotation=45, fontsize=14)
plt.setp(ax.get_yticklabels(), fontsize=14)

ax.set_xlabel('Words', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.set_title('Top 50 Words', fontsize=17)

plt.show()

## Model Development

We plan to use a supervised LSTM model to classify the products into the 4 product categories.
Therefore we need to reserve some of the data for out-of-sample testing; we will hold out 20% for testing. We will encode the cleaned text from the product descriptions as sequences of GloVe vectors (300 floats for each word).

We are using GloVe pre-trained word vectors from this website: https://nlp.stanford.edu/projects/glove/

Reference: Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014.

GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/pubs/glove.pdf

The GloVe vectors are trained on a large corpus of text, in such a way that words with similar meanings have similar directions in the vector space.

We will be matching all the words in our product descriptions with a dictionary of the GloVe vectors, thereby converting a sequence of words into a sequence of these vectors. Words not in the dictionary will be dropped, but most commonly used words should be found. The dictionary we are using has a 1.9 million token vocabulary. To build an LSTM model we need all the word sequences to be of a uniform length. Therefore we will be truncating or padding all the word sequences to a uniform length of 150 tokens.

In [None]:
## Reserve 20 % of the data for testing; the fixed seed keeps the
## shuffled split the same on every run.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 42
train, test = train_test_split(
    df,
    train_size=TRAIN_FRACTION,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [None]:
## Function to Load the GloVe dictionary of word embeddings.
## Using GloVe pre-trained word vectors from this website: https://nlp.stanford.edu/projects/glove/
## 
## Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. 
## GloVe: Global Vectors for Word Representation. https://nlp.stanford.edu/pubs/glove.pdf
##
## This particular set of word vectors is described as:
## Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): glove.42B.300d.zip
## 

def get_glove_vectors(filename="data/glove.42B.300d.txt", vocab=None):
    """
    Load pre-trained GloVe word vectors from a text file into a dictionary.

    Adapted from
    https://campus.datacamp.com/courses/recurrent-neural-networks-for-language-modeling-in-python/rnn-architecture?ex=7

    Every line is split on whitespace: the first field is the word and the
    remaining fields are parsed as its float32 coefficients.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded GloVe text file.
    vocab : iterable of str, optional
        When given, only words contained in it are loaded, which keeps the
        dictionary small if just a known vocabulary is needed. The default
        (None) loads every word in the file.

    Returns
    -------
    dict
        Maps each loaded word to a 1-D float32 numpy array.

    Notes
    -----
    A line that cannot be parsed (no fields at all, or a coefficient that is
    not a number) is reported with print and skipped; loading then continues.
    """
    wanted = None if vocab is None else set(vocab)
    glove_vector_dict = {}
    with open(filename, encoding="UTF-8") as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            try:
                word = values[0]
                ## Filter before parsing so unwanted words cost no float conversion
                if wanted is not None and word not in wanted:
                    continue
                glove_vector_dict[word] = np.asarray(values[1:], dtype='float32')
            except (IndexError, ValueError) as inst:
                print(f'error on line {line_number} {type(inst)}')
                print(line)
    return glove_vector_dict

## Load the GloVe dictionary and report how long the load takes
import time

start = time.time()
glove_vector_dict = get_glove_vectors()
end = time.time()

elapsed_seconds = end - start
print(f'elapsed seconds = {elapsed_seconds}')
type(glove_vector_dict)


In [None]:
def glove_word_embeddings(documents, pad_to=150, vectors=None):
    """
    Encode documents as fixed-length sequences of GloVe vectors.

    Each document is split on single spaces and every word found in the
    embedding dictionary is replaced by its vector; words that are not found
    are skipped. The resulting sequence is brought to exactly pad_to vectors:
    shorter sequences are padded with zero vectors at the front, longer ones
    keep only their last pad_to vectors. This is the same layout the earlier
    pad_sequences-based version produced with its default settings.

    If none of the words of a document match, the document is encoded as the
    single place-holder word "neutral".

    Parameters
    ----------
    documents : iterable of str
        The cleaned texts, words separated by single spaces.
    pad_to : int
        Number of vectors in every encoded sequence.
    vectors : dict, optional
        Maps words to 1-D numpy arrays of equal length and must contain the
        key "neutral". Defaults to the module-level glove_vector_dict.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (number of documents, pad_to, vector length).
    """
    if vectors is None:
        vectors = glove_vector_dict
    neutral = vectors["neutral"]
    documents = list(documents)
    ## Fill one preallocated array instead of collecting per-document arrays
    ## and copying them all again at the end.
    encoded = np.zeros((len(documents), pad_to, len(neutral)), dtype='float32')
    for row, doc in enumerate(documents):
        found = []
        for word in doc.split(' '):
            vector = vectors.get(word)
            if vector is not None:
                found.append(vector)
        if not found:
            found = [neutral]
        ## Keep the last pad_to vectors and right-align them in the row
        tail = found[max(len(found) - pad_to, 0):]
        if tail:
            encoded[row, pad_to - len(tail):] = tail
    return encoded

In [None]:
## Display the training split produced by train_test_split
train

In [None]:
%%time

X_train = glove_word_embeddings(train['cleaned_text'])
display(X_train.shape)
labelEncoderOnehot = LabelBinarizer()
y_train = labelEncoderOnehot.fit_transform(train['product_category'])
display(y_train.shape)

In [None]:
## Hyper-parameters of the network
DROPOUT = 0.2
UNITS_PER_LAYER = 64
SEQUENCE_LENGTH = 150    ## vectors per encoded document
EMBEDDING_DIM = 300      ## floats per GloVe vector
NUM_CLASSES = 4          ## product categories
LEARNING_RATE = 0.0002

## Bidirectional LSTM model, as in this example
## https://keras.io/examples/nlp/bidirectional_lstm_imdb/

from tensorflow import keras
from tensorflow.keras import layers

inputs = keras.Input(shape=(SEQUENCE_LENGTH, EMBEDDING_DIM))

## Three stacked bidirectional LSTM layers; the first two return the whole
## sequence, the last one returns a single vector for the classifier.
x = inputs
for keep_sequences in (True, True, False):
    recurrent = LSTM(units=UNITS_PER_LAYER, return_sequences=keep_sequences, dropout=DROPOUT)
    x = layers.Bidirectional(recurrent)(x)

## Add a softmax classifier with one output per product category
outputs = layers.Dense(NUM_CLASSES, activation='softmax')(x)
model = keras.Model(inputs, outputs)

opt = Adam(learning_rate=LEARNING_RATE)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

## Checkpoint the weights, monitoring the validation accuracy
file_name = 'weights_{epoch:03d}_{val_accuracy:.4f}.hdf5'
checkpoint_filepath = os.path.join('.', 'SAVE_MODELS', file_name)
modelCheckpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

## Early stopping watches the validation loss with a patience of 6 epochs
earlyStopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=6,
    restore_best_weights=True,
    verbose=1,
)

model.summary()

In [None]:
## Train the model, holding out 20% of the training data for validation.
## NOTE(review): a batch size of 1 is unusually small and will make each
## epoch slow - confirm that it is intended.
BATCH_SIZE = 1
MAX_EPOCHS = 100
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
    validation_split=0.2,
    callbacks=[earlyStopping, modelCheckpoint],
)