In [1]:
# import analysis packages
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import keras
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Embedding, Flatten, SimpleRNN, TextVectorization
from keras.models import Sequential
from keras.regularizers import l2
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import tensorflow as tf

### Data Preprocessing

In [2]:
# read data from .csv files
trainDF = pd.read_csv('./ibotta_train.csv')
testDF = pd.read_csv('./ibotta_test.csv')

# combine data sets for preprocessing; 'origin' records which file each row came from
# (the concatenated frame keeps each file's own row labels, train rows first)
trainDF['origin'] = 'train'
testDF['origin'] = 'test'
fullDF = pd.concat([trainDF, testDF])

# text cleaning: missing brands become empty strings, then both text fields are
# lower-cased and stripped of apostrophes, commas, colons, hyphens and periods
# (assigning the result back avoids an in-place edit of a column selection, which
# does not reach fullDF when pandas copy-on-write is active)
fullDF['Brand_name'] = fullDF['Brand_name'].fillna('')
fullDF['Brand_name'] = fullDF['Brand_name'].str.lower().str.replace(r"[',:.\-]", '', regex = True)
fullDF['Name'] = fullDF['Name'].str.lower().str.replace(r"[',:.\-]", '', regex = True)

# combine brand and name fields: prepend the brand whenever the name does not
# already contain it (an empty brand is always "found", so those names stay as-is)
brandMissing = fullDF.apply(lambda x: x['Brand_name'] not in x['Name'], axis = 1).to_numpy()
fullDF.loc[brandMissing, 'Name'] = fullDF.loc[brandMissing, 'Brand_name'] + \
    ' ' + fullDF.loc[brandMissing, 'Name']

# set random seed
random.seed(542023)

# split data
trainDF = pd.DataFrame(fullDF.loc[fullDF['origin'] == 'train'].drop('origin', axis = 1))
# hold out 1000 training rows; validIdx stores row labels (Id - 1), the same labels
# used to split the vectorized feature frames -- assumes Id runs 1..n in file order
validIdx = random.sample(list(trainDF['Id'] - 1), 1000)
validDF = trainDF.loc[validIdx]
# remove the held-out rows by row label, i.e. the same key that selected them above
# (comparing Id itself against validIdx would drop the wrong, shifted-by-one rows)
trainDF = trainDF.loc[~trainDF.index.isin(validIdx)]
testDF = pd.DataFrame(fullDF.loc[fullDF['origin'] == 'test'].drop(['origin', 'Category'], axis = 1))

### Data Vectorization

In [8]:
# find total number of unique words (names are tokenized on single spaces)
unique_words = np.unique(np.array(' '.join(np.array(fullDF['Name'])).split(' ')))
max_length = len(unique_words)

# map every word to its column position in the word bag
wordColumn = {word: col for col, word in enumerate(unique_words)}

# fill the presence matrix in numpy: one vectorized write per product name is far
# cheaper than assigning single cells through DataFrame.loc
presence = np.zeros((len(fullDF), max_length))
for i, productName in enumerate(fullDF['Name'].apply(lambda x: x.split(' '))):
    # identify word presence
    presence[i, [wordColumn[word] for word in productName]] = 1

# one row per product (in fullDF order), one column per unique word
wordBag = pd.DataFrame(presence, columns = unique_words)

# split word bag: fullDF lists the train rows first, followed by the test rows,
# so the boundary is the number of rows that came from the train file
nTrain = int((fullDF['origin'] == 'train').sum())
trainDFwordBag = wordBag.iloc[:nTrain]
validDFwordBag = trainDFwordBag.loc[validIdx]
trainDFwordBag = trainDFwordBag.loc[~trainDFwordBag.index.isin(validIdx)]
testDFwordBag = wordBag.iloc[nTrain:]

In [4]:
# wrap each tokenized product name in a TaggedDocument whose tag is the row position
tagged_data = [
    TaggedDocument(words, [position])
    for position, words in enumerate(fullDF['Name'].str.split(' '))
]

# doc2vec vectorization function
def doc2vecGen(size):
    """Train a Doc2Vec model on all product names and return the split document vectors.

    Parameters
    ----------
    size : int
        Dimensionality of the document vectors (passed to Doc2Vec as ``vector_size``).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)`` frames with one row per product and ``size`` columns.
        ``valid`` holds the rows labelled by ``validIdx``, ``train`` the remaining rows
        that came from the train file, and ``test`` the rows from the test file.
    """
    # create the model without a corpus so that the vocabulary is built and the
    # weights are trained exactly once, in the two explicit steps below
    tokenizer = Doc2Vec(vector_size = size, min_count = 1, epochs = 100)
    tokenizer.build_vocab(tagged_data)
    # total_examples must be the number of documents, not the vector size
    tokenizer.train(tagged_data,
                    total_examples = tokenizer.corpus_count,
                    epochs = tokenizer.epochs)

    # vectorize text data: infer one vector per name and stack them row-wise
    inferred = [tokenizer.infer_vector(name.split(' ')) for name in fullDF['Name']]
    doc2vecDF = pd.DataFrame(np.vstack(inferred))

    # split vectorized data: fullDF lists the train rows first, then the test rows
    nTrain = int((fullDF['origin'] == 'train').sum())
    trainDFdoc2vec = doc2vecDF.iloc[:nTrain]
    validDFdoc2vec = doc2vecDF.loc[validIdx]
    trainDFdoc2vec = trainDFdoc2vec.loc[~trainDFdoc2vec.index.isin(validIdx)]
    testDFdoc2vec = doc2vecDF.iloc[nTrain:]

    return(trainDFdoc2vec, validDFdoc2vec, testDFdoc2vec)

# generate vectorized data
# trainDFdoc2vecMAX, validDFdoc2vecMAX, testDFdoc2vecMAX = doc2vecGen(max_length)
# trainDFdoc2vec1000, validDFdoc2vec1000, testDFdoc2vec1000 = doc2vecGen(1000)
# trainDFdoc2vec500, validDFdoc2vec500, testDFdoc2vec500 = doc2vecGen(500)
# trainDFdoc2vec100, validDFdoc2vec100, testDFdoc2vec100 = doc2vecGen(100)

In [5]:
# keras tokenizer function
def textVecGen(size, seq_length = None):
    """Encode every product name as a fixed-length sequence of integer token ids.

    Parameters
    ----------
    size : int
        Maximum vocabulary size (``max_tokens``) of the TextVectorization layer.
    seq_length : int, optional
        Number of token ids per name (``output_sequence_length``). Defaults to
        ``size``, which reproduces the original behaviour; a smaller value avoids
        carrying long runs of padding when names are much shorter than ``size``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)`` frames with one row per product and ``seq_length``
        columns. ``valid`` holds the rows labelled by ``validIdx``, ``train`` the
        remaining rows from the train file, and ``test`` the rows from the test file.
    """
    if seq_length is None:
        seq_length = size

    # train keras tokenizer on the names of both data sets
    tokenizer = TextVectorization(max_tokens = size,
                                  output_sequence_length = seq_length)
    tokenizer.adapt(fullDF['Name'])

    # vectorize data (converted to a plain array before building the frame)
    textVecDF = pd.DataFrame(np.asarray(tokenizer(fullDF['Name'])))

    # split vectorized data: fullDF lists the train rows first, then the test rows
    nTrain = int((fullDF['origin'] == 'train').sum())
    trainDFtextVec = textVecDF.iloc[:nTrain]
    validDFtextVec = trainDFtextVec.loc[validIdx]
    trainDFtextVec = trainDFtextVec.loc[~trainDFtextVec.index.isin(validIdx)]
    testDFtextVec = textVecDF.iloc[nTrain:]

    return(trainDFtextVec, validDFtextVec, testDFtextVec)

# build the token-id features once per vocabulary size (full vocabulary, 1000, 500, 100)
trainDFtextVecMAX, validDFtextVecMAX, testDFtextVecMAX = textVecGen(size = max_length)
trainDFtextVec1000, validDFtextVec1000, testDFtextVec1000 = textVecGen(size = 1000)
trainDFtextVec500, validDFtextVec500, testDFtextVec500 = textVecGen(size = 500)
trainDFtextVec100, validDFtextVec100, testDFtextVec100 = textVecGen(size = 100)

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2023-05-05 12:04:08.952118: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


### Model Evaluation

In [6]:
# model evaluation function
def EvaluateModel(model, X_train, y_train = None):
    """Compile and fit ``model`` on ``X_train``, then report and plot its learning curves.

    Parameters
    ----------
    model : keras model
        Model to train; it is compiled here with the rmsprop optimizer and
        categorical cross-entropy loss.
    X_train : array-like
        Training features, one row per training example.
    y_train : array-like of int, optional
        Integer class codes aligned row-for-row with ``X_train``; they are one-hot
        encoded before fitting. Defaults to ``trainDF['Cat_code']``.

    Notes
    -----
    15% of the supplied rows are held out through ``validation_split`` and training
    stops once ``val_loss`` has not improved for 3 epochs. The accuracy displayed is
    that of the last epoch run, which is not necessarily the best epoch.
    """
    # fall back to the notebook-level training labels
    if y_train is None:
        y_train = trainDF['Cat_code']

    # set random seeds
    np.random.seed(542023)
    tf.random.set_seed(542023)

    # print model summary; stay best-effort, but report why it failed instead of
    # silently swallowing every exception (including KeyboardInterrupt)
    try:
        model.summary()
    except Exception as err:
        print(f"Model summary unavailable: {err}")

    # compile model
    model.compile(optimizer = 'rmsprop',
                  loss = 'categorical_crossentropy',
                  metrics = ['accuracy'])

    # define early stopping criterion
    early = EarlyStopping(monitor = 'val_loss', mode = 'min', patience = 3)

    # train deep learning model (callbacks is documented as a list of callbacks)
    trained = model.fit(X_train,
                        to_categorical(y_train),
                        epochs = 100,
                        batch_size = 32,
                        callbacks = [early],
                        validation_split = 0.15,
                        verbose = 1)

    # prepare model evaluation
    acc = trained.history['accuracy']
    val_acc = trained.history['val_accuracy']
    loss = trained.history['loss']
    val_loss = trained.history['val_loss']
    epochs = range(1, len(acc) + 1)

    # final validation accuracy
    display(f"Internal validation accuracy: {round(val_acc[-1] * 100, 2)}%")

    # plot training and validation curves: accuracy on top, loss below
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize = (15, 5))
    ax1.plot(epochs, acc, 'bo', label = 'Training acc')
    ax1.plot(epochs, val_acc, 'b', label = 'Validation acc')
    ax1.set(xlabel = 'Epochs', ylabel = 'Accuracy')
    ax1.legend()
    ax2.plot(epochs, loss, 'bo', label = 'Training loss')
    ax2.plot(epochs, val_loss, 'b', label = 'Validation loss')
    ax2.set(xlabel = 'Epochs', ylabel = 'Loss')
    ax2.legend()
    fig.suptitle('Evaluation Metrics')

### Model Fitting

In [None]:
# architecture: token embedding -> one recurrent layer -> two dense layers -> 7-way softmax
# (input_dim matches the vocabulary size used to build trainDFtextVec100)
model = Sequential(layers = [
    Embedding(input_dim = 100, output_dim = 128),
    SimpleRNN(units = 32),
    Dense(units = 128, activation = 'relu'),
    Dense(units = 64, activation = 'relu'),
    Dense(units = 7, activation = 'softmax')
])

# fit the model and plot its learning curves
EvaluateModel(model, trainDFtextVec100)

In [None]:
# architecture: small token embedding -> three stacked recurrent layers -> 7-way softmax
# (the first two recurrent layers return full sequences so the next one can consume them)
model = Sequential(layers = [
    Embedding(input_dim = 100, output_dim = 8),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32),
    Dense(units = 7, activation = 'softmax')
])

# fit the model and plot its learning curves
EvaluateModel(model, trainDFtextVec100)

In [None]:
# architecture: token embedding -> one recurrent layer -> two dense layers -> 7-way softmax
# (input_dim matches the vocabulary size used to build trainDFtextVec500)
model = Sequential(layers = [
    Embedding(input_dim = 500, output_dim = 128),
    SimpleRNN(units = 32),
    Dense(units = 128, activation = 'relu'),
    Dense(units = 64, activation = 'relu'),
    Dense(units = 7, activation = 'softmax')
])

# fit the model and plot its learning curves
EvaluateModel(model, trainDFtextVec500)

In [None]:
# architecture: small token embedding -> three stacked recurrent layers -> 7-way softmax
# (the first two recurrent layers return full sequences so the next one can consume them)
model = Sequential(layers = [
    Embedding(input_dim = 500, output_dim = 8),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32),
    Dense(units = 7, activation = 'softmax')
])

# fit the model and plot its learning curves
EvaluateModel(model, trainDFtextVec500)

In [None]:
# architecture: token embedding -> one recurrent layer -> two dense layers -> 7-way softmax
# (input_dim matches the vocabulary size used to build trainDFtextVec1000)
model = Sequential(layers = [
    Embedding(input_dim = 1000, output_dim = 128),
    SimpleRNN(units = 32),
    Dense(units = 128, activation = 'relu'),
    Dense(units = 64, activation = 'relu'),
    Dense(units = 7, activation = 'softmax')
])

# fit the model and plot its learning curves
EvaluateModel(model, trainDFtextVec1000)

In [None]:
# architecture: small token embedding -> three stacked recurrent layers -> 7-way softmax
# (the first two recurrent layers return full sequences so the next one can consume them)
model = Sequential(layers = [
    Embedding(input_dim = 1000, output_dim = 8),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32),
    Dense(units = 7, activation = 'softmax')
])

# fit the model and plot its learning curves
EvaluateModel(model, trainDFtextVec1000)

In [None]:
# define model architecture: token embedding -> one recurrent layer -> two dense
# layers -> 7-way softmax. The embedding size is taken from max_length, the value
# used to build trainDFtextVecMAX, instead of a hard-coded vocabulary count that
# silently goes stale when the data changes.
model = Sequential([
    Embedding(max_length, 128),
    SimpleRNN(32),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFtextVecMAX)

In [None]:
# define model architecture: small token embedding -> three stacked recurrent
# layers -> 7-way softmax. The embedding size is taken from max_length, the value
# used to build trainDFtextVecMAX, instead of a hard-coded vocabulary count that
# silently goes stale when the data changes.
model = Sequential([
    Embedding(max_length, 8),
    SimpleRNN(32, return_sequences = True),
    SimpleRNN(32, return_sequences = True),
    SimpleRNN(32),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFtextVecMAX)

In [None]:
# architecture: embedding -> one recurrent layer -> two dense layers -> 7-way softmax
# NOTE(review): an Embedding layer looks up integer ids, while doc2vecGen returns
# inferred document vectors (presumably real-valued) -- confirm this pairing before
# re-enabling the evaluation call below
model = Sequential(layers = [
    Embedding(input_dim = 100, output_dim = 128),
    SimpleRNN(units = 32),
    Dense(units = 128, activation = 'relu'),
    Dense(units = 64, activation = 'relu'),
    Dense(units = 7, activation = 'softmax')
])

# evaluation is currently disabled (the doc2vec features are not generated)
# EvaluateModel(model, trainDFdoc2vec100)

In [None]:
# architecture: small embedding -> three stacked recurrent layers -> 7-way softmax
# NOTE(review): an Embedding layer looks up integer ids, while doc2vecGen returns
# inferred document vectors (presumably real-valued) -- confirm this pairing before
# re-enabling the evaluation call below
model = Sequential(layers = [
    Embedding(input_dim = 100, output_dim = 8),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32),
    Dense(units = 7, activation = 'softmax')
])

# evaluation is currently disabled (the doc2vec features are not generated)
# EvaluateModel(model, trainDFdoc2vec100)

In [None]:
# architecture: embedding -> one recurrent layer -> two dense layers -> 7-way softmax
# NOTE(review): an Embedding layer looks up integer ids, while doc2vecGen returns
# inferred document vectors (presumably real-valued) -- confirm this pairing before
# re-enabling the evaluation call below
model = Sequential(layers = [
    Embedding(input_dim = 500, output_dim = 128),
    SimpleRNN(units = 32),
    Dense(units = 128, activation = 'relu'),
    Dense(units = 64, activation = 'relu'),
    Dense(units = 7, activation = 'softmax')
])

# evaluation is currently disabled (the doc2vec features are not generated)
# EvaluateModel(model, trainDFdoc2vec500)

In [None]:
# architecture: small embedding -> three stacked recurrent layers -> 7-way softmax
# NOTE(review): an Embedding layer looks up integer ids, while doc2vecGen returns
# inferred document vectors (presumably real-valued) -- confirm this pairing before
# re-enabling the evaluation call below
model = Sequential(layers = [
    Embedding(input_dim = 500, output_dim = 8),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32),
    Dense(units = 7, activation = 'softmax')
])

# evaluation is currently disabled (the doc2vec features are not generated)
# EvaluateModel(model, trainDFdoc2vec500)

In [None]:
# architecture: embedding (explicit 1000-wide input) -> one recurrent layer ->
# two dense layers -> 7-way softmax
# NOTE(review): an Embedding layer looks up integer ids, while doc2vecGen returns
# inferred document vectors (presumably real-valued) -- confirm this pairing before
# re-enabling the evaluation call below
model = Sequential(layers = [
    Embedding(input_dim = 1000, output_dim = 128, input_shape = (1000, )),
    SimpleRNN(units = 32),
    Dense(units = 128, activation = 'relu'),
    Dense(units = 64, activation = 'relu'),
    Dense(units = 7, activation = 'softmax')
])

# evaluation is currently disabled (the doc2vec features are not generated)
# EvaluateModel(model, trainDFdoc2vec1000)

In [None]:
# architecture: small embedding -> three stacked recurrent layers -> 7-way softmax
# NOTE(review): an Embedding layer looks up integer ids, while doc2vecGen returns
# inferred document vectors (presumably real-valued) -- confirm this pairing before
# re-enabling the evaluation call below
model = Sequential(layers = [
    Embedding(input_dim = 1000, output_dim = 8),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32, return_sequences = True),
    SimpleRNN(units = 32),
    Dense(units = 7, activation = 'softmax')
])

# evaluation is currently disabled (the doc2vec features are not generated)
# EvaluateModel(model, trainDFdoc2vec1000)

In [None]:
# define model architecture: embedding -> one recurrent layer -> two dense layers
# -> 7-way softmax. Both the embedding size and the input width come from
# max_length (the size requested for the MAX doc2vec features) instead of a
# hard-coded count that silently goes stale when the data changes.
# NOTE(review): an Embedding layer looks up integer ids, while doc2vecGen returns
# inferred document vectors (presumably real-valued) -- confirm this pairing before
# re-enabling the evaluation call below
model = Sequential([
    Embedding(max_length, 128, input_shape = (max_length, )),
    SimpleRNN(32),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(7, activation = 'softmax')
])

# evaluate model
# EvaluateModel(model, trainDFdoc2vecMAX)

In [None]:
# define model architecture: small embedding -> three stacked recurrent layers ->
# 7-way softmax. The embedding size comes from max_length (the size requested for
# the MAX doc2vec features) instead of a hard-coded count that silently goes stale
# when the data changes.
# NOTE(review): an Embedding layer looks up integer ids, while doc2vecGen returns
# inferred document vectors (presumably real-valued) -- confirm this pairing before
# re-enabling the evaluation call below
model = Sequential([
    Embedding(max_length, 8),
    SimpleRNN(32, return_sequences = True),
    SimpleRNN(32, return_sequences = True),
    SimpleRNN(32),
    Dense(7, activation = 'softmax')
])

# evaluate model
# EvaluateModel(model, trainDFdoc2vecMAX)