In [1]:
# import analysis packages
import keras
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Embedding, Flatten, SimpleRNN, TextVectorization
from keras.models import Sequential
from keras.regularizers import l2
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow_addons.metrics import F1Score

### Data Preprocessing

In [2]:
# read data from .csv files
trainDF = pd.read_csv('./ibotta_train.csv')
testDF = pd.read_csv('./ibotta_test.csv')

# combine data sets for preprocessing; 'origin' remembers which file each row came from
trainDF['origin'] = 'train'
testDF['origin'] = 'test'
fullDF = pd.concat([trainDF, testDF])

# combine name and brand name fields
# missing brand names become empty strings; the result is assigned back to the
# column because a chained `inplace = True` call on fullDF['Brand_name'] does not
# reliably write through to fullDF (pandas copy-on-write)
fullDF['Brand_name'] = fullDF['Brand_name'].fillna('')
fullDF['Full_text'] = fullDF['Brand_name'] + ' ' + fullDF['Name']

# set random seed
random.seed(542023)

# split data back into train and test using the origin flag
# (the test frame additionally drops the 'Category' column)
trainDF = fullDF.loc[fullDF['origin'] == 'train'].drop('origin', axis = 1)
testDF = fullDF.loc[fullDF['origin'] == 'test'].drop(['origin', 'Category'], axis = 1)

### Data Vectorization

In [None]:
# train integer index tokenizer
intTokenizer = TextVectorization()
intTokenizer.adapt(fullDF['Full_text'])

# vectorize text data
intVecDF = pd.DataFrame(intTokenizer(fullDF['Full_text']))

# split rows by the origin flag rather than hard-coded positions (0-7999 / 8000-9999),
# so the split stays correct if the number of rows in either CSV changes;
# intVecDF has one row per fullDF row, in the same order
isTrainRow = (fullDF['origin'] == 'train').to_numpy()
trainDFintVec = intVecDF.loc[isTrainRow]
testDFintVec = intVecDF.loc[~isTrainRow]

In [None]:
# train bag of words tokenizer
countTokenizer = TextVectorization(output_mode = 'multi_hot')
countTokenizer.adapt(fullDF['Full_text'])

# vectorize text data
countVecDF = pd.DataFrame(countTokenizer(fullDF['Full_text']))

# split rows by the origin flag rather than hard-coded positions (0-7999 / 8000-9999),
# so the split stays correct if the number of rows in either CSV changes;
# countVecDF has one row per fullDF row, in the same order
isTrainRow = (fullDF['origin'] == 'train').to_numpy()
trainDFcountVec = countVecDF.loc[isTrainRow]
testDFcountVec = countVecDF.loc[~isTrainRow]

In [None]:
# build truncated bag of words tokenizer from the first 1000 entries of the
# full bag of words vocabulary (no adapt step: the vocabulary is passed in directly)
countTokenizer1000 = TextVectorization(output_mode = 'multi_hot',
                                       vocabulary = countTokenizer.get_vocabulary()[0:1000])

# vectorize text data
countVec1000DF = pd.DataFrame(countTokenizer1000(fullDF['Full_text']))

# split rows by the origin flag rather than hard-coded positions (0-7999 / 8000-9999),
# so the split stays correct if the number of rows in either CSV changes;
# countVec1000DF has one row per fullDF row, in the same order
isTrainRow = (fullDF['origin'] == 'train').to_numpy()
trainDFcountVec1000 = countVec1000DF.loc[isTrainRow]
testDFcountVec1000 = countVec1000DF.loc[~isTrainRow]

In [None]:
# train tfidf tokenizer
tfidfTokenizer = TextVectorization(output_mode = 'tf_idf')
tfidfTokenizer.adapt(fullDF['Full_text'])

# vectorize text data
tfidfVecDF = pd.DataFrame(tfidfTokenizer(fullDF['Full_text']))

# split rows by the origin flag rather than hard-coded positions (0-7999 / 8000-9999),
# so the split stays correct if the number of rows in either CSV changes;
# tfidfVecDF has one row per fullDF row, in the same order
isTrainRow = (fullDF['origin'] == 'train').to_numpy()
trainDFtfidfVec = tfidfVecDF.loc[isTrainRow]
testDFtfidfVec = tfidfVecDF.loc[~isTrainRow]

In [None]:
# build truncated tfidf tokenizer from the first 1000 vocabulary entries and the
# matching first 1000 idf weights of the full tfidf tokenizer
# NOTE(review): assumes get_weights()[0] holds the idf weights in vocabulary
# order for the installed Keras version -- TODO confirm
tfidfTokenizer1000 = TextVectorization(output_mode = 'tf_idf',
                                       vocabulary = tfidfTokenizer.get_vocabulary()[0:1000],
                                       idf_weights = tfidfTokenizer.get_weights()[0][0:1000])

# vectorize text data
tfidfVec1000DF = pd.DataFrame(tfidfTokenizer1000(fullDF['Full_text']))

# split rows by the origin flag rather than hard-coded positions (0-7999 / 8000-9999),
# so the split stays correct if the number of rows in either CSV changes;
# tfidfVec1000DF has one row per fullDF row, in the same order
isTrainRow = (fullDF['origin'] == 'train').to_numpy()
trainDFtfidfVec1000 = tfidfVec1000DF.loc[isTrainRow]
testDFtfidfVec1000 = tfidfVec1000DF.loc[~isTrainRow]

### Model Evaluation

In [6]:
# model evaluation function
def EvaluateModel(model, X_train):
    """Compile and fit `model` on `X_train`, then report validation metrics.

    The targets are always the one-hot encoding of the notebook-level
    `trainDF['Cat_code']`. Keras holds out 15% of the rows as an internal
    validation split, and training stops early once the validation F1 score
    has not improved for 3 consecutive epochs (at most 100 epochs). The
    last-epoch validation accuracy and F1 score are displayed and the
    accuracy, loss and F1 curves are plotted for training and validation.

    Parameters
    ----------
    model : Keras model
        Model to train; it is compiled here with the 'rmsprop' optimizer and
        categorical cross-entropy loss.
    X_train : array-like
        Training features; presumably row-aligned with `trainDF`.

    Returns
    -------
    None
    """
    # set random seeds
    # NOTE(review): the model is constructed before this function is called, so
    # these seeds presumably do not govern its initial weights -- confirm
    np.random.seed(542023)
    tf.random.set_seed(542023)

    # print model summary (best effort: report the problem instead of hiding it)
    try:
        model.summary()
    except Exception as err:
        print(f'Model summary unavailable: {err}')

    # one-hot encode the targets once so the metric below can use their width
    y_train = to_categorical(trainDF['Cat_code'])

    # define F1 metric; the number of classes follows the label encoding
    f1_score_metric = F1Score(num_classes = y_train.shape[1], average = 'weighted')

    # compile model
    model.compile(optimizer = 'rmsprop',
                  loss = 'categorical_crossentropy',
                  metrics = ['accuracy', f1_score_metric])

    # define early stopping criterion
    early = EarlyStopping(monitor = 'val_f1_score', mode = 'max', patience = 3)

    # train deep learning model (callbacks are passed as a list, as fit() documents)
    trained = model.fit(X_train,
                        y_train,
                        epochs = 100,
                        batch_size = 128,
                        callbacks = [early],
                        validation_split = 0.15,
                        verbose = 1)

    # prepare model evaluation
    acc = trained.history['accuracy']
    val_acc = trained.history['val_accuracy']
    loss = trained.history['loss']
    val_loss = trained.history['val_loss']
    f1_score = trained.history['f1_score']
    val_f1_score = trained.history['val_f1_score']
    epochs = range(1, len(acc) + 1)

    # final validation accuracy and F1 (values from the last epoch that was run)
    display(f"Internal validation accuracy: {round(val_acc[-1] * 100, 2)}%")
    display(f'Internal validation F1 score: {round(val_f1_score[-1], 4)}')

    # plot training and validation curves: accuracy, loss, F1
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize = (15, 5))
    ax1.plot(epochs, acc, 'bo', label = 'Training acc')
    ax1.plot(epochs, val_acc, 'b', label = 'Validation acc')
    ax1.set(xlabel = 'Epochs', ylabel = 'Accuracy')
    ax1.legend()
    ax2.plot(epochs, loss, 'bo', label = 'Training loss')
    ax2.plot(epochs, val_loss, 'b', label = 'Validation loss')
    ax2.set(xlabel = 'Epochs', ylabel = 'Loss')
    ax2.legend()
    ax3.plot(epochs, f1_score, 'bo', label = 'Training F1')
    ax3.plot(epochs, val_f1_score, 'b', label = 'Validation F1')
    ax3.set(xlabel = 'Epochs', ylabel = 'F1 Score')
    ax3.legend()
    fig.suptitle('Evaluation Metrics')

### Model Fitting

In [None]:
# define model architecture
# input width is read from the feature frame instead of hard-coding 4880
# NOTE(review): Embedding looks up integer token ids, but the input here is the
# multi-hot bag of words matrix; trainDFintVec (integer ids) is never used --
# confirm which input was intended
nFeatures = trainDFcountVec.shape[1]
model = Sequential([
    Embedding(nFeatures, 8, input_shape = (nFeatures, )),
    SimpleRNN(16),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFcountVec)

In [None]:
# define model architecture
# input width is read from the feature frame instead of hard-coding 4880
# NOTE(review): Embedding looks up integer token ids, but the input here is the
# tf-idf weight matrix; trainDFintVec (integer ids) is never used --
# confirm which input was intended
nFeatures = trainDFtfidfVec.shape[1]
model = Sequential([
    Embedding(nFeatures, 8, input_shape = (nFeatures, )),
    SimpleRNN(16),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFtfidfVec)

In [None]:
# define model architecture (wider recurrent layer: 32 units)
# input width is read from the feature frame instead of hard-coding 4880
# NOTE(review): Embedding looks up integer token ids, but the input here is the
# multi-hot bag of words matrix; trainDFintVec (integer ids) is never used --
# confirm which input was intended
nFeatures = trainDFcountVec.shape[1]
model = Sequential([
    Embedding(nFeatures, 8, input_shape = (nFeatures, )),
    SimpleRNN(32),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFcountVec)

In [None]:
# define model architecture (wider recurrent layer: 32 units)
# input width is read from the feature frame instead of hard-coding 4880
# NOTE(review): Embedding looks up integer token ids, but the input here is the
# tf-idf weight matrix; trainDFintVec (integer ids) is never used --
# confirm which input was intended
nFeatures = trainDFtfidfVec.shape[1]
model = Sequential([
    Embedding(nFeatures, 8, input_shape = (nFeatures, )),
    SimpleRNN(32),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFtfidfVec)

In [None]:
# define model architecture (16-dimensional embedding, 32 recurrent units)
# input width is read from the feature frame instead of hard-coding 4880
# NOTE(review): Embedding looks up integer token ids, but the input here is the
# multi-hot bag of words matrix; trainDFintVec (integer ids) is never used --
# confirm which input was intended
nFeatures = trainDFcountVec.shape[1]
model = Sequential([
    Embedding(nFeatures, 16, input_shape = (nFeatures, )),
    SimpleRNN(32),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFcountVec)

In [None]:
# define model architecture (16-dimensional embedding, 32 recurrent units)
# input width is read from the feature frame instead of hard-coding 4880
# NOTE(review): Embedding looks up integer token ids, but the input here is the
# tf-idf weight matrix; trainDFintVec (integer ids) is never used --
# confirm which input was intended
nFeatures = trainDFtfidfVec.shape[1]
model = Sequential([
    Embedding(nFeatures, 16, input_shape = (nFeatures, )),
    SimpleRNN(32),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFtfidfVec)

In [None]:
# define model architecture (three stacked recurrent layers)
# input width is read from the feature frame instead of hard-coding 4880
# NOTE(review): Embedding looks up integer token ids, but the input here is the
# multi-hot bag of words matrix; trainDFintVec (integer ids) is never used --
# confirm which input was intended
nFeatures = trainDFcountVec.shape[1]
model = Sequential([
    Embedding(nFeatures, 16, input_shape = (nFeatures, )),
    # the first two recurrent layers return full sequences so the next
    # recurrent layer receives one vector per time step
    SimpleRNN(32, return_sequences = True),
    SimpleRNN(32, return_sequences = True),
    SimpleRNN(32),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFcountVec)

In [None]:
# define model architecture (three stacked recurrent layers)
# input width is read from the feature frame instead of hard-coding 4880
# NOTE(review): Embedding looks up integer token ids, but the input here is the
# tf-idf weight matrix; trainDFintVec (integer ids) is never used --
# confirm which input was intended
nFeatures = trainDFtfidfVec.shape[1]
model = Sequential([
    Embedding(nFeatures, 16, input_shape = (nFeatures, )),
    # the first two recurrent layers return full sequences so the next
    # recurrent layer receives one vector per time step
    SimpleRNN(32, return_sequences = True),
    SimpleRNN(32, return_sequences = True),
    SimpleRNN(32),
    Dense(7, activation = 'softmax')
])

# evaluate model
EvaluateModel(model, trainDFtfidfVec)