# Homework 4 Final Models (Text Classification)

Adam Kiehl  
5/7/2023

In [20]:
# import analysis packages
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import keras
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Dropout, Embedding, Flatten, SimpleRNN, TextVectorization
from keras.models import Sequential
from keras.regularizers import l2
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow_addons.metrics import F1Score

### Data Cleaning

In [21]:
# read data from .csv files
trainDF = pd.read_csv('./ibotta_train.csv')
testDF = pd.read_csv('./ibotta_test.csv')

# combine data sets for preprocessing; the 'origin' tag is what lets the rows
# be separated again after the shared cleaning steps
# (pd.concat keeps each frame's own row labels, so labels repeat in fullDF --
# select rows by position or by 'origin', never by label)
trainDF['origin'] = 'train'
testDF['origin'] = 'test'
fullDF = pd.concat([trainDF, testDF])

# text cleaning
def cleanText(text):
    """Lower-case `text` and delete the characters ' , : - and . from it."""
    return text.lower().translate(str.maketrans('', '', "',:-."))

# missing brands become empty strings; assign the result back instead of using
# a chained `inplace = True` call, which does not reliably write through to
# fullDF (and never does under pandas Copy-on-Write)
fullDF['Brand_name'] = fullDF['Brand_name'].fillna('').apply(cleanText)
fullDF['Name'] = fullDF['Name'].apply(cleanText)

# combine brand and name fields: prefix the brand only when the cleaned name
# does not already contain it (an empty brand is contained in every name, so
# rows without a brand are left unchanged)
brandMissing = np.array([brand not in name
                         for brand, name in zip(fullDF['Brand_name'], fullDF['Name'])],
                        dtype = bool)
# np.where returns a plain array, so the assignment is positional and does not
# depend on the repeated row labels
fullDF['Name'] = np.where(brandMissing,
                          fullDF['Brand_name'] + ' ' + fullDF['Name'],
                          fullDF['Name'])

# split data
trainDF = fullDF.loc[fullDF['origin'] == 'train'].drop('origin', axis = 1)
testDF = fullDF.loc[fullDF['origin'] == 'test'].drop(['origin', 'Category'], axis = 1)

### Text Vectorization

In [22]:
# tokenize every product name once (split on single spaces) so the vocabulary
# and the indicator matrix are built from exactly the same tokens
tokenizedNames = [name.split(' ') for name in fullDF['Name']]

# find total number of unique words (np.unique also sorts them)
unique_words = np.unique([word for tokens in tokenizedNames for word in tokens])
max_length = len(unique_words)

# map each word to its column position for constant-time lookups
wordColumn = {word: j for j, word in enumerate(unique_words)}

# build the presence/absence matrix in numpy: one row per product name, one
# column per unique word; this avoids a DataFrame.loc assignment per word
wordMatrix = np.zeros((len(tokenizedNames), max_length))
for i, tokens in enumerate(tokenizedNames):
    # identify word presence
    wordMatrix[i, [wordColumn[word] for word in tokens]] = 1

# wrap the matrix in a data frame with one named column per word
wordBag = pd.DataFrame(wordMatrix, columns = unique_words)

# split word bag using the 'origin' tag rather than hard-coded row ranges, so
# the split stays correct if the train/test row counts change
isTrain = (fullDF['origin'] == 'train').to_numpy()
isTest = (fullDF['origin'] == 'test').to_numpy()
trainDFwordBag = wordBag.loc[isTrain]
testDFwordBag = wordBag.loc[isTest]

In [23]:
# keras tokenizer function
def textVecGen(size, seq_length = None):
    """Vectorize the product names in `fullDF` into integer token sequences.

    A keras TextVectorization layer is adapted on fullDF['Name'] and then
    applied to the same column. The result is split back into train and test
    rows using fullDF['origin'].

    Parameters
    ----------
    size : int
        Maximum vocabulary size (passed as `max_tokens`).
    seq_length : int, optional
        Length every output sequence is padded or truncated to (passed as
        `output_sequence_length`). Defaults to `size`, which reproduces the
        original behaviour of this function.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Vectorized train rows and vectorized test rows, in that order.
    """
    if seq_length is None:
        seq_length = size

    # train keras tokenizer
    tokenizer = TextVectorization(max_tokens = size,
                                  output_sequence_length = seq_length)
    tokenizer.adapt(fullDF['Name'])

    # vectorize data
    textVecDF = pd.DataFrame(tokenizer(fullDF['Name']))

    # split vectorized data using the 'origin' tag rather than hard-coded row
    # ranges, so the split stays correct if the train/test row counts change
    isTrain = (fullDF['origin'] == 'train').to_numpy()
    isTest = (fullDF['origin'] == 'test').to_numpy()
    trainDFtextVec = textVecDF.loc[isTrain]
    testDFtextVec = textVecDF.loc[isTest]

    return(trainDFtextVec, testDFtextVec)

# generate vectorized data
trainDFtextVec1000, testDFtextVec1000 = textVecGen(1000)

### Model Fitting

In [24]:
# set random seeds
np.random.seed(542023)
tf.random.set_seed(542023)

# define model architecture: a stack of dense layers on the bag-of-words
# features, ending in a 7-way softmax
model1 = Sequential([
    Dense(512, activation = 'relu'),
    Dense(256, activation = 'relu'),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(7, activation = 'softmax')
])

# define F1 metric
f1_score_metric = F1Score(num_classes = 7, average = 'weighted')

# compile model
model1.compile(optimizer = 'rmsprop',
               loss = 'categorical_crossentropy',
               metrics = ['accuracy', f1_score_metric])

# define early stopping criterion
# NOTE(review): 'f1_score' is the training-set metric and fit() below receives
# no validation data, so training stops when the training F1 stops improving,
# not when generalization does -- confirm this is intended
early = EarlyStopping(monitor = 'f1_score', mode = 'max', patience = 3)

# train deep learning model
# num_classes pins the one-hot width to the 7 softmax units instead of
# inferring it from the largest label present; callbacks is passed as a list,
# which is the documented form
trained1 = model1.fit(trainDFwordBag,
                      to_categorical(trainDF['Cat_code'], num_classes = 7),
                      epochs = 100,
                      batch_size = 128,
                      callbacks = [early],
                      verbose = 1)

# predict on test set
pred1 = model1.predict(testDFwordBag)

# create submission data frame; argmax over axis 1 is already one-dimensional
# (one predicted class index per test row)
submission = pd.DataFrame({'Id': testDF['Id'], 'Cat_code': np.argmax(pred1, axis = 1)})

# export submission
submission.to_csv('./submission1.csv', index = False)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


In [25]:
# set random seeds
np.random.seed(542023)
tf.random.set_seed(542023)

# define model architecture: a learned 128-dimensional embedding over the
# 1000-token vocabulary, flattened and fed through dense layers to a 7-way
# softmax; the input length of 1000 matches the sequences from textVecGen(1000)
model2 = Sequential([
    Embedding(1000, 128, input_shape = (1000, )),
    Flatten(),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(7, activation = 'softmax')
])

# print model summary
model2.summary()

# define F1 metric
f1_score_metric = F1Score(num_classes = 7, average = 'weighted')

# compile model
model2.compile(optimizer = 'rmsprop',
               loss = 'categorical_crossentropy',
               metrics = ['accuracy', f1_score_metric])

# define early stopping criterion
# NOTE(review): 'f1_score' is the training-set metric and fit() below receives
# no validation data, so training stops when the training F1 stops improving,
# not when generalization does -- confirm this is intended
early = EarlyStopping(monitor = 'f1_score', mode = 'max', patience = 3)

# train deep learning model
# num_classes pins the one-hot width to the 7 softmax units instead of
# inferring it from the largest label present; callbacks is passed as a list,
# which is the documented form
trained2 = model2.fit(trainDFtextVec1000,
                      to_categorical(trainDF['Cat_code'], num_classes = 7),
                      epochs = 100,
                      batch_size = 128,
                      callbacks = [early],
                      verbose = 1)

# predict on test set
pred2 = model2.predict(testDFtextVec1000)

# create submission data frame; argmax over axis 1 is already one-dimensional
# (one predicted class index per test row)
submission = pd.DataFrame({'Id': testDF['Id'], 'Cat_code': np.argmax(pred2, axis = 1)})

# export submission
submission.to_csv('./submission2.csv', index = False)

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 128)         128000    
                                                                 
 flatten (Flatten)           (None, 128000)            0         
                                                                 
 dense_25 (Dense)            (None, 128)               16384128  
                                                                 
 dense_26 (Dense)            (None, 64)                8256      
                                                                 
 dense_27 (Dense)            (None, 7)                 455       
                                                                 
Total params: 16,520,839
Trainable params: 16,520,839
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 