# Required Imports

In [1]:
pip install nltk



In [2]:
pip install tensorflow



In [3]:
pip install datasets



In [4]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import ssl
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Embedding
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os

# Preprocessing

In [5]:
# Download the NLTK stop-word corpus.
#
# NOTE(review): when the interpreter exposes ssl._create_unverified_context,
# it is installed as the default HTTPS context factory, which turns off TLS
# certificate verification for every later use of that default in this
# process. This looks like the usual workaround for NLTK download failures
# on machines without a CA bundle -- confirm it is still needed.
unverified_context_factory = getattr(ssl, "_create_unverified_context", None)
if unverified_context_factory is not None:
    ssl._create_default_https_context = unverified_context_factory

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Load the "yelp_review_full" dataset and convert its train and test splits
# into pandas DataFrames; the training frame is shown as the cell output.
dataset = load_dataset("yelp_review_full")
dp_train, dp_test = (dataset[split].to_pandas() for split in ('train', 'test'))
dp_train

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...
...,...,...
649995,4,I had a sprinkler that was gushing... pipe bro...
649996,0,Phone calls always go to voicemail and message...
649997,0,Looks like all of the good reviews have gone t...
649998,4,I was able to once again rely on Yelp to provi...


In [7]:
# Preprocess the text data by removing digits and English stop words and
# collapsing runs of whitespace into single spaces.
# This preprocessing method was inspired by: https://www.kaggle.com/code/gcdatkin/gru-hotel-rating-prediction

def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        # Reading the corpus on every call is expensive when the function is
        # applied to hundreds of thousands of reviews, and a set gives
        # constant-time membership tests instead of a linear list scan.
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def pre_process_data(X):
    """Clean a single review string.

    Args:
        X: The raw review text.

    Returns:
        The text with every run of digits replaced by a space, stop words
        removed (compared case-insensitively; the original casing of the
        remaining words is kept), and the remaining whitespace-separated
        tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    X = re.sub(r'\d+', ' ', X)
    # split() with no arguments already discards leading, trailing and
    # repeated whitespace, so the tokens need no further stripping.
    return " ".join(word for word in X.split() if word.lower() not in stop_words)

In [8]:
# Clean every training review; the cleaned Series is shown as the cell output.
train_text = dp_train['text']
reviews = train_text.apply(pre_process_data)
reviews

0         dr. goldberg offers everything look general pr...
1         Unfortunately, frustration Dr. Goldberg's pati...
2         going Dr. Goldberg years. think one st patient...
3         Got letter mail last week said Dr. Goldberg mo...
4         know Dr. Goldberg like moving Arizona, let tel...
                                ...                        
649995    sprinkler gushing... pipe broken way ground, t...
649996    Phone calls always go voicemail messages retur...
649997    Looks like good reviews gone head place! Jason...
649998    able rely Yelp provide needed response leaking...
649999    using company months. Ryan would come every we...
Name: text, Length: 650000, dtype: object

In [9]:
# Fit the tokenizer's vocabulary on the cleaned training reviews, then encode
# each review as a list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=reviews)
sequences = tokenizer.texts_to_sequences(texts=reviews)

In [10]:
# Length of the longest encoded training review.
max_length = np.max([len(seq) for seq in sequences])
print("The max length is ", max_length)

# Pad every sequence at the end ('post') so all inputs share that length.
inputs = pad_sequences(sequences, maxlen=max_length, padding='post')
labels = np.array(dp_train['label'])

# Hold out 20% of the training data, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    labels,
    train_size=0.80,
    random_state=100,
)

The max length is  1166


# Create and Train MLP Model

In [11]:
# The TensorFlow MLP was inspired by:
# https://www.tensorflow.org/guide/core/mlp_core
# https://www.geeksforgeeks.org/multi-layer-perceptron-learning-in-tensorflow

# We want to use an embedding so the neural network is better able to
# differentiate words. The token ids produced by the tokenizer are arbitrary
# vocabulary indices, so feeding them straight into Dense layers would treat
# them as numeric magnitudes; the Embedding layer maps each id to a learned
# vector instead.
EMBEDDING_DIM = 16
# Tokenizer word indices start at 1 and index 0 is used for padding, so the
# embedding table needs one more row than there are words.
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    # Each input is one padded review: max_length integer token ids.
    tf.keras.Input(shape=(int(max_length),)),
    Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM),

    # Flattens the (max_length, EMBEDDING_DIM) embeddings into a single
    # vector for the dense layers.
    Flatten(),

    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),

    # Output layer: one probability per star rating (labels 0-4).
    Dense(5, activation='softmax'),
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs. The 20% hold-out produced by train_test_split is passed
# as validation data so loss/accuracy on unseen reviews is reported after each
# epoch; it does not influence the weight updates. The returned History is
# kept so the learning curves can be inspected later.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

In [None]:
# Save the trained model in SavedModel format. The destination is relative to
# the current working directory rather than one user's home folder, so the
# notebook can be re-run on any machine.
model_dir = os.path.join(os.getcwd(), "model3")
tf.saved_model.save(model, model_dir)

# Evaluate Model

In [None]:
# Apply the same cleaning function to the reviews of the test split; the
# cleaned Series is shown as the cell output.
test_text = dp_test['text']
test_reviews = test_text.apply(pre_process_data)
test_reviews

In [None]:
# Encode the test reviews with the tokenizer that was fitted on the training
# reviews. A separate name is used so the training `sequences` are not
# overwritten if cells are re-run.
test_sequences = tokenizer.texts_to_sequences(test_reviews)

# Pad (or truncate) every test sequence to max_length, the same width the
# training inputs were padded to and the model was built for. Any other
# width would make evaluate()/predict() fail on an input-shape mismatch.
test_inputs = pad_sequences(test_sequences, maxlen=max_length, padding='post')
test_inputs

In [None]:
# Ground-truth labels of the test split. Note that this rebinds y_test, which
# previously held the hold-out labels produced by train_test_split.
y_test = np.asarray(dp_test['label'])

# Print both shapes (one per line) to check that labels and inputs line up.
print(y_test.shape, test_inputs.shape, sep="\n")

In [None]:
# evaluate() reports the loss and accuracy on the test set and returns them.
result = model.evaluate(x=test_inputs, y=y_test)

In [None]:
# predict() returns one row of class probabilities per review (the softmax
# output), not class labels, so take the argmax of each row to get the
# predicted label needed for the confusion matrix.
# Only the inputs are passed: predict()'s second positional parameter is
# batch_size, so the labels must not be supplied there.
predicted_probabilities = model.predict(test_inputs)
predict_test = np.argmax(predicted_probabilities, axis=1)

In [None]:
# Shows a summary of the model's layers and parameter counts.
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line to the output).
model.summary()

In [None]:
# Pair each metric name with the value evaluate() returned and show the
# resulting mapping as the cell output.
{metric_name: metric_value for metric_name, metric_value in zip(model.metrics_names, result)}

In [None]:
# Displays a confusion matrix of the actual vs. predicted labels.
# confusion_matrix() needs one class label per sample. If predict_test still
# holds per-class probabilities (a 2-D array), collapse each row to the index
# of its most probable class first; 1-D label arrays are used as they are.
predicted_labels = np.asarray(predict_test)
if predicted_labels.ndim > 1:
    predicted_labels = predicted_labels.argmax(axis=1)

cm = confusion_matrix(y_test, predicted_labels)
ConfusionMatrixDisplay(cm).plot()