## Imports 

In [None]:
pip install nltk

In [None]:
pip install gensim

In [None]:
pip install tensorflow

In [None]:
pip install datasets

In [None]:
pip install scikit-learn

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import ssl
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten 
from tensorflow.keras.layers import Dense 
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Embedding 
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.metrics import Precision, Recall
from gensim.models import Word2Vec
from tensorflow.keras.layers import LSTM, SpatialDropout1D, Bidirectional, Dropout
from sklearn.metrics import classification_report
from keras.constraints import max_norm
from keras.callbacks import EarlyStopping
import string
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import numpy as np
import tensorflow as tf
from gensim.models import Word2Vec

import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os

In [None]:
# Download the NLTK 'stopwords' and 'punkt' resources.
# Before downloading, swap ssl's default HTTPS context factory for the
# unverified one (when this Python build exposes it) so the download does not
# fail on certificate errors.
# NOTE(review): this replaces ssl._create_default_https_context for the whole
# process, so certificate checks are presumably skipped for later HTTPS
# requests too -- confirm this is acceptable outside a throwaway notebook.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No unverified-context factory available: keep ssl's default behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
nltk.download('punkt')

In [None]:
# Show the GPU devices TensorFlow reports as available (cell output).
tf.config.experimental.list_physical_devices('GPU')

## Data Preprocessing

In [None]:
# Load the "yelp_review_full" dataset and convert both splits to pandas DataFrames.
dataset = load_dataset("yelp_review_full")
dp_train = dataset['train'].to_pandas()
dp_test = dataset['test'].to_pandas()

# Work on a reproducible random sample of 100,000 training rows (seed 42)
# so we don't have to process the entire training split.
dp_sample = dp_train.sample(n=100000, random_state=42)
# Display the sample as the cell output.
dp_sample

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


# This pre-processing was inspired by:
# https://www.kaggle.com/code/gcdatkin/gru-hotel-rating-prediction
def pre_process_data(X):
    """Clean a single review by removing digits and English stop words.

    Args:
        X: one review as a string.

    Returns:
        The review with every run of digits replaced by a space, English stop
        words dropped (matched case-insensitively) and the remaining words
        joined by single spaces. Punctuation and the case of the kept words
        are left untouched.
    """
    # The stop-word list is cached: this function is applied once per review,
    # and reloading the list (and scanning it linearly) on every call is wasteful.
    stop_words = _english_stop_words()
    without_digits = re.sub(r'\d+', ' ', X)
    # split() already discards surrounding whitespace, so no strip() is needed.
    kept_words = [word for word in without_digits.split() if word.lower() not in stop_words]
    return " ".join(kept_words)

In [None]:
# Clean every review in the sample (digits and English stop words removed),
# overwriting the 'text' column.
dp_sample['text'] = dp_sample['text'].apply(pre_process_data)
# Display the cleaned sample as the cell output.
dp_sample

## Necessary Functions

In [None]:
def create_sequences(sentences, num_words, use_entire_vocab, max_length_voacb):
    """Turn reviews into equal-length integer sequences for an Embedding layer.

    Args:
        sentences: the (already cleaned) review texts.
        num_words: vocabulary cap handed to the Keras ``Tokenizer``; only used
            when ``use_entire_vocab`` is false.
        use_entire_vocab: if true, keep every word found in ``sentences``
            instead of only the most frequent ones.
        max_length_voacb: if true, pad every sequence to the length of the
            longest one; otherwise pad/truncate to the (floored) average
            sequence length.

    Returns:
        A tuple ``(inputs, num_words, max_length)``: the post-padded sequence
        array, the vocabulary size to use as the Embedding ``input_dim``, and
        the sequence length that was padded to.

    Raises:
        ValueError: if ``sentences`` contains no texts.
    """
    # Tokenizer() without a cap keeps the whole vocabulary.
    tokenizer = Tokenizer() if use_entire_vocab else Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sentences)
    sequences = tokenizer.texts_to_sequences(sentences)
    if not sequences:
        raise ValueError("sentences must contain at least one text")

    if use_entire_vocab:
        # Tokenizer indices start at 1 (0 is left for padding), so the largest
        # index equals len(word_index) and the Embedding needs one more row.
        num_words = len(tokenizer.word_index) + 1

    sequence_lengths = [len(seq) for seq in sequences]
    if max_length_voacb:
        # Length of the longest sequence: nothing gets truncated.
        max_length = max(sequence_lengths)
    else:
        # Floored average length: longer sequences are truncated by pad_sequences.
        max_length = int(sum(sequence_lengths) / len(sequence_lengths))

    # Pad (after the tokens) so every input has exactly max_length entries.
    inputs = pad_sequences(sequences, maxlen=max_length, padding='post')

    return inputs, num_words, max_length

def make_embeddings(data, num_words):
    """Train Word2Vec on the reviews and build matching Embedding inputs.

    Index 0 is reserved for padding: its embedding row is all zeros and every
    Word2Vec word is stored at ``key_to_index[word] + 1``. Without that shift
    the padding value 0 would be the same index as the most frequent word.

    Args:
        data: the (already cleaned) review texts; each is split on whitespace.
        num_words: currently unused; kept so existing callers keep working.

    Returns:
        A tuple ``(embedding_matrix, vocab_size, padded_data)``:
        ``embedding_matrix`` has shape ``(vocab_size, 100)``, ``vocab_size`` is
        the Word2Vec vocabulary size plus one (for the padding row), and
        ``padded_data`` holds the post-padded index sequences, padded to the
        longest one. Words Word2Vec dropped (fewer than 3 occurrences) are
        omitted from the sequences.
    """
    sentence_split = [line.split() for line in data]

    word2vec_model = Word2Vec(sentences=sentence_split, vector_size=100, window=5, min_count=3, workers=4)
    key_to_index = word2vec_model.wv.key_to_index
    print(len(key_to_index))

    # One extra row: row 0 stays all zeros and stands for padding.
    vocab_size = len(key_to_index) + 1
    embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
    for word, index in key_to_index.items():
        embedding_matrix[index + 1] = word2vec_model.wv[word]

    # Map every known word to its shifted index; unknown words are skipped.
    indexed_data = []
    for sentence in sentence_split:
        indexed_data.append([key_to_index[word] + 1 for word in sentence if word in key_to_index])

    max_length = max(len(indexed_sentence) for indexed_sentence in indexed_data)
    padded_data = pad_sequences(indexed_data, maxlen=max_length, padding='post')
    return embedding_matrix, vocab_size, padded_data
    
    

In [None]:
# Tokenize the cleaned reviews with the vocabulary capped at 50,000 words
# (use_entire_vocab=False) and pad/truncate to the average review length
# (max_length_voacb=False).
inputs, num_words, max_length = create_sequences(dp_sample['text'],50000, False, False)

In [None]:
# Inspect the vocabulary size, the padded sequence length and the padded inputs.
print(num_words)
print(max_length)
inputs

In [None]:
def make_training_partition(inputs, data, num_classes=5, train_size=0.75, random_state=42):
    """Split inputs and labels into train/test partitions with one-hot labels.

    Args:
        inputs: the padded token sequences, one row per review.
        data: pandas DataFrame whose ``'label'`` column holds the class of each
            row of ``inputs`` (same order, same length).
        num_classes: number of classes to one-hot encode (default 5).
        train_size: fraction of the rows placed in the training partition
            (default 0.75).
        random_state: seed for the shuffle, so the split is reproducible
            (default 42).

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the ``y`` values are int32
        one-hot tensors with ``num_classes`` columns.
    """
    labels = np.array(data['label'])
    X_train, X_test, y_train, y_test = train_test_split(
        inputs, labels, train_size=train_size, random_state=random_state
    )
    # Asking one_hot for int32 directly gives the same 0/1 tensor as
    # encoding in the default float dtype and casting afterwards.
    y_train = tf.one_hot(y_train, depth=num_classes, dtype=tf.int32)
    y_test = tf.one_hot(y_test, depth=num_classes, dtype=tf.int32)

    return X_train, X_test, y_train, y_test

def predict(model, X_test, y_test):
    """Evaluate a trained model on the test partition and print a summary.

    Prints a classification report, the number of predictions that were
    correct, off by exactly one class and off by more than one class, and
    plots the confusion matrix.

    Args:
        model: the trained model to evaluate.
        X_test: the test inputs.
        y_test: the one-hot encoded test labels.
    """
    predictions_transformed = np.argmax(model.predict(X_test), axis=1)
    test_transformed = np.argmax(y_test, axis=1)
    # scikit-learn's metrics take (y_true, y_pred); passing them the other way
    # round swaps precision with recall and transposes the confusion matrix.
    print(classification_report(test_transformed, predictions_transformed))

    # How far (in classes) each prediction is from the true label.
    class_distance = np.abs(predictions_transformed - test_transformed)
    classified_correct = int(np.count_nonzero(class_distance == 0))
    classified_one_away = int(np.count_nonzero(class_distance == 1))
    classified_more_than_one = int(np.count_nonzero(class_distance > 1))
    print("Number of Correct Classificaiton: " + str(classified_correct))
    print("Number of Incorrect by One: " + str(classified_one_away))
    print("Number of Incorrect More than One: " + str(classified_more_than_one))

    cm = confusion_matrix(test_transformed, predictions_transformed)
    ConfusionMatrixDisplay(cm).plot()
    

def make_baseline_model(num_words, max_length, X_train):
    """Build and compile the baseline network: embedding, average pooling, dense.

    Args:
        num_words: vocabulary size, used as the Embedding input dimension.
        max_length: not used here; the sequence length is read from ``X_train``.
        X_train: training inputs; ``X_train.shape[1]`` is the sequence length.

    Returns:
        The compiled (untrained) model, tracking accuracy, precision and recall.
    """
    sequence_length = X_train.shape[1]
    layer_stack = [
        Embedding(num_words, 128, input_length=sequence_length),
        tf.keras.layers.GlobalAveragePooling1D(),
        Dense(8, activation='relu'),
        # Output layer: one unit per class.
        Dense(5, activation='softmax'),
    ]
    baseline = Sequential(layer_stack)
    tracked_metrics = ['accuracy', Precision(), Recall()]
    baseline.compile(loss='categorical_crossentropy', optimizer='adam', metrics=tracked_metrics)
    return baseline

def word2vec_models(data, label_data=None):
    """Train the pooled dense classifier on Word2Vec-initialised embeddings.

    Args:
        data: the cleaned review texts.
        label_data: DataFrame whose ``'label'`` column lines up row-for-row
            with ``data``. Defaults to the notebook-level ``dp_sample``, which
            is what this function always used before the parameter existed.

    Returns:
        ``(model, X_test, y_test)``: the trained model and the held-out test
        partition to evaluate it on.
    """
    if label_data is None:
        label_data = dp_sample
    embedding_matrix, vocab_size, inputs = make_embeddings(data, 50000)
    X_train, X_test, y_train, y_test = make_training_partition(inputs, label_data)
    model = Sequential([
        # Take the embedding width from the matrix so the two cannot disagree.
        Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix]),
        tf.keras.layers.GlobalAveragePooling1D(),
        Dense(8, activation='relu'),
        Dense(5, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])
    # Up to 8 epochs, 20% of the training data held out for validation;
    # stop once val_loss has not improved for 3 epochs.
    model.fit(X_train, y_train, epochs=8, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=3)])
    return model, X_test, y_test

def lstm_test(num_words, X_train, y_train):
    """Train an LSTM classifier whose embedding layer is learned from scratch.

    Args:
        num_words: vocabulary size, used as the Embedding input dimension.
        X_train: training inputs; ``X_train.shape[1]`` is the sequence length.
        y_train: one-hot training labels.

    Returns:
        The trained model.
    """
    l2_penalty = tf.keras.regularizers.l2(0.001)
    sequence_length = X_train.shape[1]
    network = Sequential([
        Embedding(num_words, 128, input_length=sequence_length),
        Dropout(0.5),
        LSTM(32, kernel_constraint=max_norm(3)),
        Dense(32, activation='relu', kernel_regularizer=l2_penalty),
        Dropout(0.5),
        Dense(5, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Up to 8 epochs with a 20% validation split; halt after 3 epochs
    # without a val_loss improvement.
    early_stop = EarlyStopping(monitor='val_loss', patience=3)
    network.fit(X_train, y_train, epochs=8, validation_split=0.2, callbacks=[early_stop])
    return network


def lstm_test_with_embed(data, label_data=None):
    """Train a bidirectional LSTM on Word2Vec-initialised embeddings.

    Args:
        data: the cleaned review texts.
        label_data: DataFrame whose ``'label'`` column lines up row-for-row
            with ``data``. Defaults to the notebook-level ``dp_sample``, which
            is what this function always used before the parameter existed.

    Returns:
        ``(model, X_test, y_test)``: the trained model and the held-out test
        partition to evaluate it on.
    """
    if label_data is None:
        label_data = dp_sample
    embedding_matrix, vocab_size, inputs = make_embeddings(data, 50000)
    X_train, X_test, y_train, y_test = make_training_partition(inputs, label_data)
    model = Sequential([
        # Take the embedding width from the matrix so the two cannot disagree.
        Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix]),
        Bidirectional(LSTM(128, dropout=0.2)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dense(5, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=.00001), metrics=['accuracy'])
    # Up to 8 epochs, 20% of the training data held out for validation;
    # stop once val_loss has not improved for 3 epochs.
    model.fit(X_train, y_train, epochs=8, validation_split=0.2, callbacks=[EarlyStopping(monitor='val_loss', patience=3)])
    return model, X_test, y_test
    

    

## Training and Evaluation

In [42]:
# Split the tokenized inputs and the dp_sample labels into train/test partitions.
X_train, X_test, y_train, y_test = make_training_partition(inputs, dp_sample)

In [None]:
# Build the baseline model for the vocabulary size and training inputs prepared above.
baseline_model = make_baseline_model(num_words, max_length, X_train)
# Fit for up to 8 epochs with a 20% validation split, stopping early once
# val_loss has not improved for 3 epochs.
baseline_history = baseline_model.fit(X_train,y_train ,epochs=8, validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',patience=3)])

In [None]:
# Evaluate the baseline model on the held-out test partition.
predict(baseline_model, X_test, y_test)

In [None]:
# Train the Word2Vec-embedding model; it builds its own train/test split and
# returns the test partition alongside the model.
word_embed_model, X_test_embed, Y_test_embed = word2vec_models(dp_sample['text'])

In [None]:
# Evaluate the Word2Vec-embedding model on the test partition it returned.
predict(word_embed_model, X_test_embed, Y_test_embed)

In [None]:
# Train the LSTM model on the tokenized training partition.
lstm_model = lstm_test(num_words, X_train, y_train)

In [None]:
# Evaluate the LSTM model on the held-out test partition.
predict(lstm_model, X_test, y_test)

In [None]:
# Train the bidirectional LSTM with Word2Vec embeddings; it builds its own
# train/test split and returns the test partition alongside the model.
lstm_embed, X_embed_ltest, y_embed_ltest = lstm_test_with_embed(dp_sample['text'])

In [None]:
# Evaluate the Word2Vec LSTM on the test partition it returned.
predict(lstm_embed, X_embed_ltest, y_embed_ltest)

## Filtered Model

In [None]:
# Remove the second- and fourth-star reviews (labels 1 and 3).
# .copy() makes the result an independent frame, so assigning to its columns
# later does not trigger pandas' SettingWithCopyWarning on a slice of dp_sample.
df_sample_filtered = dp_sample[~dp_sample['label'].isin([1, 3])].copy()
df_sample_filtered

In [None]:
# Count how many reviews remain for each label after filtering.
df_sample_filtered['label'].value_counts()

In [None]:
# Clean the filtered reviews.
# NOTE(review): df_sample_filtered is derived from dp_sample, whose 'text'
# column was already passed through pre_process_data above, so this second
# pass looks redundant -- confirm before removing.
df_sample_filtered['text'] = df_sample_filtered['text'].apply(pre_process_data)

In [None]:
# Display the filtered, cleaned frame as the cell output.
df_sample_filtered

In [None]:
# Tokenize the filtered reviews (vocabulary capped at 50,000 words, padded to
# the average length). Note that this rebinds the notebook-level max_length.
filtered_inputs, filtered_words, max_length = create_sequences(df_sample_filtered['text'],50000, False, False)

In [None]:
# Split the filtered inputs and labels into train/test partitions.
X_train_fil, X_test_fil, y_train_fil, y_test_fil = make_training_partition(filtered_inputs, df_sample_filtered)
# Display the one-hot training labels as the cell output.
y_train_fil

In [None]:
# Train the LSTM model on the filtered data.
# NOTE(review): lstm_test still builds a 5-unit softmax output and the labels
# are one-hot encoded with depth 5, although labels 1 and 3 were removed.
filter_model = lstm_test(filtered_words, X_train_fil, y_train_fil)

In [None]:
# Evaluate the filtered-data LSTM on its held-out test partition.
predict(filter_model, X_test_fil, y_test_fil)