In [1]:
import numpy as np
import os
import tensorflow as tf
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from tensorflow.keras import optimizers
from tensorflow.keras.regularizers import l2
from keras.layers import LSTM, Dense

In [None]:
reviews_data = pd.read_csv('output.csv')
print(reviews_data.dtypes)
reviews_data.head()

Unnamed: 0     int64
video game    object
username      object
date          object
score          int64
review        object
rating        object
dtype: object


Unnamed: 0.1,Unnamed: 0,video game,username,date,score,review,rating
0,0,the-lord-of-the-rings---gollum,vova22379,"Jul 27, 2023",0,⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀...,Negative
1,1,the-lord-of-the-rings---gollum,Xeculiar,"Jul 24, 2023",0,so horrendously bad I **** my pants and had a ...,Negative
2,2,the-lord-of-the-rings---gollum,Dirac,"Jul 22, 2023",1,This game is inspirational.\rAfter playing thi...,Negative
3,3,the-lord-of-the-rings---gollum,GyurMaGyuri7947,"Jul 20, 2023",0,Typical after-2013 game (huge hype and then hu...,Negative
4,4,the-lord-of-the-rings---gollum,JDear,"Jul 9, 2023",5,Better than I expected. Some cool ideas but mo...,Average


In [None]:
reviews_classes = reviews_data['rating']
reviews = reviews_data['review']

label_encoder = preprocessing.LabelEncoder()
review_classes = label_encoder.fit_transform(reviews_classes)

review_classes = tf.keras.utils.to_categorical(review_classes, 3)
print("One hot encoding shape: ", review_classes.shape)

One hot encoding shape:  (5441, 3)


In [None]:
MAX_LENGTH = max(reviews.apply(lambda x: len(x)))
VOCAB_WORDS = 1000

review_tokenizer = Tokenizer(num_words=VOCAB_WORDS)
review_tokenizer.fit_on_texts(reviews)

review_sequences = review_tokenizer.texts_to_sequences(reviews)
review_padded = pad_sequences(review_sequences, maxlen=MAX_LENGTH)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(review_padded,
                                                   review_classes,
                                                   test_size=0.2)

In [None]:
glove_dict = {}

with open('glove.6B.50d.txt','r',encoding='utf8') as glove_file:
    for line in glove_file:
        emb_line = line.split()
        emb_token = emb_line[0]
        emb_vector = np.array(emb_line[1:], dtype=np.float32)

        if emb_vector.shape[0] == 50:
            glove_dict[emb_token] = emb_vector

In [None]:
vocab_len = len(review_tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_len, 50))

for word, id in review_tokenizer.word_index.items():
    try:
        embedding_vector = glove_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[id] = embedding_vector
    except:
        pass

In [None]:
# Setup Hyper Parameters for building the model
NB_CLASSES=3

model = tf.keras.models.Sequential()
model.add(keras.layers.Embedding(vocab_len,
                                50,
                                name='Embedding-Layer',
                                weights=[embedding_matrix],
                                input_length=MAX_LENGTH,
                                trainable=True))

model.add(LSTM(256))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(NB_CLASSES, name='Output-Layer',
                            activation='softmax'))
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Embedding-Layer (Embedding)  (None, 4966, 50)         1242950   
                                                                 
 lstm (LSTM)                 (None, 256)               314368    
                                                                 
 flatten (Flatten)           (None, 256)               0         
                                                                 
 Output-Layer (Dense)        (None, 3)                 771       
                                                                 
Total params: 1,558,089
Trainable params: 1,558,089
Non-trainable params: 0
_________________________________________________________________


In [None]:
VERBOSE=1

BATCH_SIZE=256
EPOCHS=10
VALIDATION_SPLIT=0.2

history = model.fit(X_train, Y_train, batch_size=BATCH_SIZE,
                   epochs=EPOCHS,verbose=VERBOSE,validation_split=VALIDATION_SPLIT)

model.evaluate(X_test,Y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.7010186910629272, 0.7226813435554504]