## Disneyland Review Rating Prediction

Given *reviews of Disneyland*, let's try to predict the **rating** associated with a given review.

We will use a TensorFlow/Keras text model with word embeddings to make our predictions.

Data source: https://www.kaggle.com/datasets/arushchillar/disneyland-reviews

In [2]:
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf

# Load the raw review table; the file is decoded as Latin-1 instead of
# pandas' default UTF-8.
data = pd.read_csv(
    "DisneylandReviews.csv",
    encoding="latin-1",
)

In [3]:
# Preview the first rows to see what each column contains.
data.head()

Unnamed: 0,Review_ID,Rating,Year_Month,Reviewer_Location,Review_Text,Branch
0,670772142,4,2019-4,Australia,If you've ever been to Disneyland anywhere you...,Disneyland_HongKong
1,670682799,4,2019-5,Philippines,Its been a while since d last time we visit HK...,Disneyland_HongKong
2,670623270,4,2019-4,United Arab Emirates,Thanks God it wasn t too hot or too humid wh...,Disneyland_HongKong
3,670607911,4,2019-4,Australia,HK Disneyland is a great compact park. Unfortu...,Disneyland_HongKong
4,670607296,4,2019-4,United Kingdom,"the location is not in the city, took around 1...",Disneyland_HongKong


In [4]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42656 entries, 0 to 42655
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Review_ID          42656 non-null  int64 
 1   Rating             42656 non-null  int64 
 2   Year_Month         42656 non-null  object
 3   Reviewer_Location  42656 non-null  object
 4   Review_Text        42656 non-null  object
 5   Branch             42656 non-null  object
dtypes: int64(2), object(4)
memory usage: 2.0+ MB


### Preprocessing

In [5]:
def get_sequences(texts, tokenizer, train=True, max_seq_length=None):
    """Encode texts as a post-padded 2-D array of token ids.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        train: When truthy, the padded width is taken from the longest
            encoded sequence in ``texts`` and any ``max_seq_length``
            argument is ignored.
        max_seq_length: Padded width to use when ``train`` is falsy. Pass the
            width of the training matrix so both splits line up. If left as
            ``None``, ``pad_sequences`` falls back to the longest sequence
            in ``texts``.

    Returns:
        The array produced by ``pad_sequences``, one row per input text.
    """
    sequences = tokenizer.texts_to_sequences(texts)

    if train:
        # ``default=0`` keeps an empty batch from raising (``np.max`` fails on
        # an empty list); the builtin also hands Keras a plain Python int.
        max_seq_length = max(map(len, sequences), default=0)

    return pad_sequences(sequences, maxlen=max_seq_length, padding='post')

In [6]:
def preprocess_inputs(df):
    """Split the reviews into train/test sets and encode the text as padded token ids.

    Args:
        df: DataFrame with a ``Review_Text`` column (feature) and a
            ``Rating`` column (target).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, tokenizer)``: the two
        padded sequence arrays, the matching rating Series, and the tokenizer
        fitted on the training reviews.
    """
    frame = df.copy()

    # Only the rating (target) and the review text (feature) are used
    ratings = frame['Rating']
    reviews = frame['Review_Text']

    # Hold out 30% of the rows; the fixed seed keeps the split repeatable
    text_train, text_test, y_train, y_test = train_test_split(
        reviews,
        ratings,
        train_size=0.7,
        shuffle=True,
        random_state=1,
    )

    # The vocabulary is learned from the training reviews only
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text_train)
    vocab_size = len(tokenizer.word_index) + 1
    print("Vocab length:", vocab_size)

    # The training text fixes the padded width; the test text is padded to match
    X_train = get_sequences(text_train, tokenizer, train=True)
    padded_width = X_train.shape[1]
    X_test = get_sequences(
        text_test,
        tokenizer,
        train=False,
        max_seq_length=padded_width,
    )

    return X_train, X_test, y_train, y_test, tokenizer

In [7]:
# Build the padded train/test matrices; `t` is the tokenizer fitted on the training reviews.
X_train, X_test, y_train, y_test, t = preprocess_inputs(data)

Vocab length: 37846


In [8]:
# (number of training reviews, padded sequence length)
X_train.shape

(29859, 3958)

In [9]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are as repeatable as the backend allows across re-runs.
tf.keras.utils.set_random_seed(1)

# Size the embedding table from the fitted tokenizer rather than a number
# copied from printed output; +1 because Keras word indices start at 1 and
# 0 is the padding id.
vocab_size = len(t.word_index) + 1

inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=64
    )(inputs)

# Concatenate every position's embedding into one long feature vector
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
# Single linear unit: the rating is treated as a regression target
outputs = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
        optimizer = 'adam',
        loss='mse'
    )

# Stop once validation loss has not improved for 3 epochs and roll back to
# the best weights seen, so `epochs=100` is only an upper bound.
history = model.fit(
        X_train,
        y_train,
        validation_split = 0.2,
        batch_size=32,
        epochs=100,
        callbacks=[
                tf.keras.callbacks.EarlyStopping(
                        monitor = 'val_loss',
                        patience=3,
                        restore_best_weights = True
                    )
            ]
    )

Epoch 1/100
747/747 ━━━━━━━━━━━━━━━━━━━━ 13s 14ms/step - loss: 2142.5559 - val_loss: 1.9667
Epoch 2/100
747/747 ━━━━━━━━━━━━━━━━━━━━ 17s 12ms/step - loss: 1.9507 - val_loss: 1.1636
Epoch 3/100
747/747 ━━━━━━━━━━━━━━━━━━━━ 8s 11ms/step - loss: 0.8338 - val_loss: 0.8880
Epoch 4/100
747/747 ━━━━━━━━━━━━━━━━━━━━ 11s 12ms/step - loss: 0.4418 - val_loss: 0.7749
Epoch 5/100
747/747 ━━━━━━━━━━━━━━━━━━━━ 8s 11ms/step - loss: 0.2314 - val_loss: 0.6749
Epoch 6/100
747/747 ━━━━━━━━━━━━━━━━━━━━ 8s 11ms/step - loss: 0.1943 - val_loss: 0.7340
Epoch 7/100
747/747 ━━━━━━━━━━━━━━━━━━━━ 10s 11ms/step - loss: 0.1207 - val_loss: 0.7109
Epoch 8/100
747/747 ━━━━━━━━━━━━━━━━━━━━ 11s 12ms/step - loss: 0.0846 - val_loss: 0.7218


### Results

In [11]:
# Predict a rating for every held-out review, then drop the length-1 axes
# so the predictions form a flat vector.
raw_pred = model.predict(X_test)
y_pred = np.squeeze(raw_pred)
y_pred

[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


array([4.712944 , 3.1573424, 4.4595146, ..., 4.10944  , 4.283192 ,
       4.449741 ], dtype=float32)

In [16]:
# Squared errors of the model, and of a baseline that always predicts the
# mean test rating (the denominator of R^2).
squared_error = (y_test - y_pred) ** 2
squared_dev = (y_test - y_test.mean()) ** 2

rmse = np.sqrt(np.mean(squared_error))
r2 = 1 - (np.sum(squared_error) / np.sum(squared_dev))

print("     RMSE: {:.2f}".format(rmse))
print("R^2 Score: {:.5f}".format(r2))

     RMSE: 0.81
R^2 Score: 0.40957
