## 1 Installs & Imports

In [None]:
# Disabled install cell: the pip command is wrapped in a string literal, so
# running this cell installs nothing. The pinned versions document the
# environment the notebook was developed against.
"""!pip install tensorflow==2.15.0 tensorflow-text==2.15.* tf-models-official==2.15.*"""

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import shutil
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras import layers, Model
from official.nlp import optimization
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)
tf.get_logger().setLevel('ERROR')  # To suppress TensorFlow logging

In [None]:
# Report the runtime environment: library versions, eager execution and
# whether TensorFlow can see at least one GPU.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"

print("TF Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", gpu_status)

TF Version:  2.15.0
Eager mode:  True
Hub version:  0.16.1
GPU is available


## 2 Data Loading

In [None]:
# Mount Google Drive at /content/drive so the dataset and saved models used by
# later cells are reachable through the local file system (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the modelling CSV on the mounted Google Drive.
file_path = '/content/drive/My Drive/Work Project/Data/Rotten Tomatoes/archive/df_modeling_regression.csv'

# Read the reviews into the notebook-wide DataFrame `df`.
df = pd.read_csv(file_path)

# Bare expression: Jupyter renders the frame as this cell's output.
df

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_date,review_content,rating,word_count,sentiment_polarity
0,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,3.50,37,0.000000
1,m/0814255,Nick Schager,False,Slant Magazine,Rotten,2010-02-10,Harry Potter knockoffs don't come more transpa...,1.25,17,-0.410805
2,m/0814255,Bill Goodykoontz,True,Arizona Republic,Fresh,2010-02-10,"Percy Jackson isn't a great movie, but it's a ...",3.50,23,0.253021
3,m/0814255,Jordan Hoffman,False,UGO,Fresh,2010-02-10,"Fun, brisk and imaginative",4.00,4,0.231060
4,m/0814255,Jim Schembri,True,The Age (Australia),Fresh,2010-02-10,"Crammed with dragons, set-destroying fights an...",3.00,30,-0.149372
...,...,...,...,...,...,...,...,...,...,...
753754,m/zulu,,False,Empire Magazine,Fresh,2017-07-10,As a spectacular war film with a powerful mora...,4.00,10,0.768415
753755,m/zulu_dawn,Ken Hanke,False,"Mountain Xpress (Asheville, NC)",Fresh,2007-03-07,"Seen today, it's not only a startling indictme...",3.50,18,0.246188
753756,m/zulu_dawn,Dennis Schwartz,False,Dennis Schwartz Movie Reviews,Fresh,2010-09-16,A rousing visual spectacle that's a prequel of...,4.30,14,-0.175630
753757,m/zulu_dawn,Christopher Lloyd,False,Sarasota Herald-Tribune,Rotten,2011-02-28,"A simple two-act story: Prelude to war, and th...",3.50,45,0.000000


### Basic Preprocessing

In [None]:
def preprocess_review(text):
    """Return the review text with leading and trailing whitespace removed.

    Cleaning is deliberately minimal: the text is otherwise left untouched so
    it can be handed to the BERT preprocessing layer as-is.
    """
    return text.strip()

# Clean every review in place by running it through preprocess_review.
df['review_content'] = df['review_content'].map(preprocess_review)

# Show the first few cleaned reviews as a sanity check.
df[['review_content']].head()

Unnamed: 0,review_content
0,Whether audiences will get behind The Lightnin...
1,Harry Potter knockoffs don't come more transpa...
2,"Percy Jackson isn't a great movie, but it's a ..."
3,"Fun, brisk and imaginative"
4,"Crammed with dragons, set-destroying fights an..."


## 3 Train-Test-Validation Split

Performing stratified train-test-validation split

In [None]:
# Bucketize review length by word count: (0, 15] -> short, (15, 30] -> medium,
# (30, inf) -> long.
df['length_bucket'] = pd.cut(df['word_count'], bins=[0, 15, 30, np.inf], labels=['short', 'medium', 'long'])

# Reviews whose polarity magnitude is at least 0.75 are 'strong'; everything
# else is 'neutral_or_moderate'. Vectorized equivalent of the per-row check
# `x >= 0.75 or x <= -0.75`.
df['sentiment_group'] = np.where(
    df['sentiment_polarity'].abs() >= 0.75, 'strong', 'neutral_or_moderate'
)

# Columns whose joint distribution is preserved across the splits.
stratify_cols = ['length_bucket', 'sentiment_group']

# Step 1: 80% train / 20% temp, stratified on length bucket and sentiment group.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, temp_idx = next(split.split(df, df[stratify_cols]))
train_set = df.iloc[train_idx]
temp_set = df.iloc[temp_idx]

# Step 2: split the temp set in half (50% validation, 50% test), again stratified.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_idx, test_idx = next(split.split(temp_set, temp_set[stratify_cols]))
val_set = temp_set.iloc[val_idx]
test_set = temp_set.iloc[test_idx]

# Size of each split.
print("Training Set Size:", len(train_set))
print("Validation Set Size:", len(val_set))
print("Test Set Size:", len(test_set))


def _print_group_sizes(split_name, split_frame):
    """Print how many rows of `split_frame` fall in each length/sentiment cell."""
    print(f"\n{split_name} Set Distribution by Length and Sentiment:")
    # `observed` is passed explicitly because 'length_bucket' is categorical;
    # relying on the implicit default emits a pandas FutureWarning.
    print(split_frame.groupby(stratify_cols, observed=False).size())


# Distribution of the stratification keys in each split.
_print_group_sizes("Training", train_set)
_print_group_sizes("Validation", val_set)
_print_group_sizes("Test", test_set)

Training Set Size: 603007
Validation Set Size: 75376
Test Set Size: 75376

Training Set Distribution by Length and Sentiment:
length_bucket  sentiment_group    
short          neutral_or_moderate    161648
               strong                  15289
medium         neutral_or_moderate    311756
               strong                  11190
long           neutral_or_moderate    100694
               strong                   2430
dtype: int64

Validation Set Distribution by Length and Sentiment:
length_bucket  sentiment_group    
short          neutral_or_moderate    20206
               strong                  1912
medium         neutral_or_moderate    38970
               strong                  1398
long           neutral_or_moderate    12587
               strong                   303
dtype: int64

Test Set Distribution by Length and Sentiment:
length_bucket  sentiment_group    
short          neutral_or_moderate    20206
               strong                  1911
medium         neut

  print(train_set.groupby(['length_bucket', 'sentiment_group']).size())
  print(val_set.groupby(['length_bucket', 'sentiment_group']).size())
  print(test_set.groupby(['length_bucket', 'sentiment_group']).size())


In [None]:
# Summary statistics of the target ('rating') in each split, to confirm the
# three sets have comparable target distributions.
for split_name, split_frame in (
    ("Training", train_set),
    ("Validation", val_set),
    ("Test", test_set),
):
    print(f"\n{split_name} Set Rating Distribution:")
    print(split_frame['rating'].describe())


Training Set Rating Distribution:
count    603007.000000
mean          3.263357
std           1.023411
min           1.000000
25%           2.500000
50%           3.300000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

Validation Set Rating Distribution:
count    75376.000000
mean         3.270919
std          1.027275
min          1.000000
25%          2.500000
50%          3.500000
75%          4.000000
max          5.000000
Name: rating, dtype: float64

Test Set Rating Distribution:
count    75376.000000
mean         3.263545
std          1.026809
min          1.000000
25%          2.500000
50%          3.300000
75%          4.000000
max          5.000000
Name: rating, dtype: float64


To make sure that review length and sentiment strength are proportionately represented across all datasets, stratified sampling on length_bucket and sentiment_group is employed in the train-test-validation split. When dealing with unbalanced data, where some groups (for example, long reviews with strong sentiment) are underrepresented, this is particularly crucial. More dependable and broadly applicable model performance results from stratified sampling, which ensures that the proportion of each group is maintained in each split (training, validation, and test). The target variable, rating, is not itself a stratification key, but the summary statistics above show that its distribution is nearly identical across the three splits. Keeping review lengths and sentiment groups consistent across splits facilitates equitable assessment and training that takes into account the possible impacts of these variables on model performance.

## 4 Creating TensorFlow datasets

In order to ensure effective handling and processing of large datasets, creating TensorFlow datasets is an essential step in preparing data for model training. This process overlaps data preprocessing with model execution, which optimizes memory usage and speeds up training through batching and prefetching data. It also makes it possible to create organized and consistent input pipelines, which are necessary for supplying data to deep learning models such as BERT. By minimizing input-output bottlenecks during execution, these steps not only improve model performance but also streamline the training process.

Shift the ratings down by 1 so that the regression targets range from 0 to 4 instead of 1 to 5

In [None]:
# Review texts of each split as plain Python lists.
train_texts, val_texts, test_texts = (
    split_frame['review_content'].tolist()
    for split_frame in (train_set, val_set, test_set)
)

# Regression targets of each split: the rating shifted down by 1.
train_labels, val_labels, test_labels = (
    split_frame['rating'].to_numpy() - 1
    for split_frame in (train_set, val_set, test_set)
)

# Show the distinct target values per split to confirm the shift.
for split_name, split_labels in (
    ("train", train_labels),
    ("val", val_labels),
    ("test", test_labels),
):
    print(f"Adjusted {split_name} labels: {np.unique(split_labels)}")

Adjusted train labels: [0.         0.05       0.1        0.10141326 0.1062605  0.1411515
 0.15       0.2        0.24       0.25       0.3        0.325
 0.35       0.375      0.4        0.45       0.5        0.5105
 0.5391     0.55       0.5625     0.6        0.60605    0.6112115
 0.62       0.625      0.65       0.66666667 0.7        0.7
 0.75       0.7575     0.775      0.781695   0.8        0.825
 0.85       0.875      0.87605    0.88       0.9        0.91706219
 0.925      0.95       0.995      1.         1.00005    1.0105
 1.02762    1.05       1.0718205  1.08333333 1.1        1.105
 1.1055     1.1056     1.10691171 1.11       1.12       1.125
 1.15       1.16666667 1.1821105  1.1875     1.2        1.21
 1.225      1.25       1.255      1.2605     1.27272727 1.275
 1.279      1.28       1.3        1.3        1.30769231 1.325
 1.35       1.375      1.395      1.4        1.4135     1.425
 1.43       1.445      1.446075   1.45       1.475      1.4917163
 1.49185    1.5        1.505   

In [None]:
# Round every target to two decimal places.
train_labels, val_labels, test_labels = (
    np.round(label_array, 2)
    for label_array in (train_labels, val_labels, test_labels)
)

# Show the distinct target values per split after rounding.
for split_name, split_labels in (
    ("train", train_labels),
    ("val", val_labels),
    ("test", test_labels),
):
    print(f"Rounded {split_name} labels: {np.unique(split_labels)}")

Rounded train labels: [0.   0.05 0.1  0.11 0.14 0.15 0.2  0.24 0.25 0.3  0.33 0.35 0.38 0.4
 0.45 0.5  0.51 0.54 0.55 0.56 0.6  0.61 0.62 0.65 0.67 0.7  0.75 0.76
 0.77 0.78 0.8  0.82 0.85 0.88 0.9  0.92 0.95 1.   1.01 1.03 1.05 1.07
 1.08 1.1  1.11 1.12 1.15 1.17 1.18 1.19 1.2  1.21 1.23 1.25 1.26 1.27
 1.28 1.3  1.31 1.33 1.35 1.38 1.4  1.41 1.42 1.43 1.44 1.45 1.48 1.49
 1.5  1.51 1.55 1.56 1.6  1.61 1.62 1.64 1.65 1.67 1.7  1.71 1.72 1.73
 1.74 1.75 1.76 1.78 1.8  1.81 1.83 1.84 1.85 1.86 1.88 1.89 1.9  1.91
 1.92 1.94 1.95 1.98 1.99 2.   2.04 2.05 2.06 2.08 2.1  2.12 2.13 2.15
 2.17 2.18 2.2  2.22 2.24 2.25 2.28 2.3  2.33 2.34 2.35 2.38 2.39 2.4
 2.42 2.43 2.44 2.45 2.48 2.49 2.5  2.51 2.52 2.54 2.55 2.56 2.58 2.59
 2.6  2.61 2.62 2.63 2.65 2.68 2.69 2.7  2.71 2.72 2.74 2.75 2.76 2.77
 2.78 2.8  2.81 2.82 2.85 2.88 2.9  2.91 2.92 2.94 2.95 2.97 2.98 2.99
 3.   3.03 3.05 3.06 3.08 3.1  3.11 3.12 3.13 3.15 3.16 3.17 3.18 3.19
 3.2  3.21 3.22 3.23 3.24 3.25 3.27 3.28 3.29 3.3  3.31 3

Create TF datasets by setting autotune and batch size

In [None]:
# TensorFlow AUTOTUNE and batch size
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 128


def create_dataset(texts, labels, batch_size=batch_size, shuffle=True):
    """Build a batched, prefetched tf.data pipeline of (text, label) pairs.

    Args:
        texts: Sequence of review strings.
        labels: Sequence of numeric targets, aligned element-wise with `texts`.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle the examples (use True for training only).

    Returns:
        A `tf.data.Dataset` yielding `(text_batch, label_batch)` tuples.
    """
    dataset = tf.data.Dataset.from_tensor_slices((texts, labels))

    # Cache BEFORE shuffling and batching. Caching after them (as this cell
    # previously did) stores the first epoch's order and batch composition,
    # so every later epoch replays exactly the same batches.
    dataset = dataset.cache()

    if shuffle:
        # A full-size buffer gives a uniform shuffle; the fixed seed keeps runs
        # reproducible while the order is still redrawn on every epoch.
        dataset = dataset.shuffle(
            buffer_size=len(texts), seed=42, reshuffle_each_iteration=True
        )

    # Prefetch once, as the final step, to overlap input preparation with training.
    return dataset.batch(batch_size).prefetch(buffer_size=AUTOTUNE)


# Only the training set is shuffled; validation and test keep their original
# order so predictions stay aligned with val_set / test_set rows.
train_ds = create_dataset(train_texts, train_labels, shuffle=True)
val_ds = create_dataset(val_texts, val_labels, shuffle=False)
test_ds = create_dataset(test_texts, test_labels, shuffle=False)

Inspection of newly created datasets to ensure correctness

In [None]:
# Print the contents of the first two training batches.
for batch_number, (text_batch, label_batch) in enumerate(train_ds.take(2), start=1):
    print(f"Batch {batch_number} - Text batch: {text_batch}")
    print(f"Batch {batch_number} - Label batch: {label_batch}\n")

Batch 1 - Text batch: [b'This is purely gutter, fanboy, Cheez Whiz cinema, but Alexander delivers the goods.'
 b"The horror prequel recycles every horror trope you've seen over the past decade ... and then things get interesting."
 b'..."Stop Loss" not only has something new to say, it is in many ways a smaller, modern day "Deer Hunter."'
 b'The personal stories are informative, interesting and nicely illustrated but the central meeting between the three men lacks both passion and personality.'
 b'Choosing its metaphors with great care, wistfully contemplating the nature of man\'s worth to his country, and elegantly juxtaposing the spirit of a singular woman with her country, "Sunset Song" is a remarkable experience.'
 b"His account of Anderson's demise moves, but by then, this arch-raconteur's bluster may have sand-papered your patience."
 b"Expect a fast-paced, beautifully mounted and well-acted soap opera with overripe dialogue that plays fast and loose with history -- just like the

In [None]:
# Shape of the label array of each split.
for split_name, split_labels in (
    ("Train", train_labels),
    ("Val", val_labels),
    ("Test", test_labels),
):
    print(f"{split_name} labels shape: {split_labels.shape}")

Train labels shape: (603007,)
Val labels shape: (75376,)
Test labels shape: (75376,)


Checking shape of text and label batches

In [None]:
# Shapes of the text and label tensors in the first training batch.
for text_batch, label_batch in train_ds.take(1):
    for batch_kind, batch_tensor in (("Text", text_batch), ("Label", label_batch)):
        print(f'{batch_kind} batch shape: {batch_tensor.shape}')

Text batch shape: (128,)
Label batch shape: (128,)


# 5 Small BERT

## 5.1 Baseline-BERT

In [None]:
# TensorFlow Hub layers shared by the model: the uncased-English BERT text
# preprocessor and a Small BERT encoder (L-4, H-512, A-8) that is fine-tuned
# together with the regression head (trainable=True).
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2", trainable=True)


def build_bert_regression_model():
    """Assemble and compile the Small BERT rating regressor.

    The model takes raw review strings, runs them through the Hub preprocessing
    and encoder layers, applies dropout (rate 0.3) to the encoder's pooled
    output and maps it to one unbounded scalar with a Dense(1) head.

    Returns:
        A compiled `tf.keras.Model` trained with mean squared error (Adam,
        learning rate 3e-5) that also reports mean absolute error.
    """
    # Scalar string input: one raw review per example.
    text_input = layers.Input(shape=(), dtype=tf.string, name='review_content')

    # Tokenize, then encode; only the pooled sentence-level vector is used.
    encoder_outputs = bert_encoder(bert_preprocess(text_input))
    regularized = layers.Dropout(0.3)(encoder_outputs['pooled_output'])

    # Linear head producing a single continuous prediction.
    rating_output = layers.Dense(1, name='output')(regularized)

    model = tf.keras.Model(inputs=text_input, outputs=rating_output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
        loss='mean_squared_error',
        metrics=['mae'],
    )
    return model


# Build the model and print its architecture.
regression_model = build_bert_regression_model()
regression_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 review_content (InputLayer  [(None,)]                    0         []                            
 )                                                                                                
                                                                                                  
 keras_layer (KerasLayer)    {'input_type_ids': (None,    0         ['review_content[0][0]']      
                             128),                                                                
                              'input_mask': (None, 128)                                           
                             , 'input_word_ids': (None,                                           
                              128)}                                                           

In [None]:
# Stop once validation MAE has failed to improve for 3 consecutive epochs and
# roll the model back to its best weights.
early_stopping = EarlyStopping(monitor='val_mae', patience=3, restore_best_weights=True)

# Write the model to Google Drive whenever validation MAE reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath='/content/drive/My Drive/Work Project/Models/best_model.h5',
    monitor='val_mae',
    mode='min',
    save_best_only=True,
    verbose=1,
)

training_callbacks = [early_stopping, checkpoint]

# Fine-tune on the first GPU for at most 10 epochs, validating after each one.
with tf.device('/GPU:0'):
    history = regression_model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=10,
        callbacks=training_callbacks,
    )

Epoch 1/10
Epoch 1: val_mae improved from inf to 0.55582, saving model to /content/drive/My Drive/Work Project/Models/best_model.h5


  saving_api.save_model(


Epoch 2/10
Epoch 2: val_mae improved from 0.55582 to 0.53471, saving model to /content/drive/My Drive/Work Project/Models/best_model.h5
Epoch 3/10
Epoch 3: val_mae improved from 0.53471 to 0.52514, saving model to /content/drive/My Drive/Work Project/Models/best_model.h5
Epoch 4/10
Epoch 4: val_mae improved from 0.52514 to 0.52110, saving model to /content/drive/My Drive/Work Project/Models/best_model.h5
Epoch 5/10
Epoch 5: val_mae improved from 0.52110 to 0.51677, saving model to /content/drive/My Drive/Work Project/Models/best_model.h5
Epoch 6/10
Epoch 6: val_mae did not improve from 0.51677
Epoch 7/10
Epoch 7: val_mae did not improve from 0.51677
Epoch 8/10
Epoch 8: val_mae improved from 0.51677 to 0.51546, saving model to /content/drive/My Drive/Work Project/Models/best_model.h5
Epoch 9/10
Epoch 9: val_mae improved from 0.51546 to 0.51250, saving model to /content/drive/My Drive/Work Project/Models/best_model.h5
Epoch 10/10
Epoch 10: val_mae did not improve from 0.51250


In [None]:
# Persist the trained regression model to Google Drive so it can be reloaded
# without retraining.
regression_model.save('/content/drive/My Drive/Work Project/Models/BERT_Cont_Rating_Regressor')

In [None]:
# Reload the regressor from the directory the save cell writes to
# ('BERT_Cont_Rating_Regressor'). This cell previously pointed at a differently
# named directory ('SmallBERT_Rating_Regressor'), so a fresh run would have
# evaluated a stale or missing artifact instead of the model trained above.
model_path = '/content/drive/My Drive/Work Project/Models/BERT_Cont_Rating_Regressor'
regression_model = load_model(model_path)

print("Model loaded successfully!")

Model loaded successfully!


In [None]:
# Evaluate the model on the held-out test set. The model is compiled with MSE
# as the loss and MAE as its only metric, so evaluate() returns [loss, mae].
test_loss, test_mae = regression_model.evaluate(test_ds)

# Report both error measures to four decimal places.
print(f'Test Loss (MSE): {test_loss:.4f}')
print(f'Test Mean Absolute Error (MAE): {test_mae:.4f}')

Test Loss (MSE): 0.4678
Test Mean Absolute Error (MAE): 0.5154



The results, with a Mean Squared Error (MSE) of 0.4678 and a Mean Absolute Error (MAE) of 0.5154 on a rating prediction scale from 1 to 5, demonstrate that the model achieves an average prediction error of approximately 0.52 rating points. This indicates that the model performs reasonably well, making relatively small errors, with the MAE representing around 13% of the 4-point span of the rating scale. The MSE of about 0.47 (an RMSE of roughly 0.68) suggests that the model does not frequently produce extreme errors. Overall, the model provides a satisfactory level of accuracy for predicting ratings, suggesting it is well-suited for the task.

Analyzing Test Set

In [None]:
# Continuous predictions for the test set; test_ds is built without shuffling,
# so the rows come back in the same order as test_texts / test_labels.
y_pred_continuous = regression_model.predict(test_ds)



In [None]:
# Index labels of the test rows, in the SAME order as test_set. That is also the
# order of test_labels and of the predictions, because test_ds is not shuffled.
# Filtering df.index with isin() (as this cell previously did) returns the rows
# in df order instead, which pairs each prediction with the length bucket and
# sentiment group of a different review.
test_indices = test_set.index

y_pred_flat = y_pred_continuous.flatten()

# Fail loudly rather than silently truncating if predictions and rows disagree.
if not (len(y_pred_flat) == len(test_labels) == len(test_set)):
    raise ValueError(
        f"Length mismatch: {len(y_pred_flat)} predictions, "
        f"{len(test_labels)} labels, {len(test_set)} test rows"
    )

# One row per test review: target, prediction and its two analysis groups.
results_df = pd.DataFrame({
    'y_true': test_labels,  # True targets for the test set
    'y_pred': y_pred_flat,  # Predicted continuous values
    'length_bucket': test_set['length_bucket'].values,  # Row-aligned with y_true / y_pred
    'sentiment_group': test_set['sentiment_group'].values  # Row-aligned with y_true / y_pred
})

# Display the results DataFrame
print(results_df.head())

   y_true    y_pred length_bucket      sentiment_group
0     2.5  2.446937        medium  neutral_or_moderate
1     1.0  1.594142         short  neutral_or_moderate
2     1.0  0.762507         short  neutral_or_moderate
3     2.0  2.003628         short  neutral_or_moderate
4     1.7  2.965239          long  neutral_or_moderate


In [None]:
# MAE per review-length bucket. `observed=True` restricts the result to the
# categories actually present, and `include_groups=False` keeps the grouping
# column out of the frame handed to the lambda; together they avoid the pandas
# deprecation warnings and match the validation-set analysis.
mae_by_length = results_df.groupby('length_bucket', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# MAE per sentiment polarity group.
mae_by_sentiment = results_df.groupby('sentiment_group', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# MAE per (length bucket, sentiment group) combination.
mae_by_length_and_sentiment = results_df.groupby(['length_bucket', 'sentiment_group'], observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Print the three breakdowns.
print("MAE by Review Length (Test Set):")
print(mae_by_length)

print("\nMAE by Sentiment Group (Test Set):")
print(mae_by_sentiment)

print("\nMAE by Review Length and Sentiment Group (Test Set):")
print(mae_by_length_and_sentiment)

MAE by Review Length (Test Set):
length_bucket
short     0.514083
medium    0.517144
long      0.512319
dtype: float64

MAE by Sentiment Group (Test Set):
sentiment_group
neutral_or_moderate    0.515242
strong                 0.518973
dtype: float64

MAE by Review Length and Sentiment Group (Test Set):
length_bucket  sentiment_group    
short          neutral_or_moderate    0.513743
               strong                 0.517672
medium         neutral_or_moderate    0.517296
               strong                 0.512900
long           neutral_or_moderate    0.511286
               strong                 0.555102
dtype: float64


  mae_by_length = results_df.groupby('length_bucket').apply(
  mae_by_length = results_df.groupby('length_bucket').apply(
  mae_by_sentiment = results_df.groupby('sentiment_group').apply(
  mae_by_length_and_sentiment = results_df.groupby(['length_bucket', 'sentiment_group']).apply(
  mae_by_length_and_sentiment = results_df.groupby(['length_bucket', 'sentiment_group']).apply(


The model's performance is fairly consistent across different review lengths and sentiment groups. The small variations in MAE suggest that while review length and sentiment polarity may have some effect, they do not drastically impact prediction accuracy.

Analyzing Validation Set

In [None]:
y_pred_val_probs = regression_model.predict(val_ds)  # Continuous predictions for the validation set (regression outputs, not probabilities)
y_pred_val = y_pred_val_probs.flatten()  # Collapse the prediction array to one dimension



In [None]:
# Collect the true targets batch by batch. val_ds is built without shuffling,
# so they come out in the same order as val_set (and as the predictions).
y_true_val = []
for batch in val_ds:
    y_true_val.extend(batch[1].numpy())

y_true_val = np.array(y_true_val)  # Convert the list to numpy array

# Index labels of the validation rows, in the SAME order as val_set. Filtering
# df.index with isin() (as this cell previously did) returns the rows in df
# order instead, which pairs each prediction with the length bucket and
# sentiment group of a different review.
val_indices = val_set.index

# Fail loudly rather than silently truncating if the pieces disagree in length.
if not (len(y_true_val) == len(y_pred_val) == len(val_set)):
    raise ValueError(
        f"Length mismatch: {len(y_pred_val)} predictions, "
        f"{len(y_true_val)} labels, {len(val_set)} validation rows"
    )

# One row per validation review: target, prediction and its two analysis groups.
results_val_df = pd.DataFrame({
    'y_true': y_true_val,  # True continuous labels for validation set
    'y_pred': y_pred_val,  # Predicted continuous values
    'length_bucket': val_set['length_bucket'].values,  # Row-aligned with y_true / y_pred
    'sentiment_group': val_set['sentiment_group'].values  # Row-aligned with y_true / y_pred
})

# MAE per review-length bucket for the validation set.
mae_by_length_val = results_val_df.groupby('length_bucket', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# MAE per sentiment polarity group for the validation set.
mae_by_sentiment_val = results_val_df.groupby('sentiment_group', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# MAE per (length bucket, sentiment group) combination for the validation set.
mae_by_length_and_sentiment_val = results_val_df.groupby(['length_bucket', 'sentiment_group'], observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Print the three breakdowns.
print("MAE by Review Length (Validation Set):")
print(mae_by_length_val)

print("\nMAE by Sentiment Group (Validation Set):")
print(mae_by_sentiment_val)

print("\nMAE by Review Length and Sentiment Group (Validation Set):")
print(mae_by_length_and_sentiment_val)

MAE by Review Length (Validation Set):
length_bucket
short     0.520482
medium    0.515013
long      0.515780
dtype: float64

MAE by Sentiment Group (Validation Set):
sentiment_group
neutral_or_moderate    0.516130
strong                 0.529039
dtype: float64

MAE by Review Length and Sentiment Group (Validation Set):
length_bucket  sentiment_group    
short          neutral_or_moderate    0.519096
               strong                 0.535124
medium         neutral_or_moderate    0.514941
               strong                 0.517009
long           neutral_or_moderate    0.515049
               strong                 0.546140
dtype: float64


## 5.2 Small BERT Multi-Feature

Checking for NaNs

In [None]:
# Count the missing values in every column of df.
nan_summary = df.isnull().sum()

# Show only the columns that actually contain missing values.
columns_with_nans = nan_summary.loc[nan_summary.gt(0)]
print(columns_with_nans)

# Then the complete per-column summary.
print("\nFull NaN summary:")
print(nan_summary)

critic_name    5931
dtype: int64

Full NaN summary:
rotten_tomatoes_link       0
critic_name             5931
top_critic                 0
publisher_name             0
review_type                0
review_date                0
review_content             0
rating                     0
word_count                 0
sentiment_polarity         0
dtype: int64


In [None]:
# Drop rows where 'critic_name' is missing. The explicit .copy() makes the
# result an independent frame, so the column assignments made in later cells
# act on it directly instead of relying on a suppressed SettingWithCopyWarning.
df = df.dropna(subset=['critic_name']).copy()

# Confirm that no missing values remain.
print(df.isna().sum())

rotten_tomatoes_link    0
critic_name             0
top_critic              0
publisher_name          0
review_type             0
review_date             0
review_content          0
rating                  0
word_count              0
sentiment_polarity      0
dtype: int64


In [None]:
# Number of distinct critics and publishers, i.e. the cardinality of the two
# categorical features.
unique_critics, unique_publishers = (
    df[column_name].nunique() for column_name in ('critic_name', 'publisher_name')
)

print(f"Unique critic names: {unique_critics}")
print(f"Unique publisher names: {unique_publishers}")

Unique critic names: 6821
Unique publisher names: 1620


In [None]:
# Encode the boolean top-critic flag as 1/0.
df['top_critic'] = df['top_critic'].astype(int)

# Raw review text, fed to BERT.
texts = df['review_content'].to_numpy()

# Auxiliary features.
top_critic = df['top_critic'].to_numpy()                  # binary flag
word_count = df['word_count'].to_numpy()                  # review length in words
sentiment_polarity = df['sentiment_polarity'].to_numpy()  # numeric, used unscaled

# Standardize word_count to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row of df, i.e. without separating
# training rows from validation/test rows - confirm this is intended.
scaler = StandardScaler()
word_count_column = word_count.reshape(-1, 1)
word_count_scaled = scaler.fit(word_count_column).transform(word_count_column)

In [None]:
# Store the standardized word count on the frame so it can be inspected next to
# the raw value.
df['word_count_scaled'] = word_count_scaled

# First rows of the numeric features.
inspected_columns = ['top_critic', 'word_count', 'word_count_scaled', 'sentiment_polarity']
print(df[inspected_columns].head())

# Summary statistics before and after scaling.
print("\nSummary of word_count before scaling:")
print(df['word_count'].describe())

print("\nSummary of word_count_scaled after scaling:")
scaled_frame = pd.DataFrame(word_count_scaled, columns=['word_count_scaled'])
print(scaled_frame.describe())

   top_critic  word_count  word_count_scaled  sentiment_polarity
0           0          37           1.675046            0.000000
1           0          17          -0.450567           -0.410805
2           1          23           0.187117            0.253021
3           0           4          -1.832216            0.231060
4           1          30           0.931082           -0.149372

Summary of word_count before scaling:
count    747828.000000
mean         21.239407
std           9.409055
min           1.000000
25%          14.000000
50%          21.000000
75%          28.000000
max          55.000000
Name: word_count, dtype: float64

Summary of word_count_scaled after scaling:
       word_count_scaled
count       7.478280e+05
mean        1.332099e-17
std         1.000001e+00
min        -2.151058e+00
25%        -7.694090e-01
50%        -2.544430e-02
75%         7.185204e-01
max         3.588099e+00


In [None]:
# Bare expression: Jupyter renders the frame, now including 'word_count_scaled'.
df

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_date,review_content,rating,word_count,sentiment_polarity,word_count_scaled
0,m/0814255,Ben McEachen,0,Sunday Mail (Australia),Fresh,2010-02-09,Whether audiences will get behind The Lightnin...,3.50,37,0.000000,1.675046
1,m/0814255,Nick Schager,0,Slant Magazine,Rotten,2010-02-10,Harry Potter knockoffs don't come more transpa...,1.25,17,-0.410805,-0.450567
2,m/0814255,Bill Goodykoontz,1,Arizona Republic,Fresh,2010-02-10,"Percy Jackson isn't a great movie, but it's a ...",3.50,23,0.253021,0.187117
3,m/0814255,Jordan Hoffman,0,UGO,Fresh,2010-02-10,"Fun, brisk and imaginative",4.00,4,0.231060,-1.832216
4,m/0814255,Jim Schembri,1,The Age (Australia),Fresh,2010-02-10,"Crammed with dragons, set-destroying fights an...",3.00,30,-0.149372,0.931082
...,...,...,...,...,...,...,...,...,...,...,...
753752,m/zulu,Tony Sloman,0,Radio Times,Fresh,2017-07-10,The movie is a revelation.,5.00,5,0.469339,-1.725935
753755,m/zulu_dawn,Ken Hanke,0,"Mountain Xpress (Asheville, NC)",Fresh,2007-03-07,"Seen today, it's not only a startling indictme...",3.50,18,0.246188,-0.344286
753756,m/zulu_dawn,Dennis Schwartz,0,Dennis Schwartz Movie Reviews,Fresh,2010-09-16,A rousing visual spectacle that's a prequel of...,4.30,14,-0.175630,-0.769409
753757,m/zulu_dawn,Christopher Lloyd,0,Sarasota Herald-Tribune,Rotten,2011-02-28,"A simple two-act story: Prelude to war, and th...",3.50,45,0.000000,2.525292


In [None]:
# Integer-encode the two high-cardinality categorical columns, writing the
# codes to '<column>_encoded'.
publisher_encoder = LabelEncoder()
critic_encoder = LabelEncoder()

for name_encoder, source_column in (
    (publisher_encoder, 'publisher_name'),
    (critic_encoder, 'critic_name'),
):
    df[f'{source_column}_encoded'] = name_encoder.fit_transform(df[source_column])

# Standardize 'word_count' to zero mean and unit variance.
scaler = StandardScaler()
df['word_count_scaled'] = scaler.fit_transform(df[['word_count']].to_numpy())

In [None]:
# Ensure 'publisher_name' and 'critic_name' exist in the DataFrame.
# Fail fast with an explicit message: both the encoding and the preview below
# need these columns, so continuing after a mere print would only crash later.
missing_columns = [
    column for column in ('publisher_name', 'critic_name') if column not in df.columns
]
if missing_columns:
    raise KeyError(f"Error: columns missing from the dataframe: {missing_columns}")

# Step 1: Encode 'publisher_name' and 'critic_name' as integers using LabelEncoder
publisher_encoder = LabelEncoder()
critic_encoder = LabelEncoder()

df['publisher_name_encoded'] = publisher_encoder.fit_transform(df['publisher_name'])
df['critic_name_encoded'] = critic_encoder.fit_transform(df['critic_name'])

# Now, check if the encoding worked
print(df[['publisher_name', 'publisher_name_encoded', 'critic_name', 'critic_name_encoded']].head())

            publisher_name  publisher_name_encoded       critic_name  \
0  Sunday Mail (Australia)                    1261      Ben McEachen   
1           Slant Magazine                    1213      Nick Schager   
2         Arizona Republic                      58  Bill Goodykoontz   
3                      UGO                    1470    Jordan Hoffman   
4      The Age (Australia)                    1287      Jim Schembri   

   critic_name_encoded  
0                  693  
1                 5035  
2                  759  
3                 3413  
4                 3151  


In [None]:
# Step 1: Label encode 'publisher_name' and 'critic_name'
publisher_encoder = LabelEncoder()
critic_encoder = LabelEncoder()

df['publisher_name_encoded'] = publisher_encoder.fit_transform(df['publisher_name'])
df['critic_name_encoded'] = critic_encoder.fit_transform(df['critic_name'])

# Step 2: Normalize 'word_count' using StandardScaler
# NOTE(review): the scaler is fit on the full frame before the split below, so its
# mean/std also reflect validation and test rows. Fit on the training rows only if
# strict train/test separation matters for this experiment.
scaler = StandardScaler()
df['word_count_scaled'] = scaler.fit_transform(df['word_count'].values.reshape(-1, 1))

# 'sentiment_polarity' is used as-is (no scaling is applied to it here)

# Step 3: Create 'length_bucket' and 'sentiment_group'
# Buckets are right-inclusive: (0, 15] short, (15, 30] medium, (30, inf) long.
df['length_bucket'] = pd.cut(df['word_count'], bins=[0, 15, 30, np.inf], labels=['short', 'medium', 'long'])
# 'strong' means |polarity| >= 0.75; everything else (including NaN) is 'neutral_or_moderate'
df['sentiment_group'] = np.where(
    df['sentiment_polarity'].abs() >= 0.75, 'strong', 'neutral_or_moderate'
)

# Step 4: Stratified split based on 'length_bucket' and 'sentiment_group'
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Create train (80%) and temp (20%) sets while preserving 'length_bucket' and 'sentiment_group'
# (n_splits=1, so the generator yields exactly one pair of index arrays)
train_idx, temp_idx = next(split.split(df, df[['length_bucket', 'sentiment_group']]))
train_set = df.iloc[train_idx]
temp_set = df.iloc[temp_idx]

# Step 5: Stratified split on the temp set (50% validation, 50% test)
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

val_idx, test_idx = next(split.split(temp_set, temp_set[['length_bucket', 'sentiment_group']]))
val_set = temp_set.iloc[val_idx]
test_set = temp_set.iloc[test_idx]

# The three sets must partition the frame exactly
assert len(train_set) + len(val_set) + len(test_set) == len(df)

# Checking the distribution in each set
print("Training Set Size:", len(train_set))
print("Validation Set Size:", len(val_set))
print("Test Set Size:", len(test_set))

# Optional: Check the distribution of 'length_bucket' and 'sentiment_group' in each set.
# observed=True is passed explicitly because 'length_bucket' is categorical and
# relying on the default triggers a pandas FutureWarning.
for set_name, subset in (("Training", train_set), ("Validation", val_set), ("Test", test_set)):
    print(f"\n{set_name} Set Distribution by Length and Sentiment:")
    print(subset.groupby(['length_bucket', 'sentiment_group'], observed=True).size())

# Step 6: Extract text and additional features from the sets
train_texts = train_set['review_content'].tolist()
val_texts = val_set['review_content'].tolist()
test_texts = test_set['review_content'].tolist()

# Extract labels and shift them down by one (a 1-5 rating scale becomes 0-4)
train_labels = train_set['rating'].values - 1
val_labels = val_set['rating'].values - 1
test_labels = test_set['rating'].values - 1

# Extract other features: publisher_name, critic_name, word_count_scaled, sentiment_polarity
train_publishers = train_set['publisher_name_encoded'].values
val_publishers = val_set['publisher_name_encoded'].values
test_publishers = test_set['publisher_name_encoded'].values

train_critics = train_set['critic_name_encoded'].values
val_critics = val_set['critic_name_encoded'].values
test_critics = test_set['critic_name_encoded'].values

train_word_count_scaled = train_set['word_count_scaled'].values
val_word_count_scaled = val_set['word_count_scaled'].values
test_word_count_scaled = test_set['word_count_scaled'].values

train_sentiment_polarity = train_set['sentiment_polarity'].values
val_sentiment_polarity = val_set['sentiment_polarity'].values
test_sentiment_polarity = test_set['sentiment_polarity'].values

# Step 7: Create TensorFlow datasets
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 128

def create_dataset(texts, publishers, critics, word_count, sentiment_polarity, labels, batch_size=128, shuffle=True):
    """Build a batched, prefetched tf.data.Dataset of (features, label) pairs.

    Args:
        texts: Review strings; also determines the shuffle buffer size.
        publishers: Encoded publisher ids, aligned with ``texts``.
        critics: Encoded critic ids, aligned with ``texts``.
        word_count: Scaled word counts, aligned with ``texts``.
        sentiment_polarity: Sentiment polarity values, aligned with ``texts``.
        labels: Regression targets, aligned with ``texts``.
        batch_size: Number of examples per batch.
        shuffle: When True, shuffle with a buffer covering the whole input (seed 42).

    Returns:
        A dataset yielding ``(feature_dict, labels)`` batches, where ``feature_dict``
        has the keys 'review_content', 'publisher_name', 'critic_name',
        'word_count_scaled' and 'sentiment_polarity'.
    """
    # Feature dictionary: each key names one feature of every element
    feature_columns = dict(
        review_content=texts,
        publisher_name=publishers,
        critic_name=critics,
        word_count_scaled=word_count,
        sentiment_polarity=sentiment_polarity,
    )

    # One element per example: (features, label)
    pipeline = tf.data.Dataset.from_tensor_slices((feature_columns, labels))

    # Shuffle before batching so individual examples (not whole batches) are mixed
    if shuffle:
        pipeline = pipeline.shuffle(buffer_size=len(texts), seed=42)

    pipeline = pipeline.batch(batch_size)
    return pipeline.prefetch(buffer_size=tf.data.AUTOTUNE)

# Create TensorFlow datasets (create_dataset already batches and prefetches)
train_ds = create_dataset(train_texts, train_publishers, train_critics, train_word_count_scaled, train_sentiment_polarity, train_labels, batch_size=batch_size, shuffle=True)
val_ds = create_dataset(val_texts, val_publishers, val_critics, val_word_count_scaled, val_sentiment_polarity, val_labels, batch_size=batch_size, shuffle=False)
test_ds = create_dataset(test_texts, test_publishers, test_critics, test_word_count_scaled, test_sentiment_polarity, test_labels, batch_size=batch_size, shuffle=False)

# The training pipeline is deliberately NOT cached: cache() placed after shuffle()
# replays the first epoch's order in every later epoch, which defeats the per-epoch
# reshuffle. The source arrays are already in memory, so nothing is lost.

# Validation and test pipelines are unshuffled, so caching their batches is safe
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Training Set Size: 598262
Validation Set Size: 74783
Test Set Size: 74783

Training Set Distribution by Length and Sentiment:
length_bucket  sentiment_group    
short          neutral_or_moderate    160060
               strong                  15075
medium         neutral_or_moderate    309505
               strong                  11085
long           neutral_or_moderate    100123
               strong                   2414
dtype: int64

Validation Set Distribution by Length and Sentiment:
length_bucket  sentiment_group    
short          neutral_or_moderate    20007
               strong                  1885
medium         neutral_or_moderate    38688
               strong                  1386
long           neutral_or_moderate    12515
               strong                   302
dtype: int64

Test Set Distribution by Length and Sentiment:
length_bucket  sentiment_group    
short          neutral_or_moderate    20008
               strong                  1884
medium         neut

  print(train_set.groupby(['length_bucket', 'sentiment_group']).size())
  print(val_set.groupby(['length_bucket', 'sentiment_group']).size())
  print(test_set.groupby(['length_bucket', 'sentiment_group']).size())


In [None]:
# Recompute the shifted targets (rating - 1) for each split, rounded to two decimals.
# NOTE(review): train_ds / val_ds / test_ds are created in the preceding cell from the
# unrounded arrays, so this reassignment only affects code that reads these variables
# afterwards - the datasets must be rebuilt for the rounding to reach the model.
train_labels, val_labels, test_labels = (
    np.round(subset['rating'].values - 1, 2)
    for subset in (train_set, val_set, test_set)
)

In [None]:
# Pull a single (features, labels) batch from train_ds as a sanity check
batch_contents = next(iter(train_ds.take(1)))

# Show the batch as the cell output
batch_contents

({'review_content': <tf.Tensor: shape=(128,), dtype=string, numpy=
  array([b'Interesting for some extreme choices but more often than not a bit turgid, Red Sparrow plays like a missed opportunity at something more malicious and subversive than it ends up being.',
         b'A confident sidestep from District 9 rather than an ambitious leap forward, Elysium confirms Blomkamp as an expert genre director. Question is now: can he become more?',
         b'This is less a movie than a casting call -- just round up a crowd of pretty young faces, fresh yet still familiar, and call it a job well done.',
         b"'Don't like our pallid, predictable love story between two shallow characters? Don't worry, the monsters will be back.' The trouble is there's not nearly enough of the creatures or any real thrills to keep audiences entertained.",
         b"In the capable hands of director Bryan Singer, who helmed the first two X-Men films, it's a nearly seamless adventure ride despite a plot that a

In [None]:
# Report the smallest and largest target value in the test labels
min_label, max_label = test_labels.min(), test_labels.max()

print(f"Minimum label value: {min_label}")
print(f"Maximum label value: {max_label}")

Minimum label value: 0.0
Maximum label value: 4.0


In [None]:
# Load the Small BERT preprocessing and encoder from TensorFlow Hub
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2", trainable=True)

# Define a function to build the regression model with additional features
def build_bert_regression_model(num_publishers=1620, num_critics=6821, learning_rate=3e-5):
    """Build and compile the multi-input BERT rating regressor.

    Uses the module-level ``bert_preprocess`` and ``bert_encoder`` hub layers. The
    model takes five named inputs ('review_content', 'publisher_name', 'critic_name',
    'word_count_scaled', 'sentiment_polarity'), concatenates BERT's pooled output
    with the two id embeddings and the two scalar features, and regresses a single
    value through one hidden Dense layer.

    Args:
        num_publishers: Number of rows in the publisher embedding table; must be
            greater than the largest encoded publisher id. Defaults to the value
            that was previously hard-coded.
        num_critics: Number of rows in the critic embedding table; must be greater
            than the largest encoded critic id. Defaults to the value that was
            previously hard-coded.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` (MSE loss, MAE metric) with one linear output unit.
    """
    # Input 1: Review Content (text)
    text_input = layers.Input(shape=(), dtype=tf.string, name='review_content')
    preprocessed_text = bert_preprocess(text_input)
    bert_output = bert_encoder(preprocessed_text)
    pooled_output = bert_output['pooled_output']  # Sequence-level representation

    # Input 2: Publisher (encoded as integers, to be embedded)
    publisher_input = layers.Input(shape=(1,), dtype=tf.int32, name='publisher_name')
    publisher_embedding = layers.Embedding(input_dim=num_publishers, output_dim=16)(publisher_input)
    publisher_embedding = layers.Flatten()(publisher_embedding)  # (batch, 1, 16) -> (batch, 16)

    # Input 3: Critic (encoded as integers, to be embedded)
    critic_input = layers.Input(shape=(1,), dtype=tf.int32, name='critic_name')
    critic_embedding = layers.Embedding(input_dim=num_critics, output_dim=32)(critic_input)
    critic_embedding = layers.Flatten()(critic_embedding)  # (batch, 1, 32) -> (batch, 32)

    # Input 4: Word Count (scaled)
    word_count_input = layers.Input(shape=(1,), dtype=tf.float32, name='word_count_scaled')

    # Input 5: Sentiment Polarity
    sentiment_polarity_input = layers.Input(shape=(1,), dtype=tf.float32, name='sentiment_polarity')

    # Concatenate all inputs
    concatenated = layers.Concatenate()([pooled_output, publisher_embedding, critic_embedding, word_count_input, sentiment_polarity_input])

    # Add a dense layer with dropout for regularization
    x = layers.Dense(128, activation='relu')(concatenated)
    x = layers.Dropout(0.3)(x)

    # Output layer for regression (predicting a single value, no activation)
    output = layers.Dense(1, name='output')(x)

    # Define the model
    model = Model(inputs=[text_input, publisher_input, critic_input, word_count_input, sentiment_polarity_input], outputs=output)

    # Compile the model for regression
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])

    return model

# Build and display the model architecture.
# Size the embedding tables from the fitted encoders so every encoded id has a row,
# instead of relying on hand-maintained constants.
regression_model = build_bert_regression_model(
    num_publishers=len(publisher_encoder.classes_),
    num_critics=len(critic_encoder.classes_),
)
regression_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 review_content (InputLayer  [(None,)]                    0         []                            
 )                                                                                                
                                                                                                  
 publisher_name (InputLayer  [(None, 1)]                  0         []                            
 )                                                                                                
                                                                                                  
 critic_name (InputLayer)    [(None, 1)]                  0         []                            
                                                                                            

In [None]:
# Stop when validation MAE has not improved for 3 consecutive epochs
early_stopping = EarlyStopping(monitor='val_mae', patience=3, restore_best_weights=True)

# Keep only the best-scoring model (lowest validation MAE) on Drive
checkpoint = ModelCheckpoint(
    filepath='/content/drive/My Drive/Work Project/Models/best_model_v2.h5',
    monitor='val_mae',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Train for at most 10 epochs with both callbacks attached, pinned to the first GPU
with tf.device('/GPU:0'):
    history = regression_model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=10,
        callbacks=[early_stopping, checkpoint],
    )

Epoch 1/10
Epoch 1: val_mae improved from inf to 0.51212, saving model to /content/drive/My Drive/Work Project/Models/best_model_v2.h5


  saving_api.save_model(


Epoch 2/10
Epoch 2: val_mae improved from 0.51212 to 0.49662, saving model to /content/drive/My Drive/Work Project/Models/best_model_v2.h5
Epoch 3/10
Epoch 3: val_mae improved from 0.49662 to 0.47880, saving model to /content/drive/My Drive/Work Project/Models/best_model_v2.h5
Epoch 4/10
Epoch 4: val_mae improved from 0.47880 to 0.47229, saving model to /content/drive/My Drive/Work Project/Models/best_model_v2.h5
Epoch 5/10
Epoch 5: val_mae did not improve from 0.47229
Epoch 6/10
Epoch 6: val_mae improved from 0.47229 to 0.46475, saving model to /content/drive/My Drive/Work Project/Models/best_model_v2.h5
Epoch 7/10
Epoch 7: val_mae improved from 0.46475 to 0.46281, saving model to /content/drive/My Drive/Work Project/Models/best_model_v2.h5
Epoch 8/10
Epoch 8: val_mae improved from 0.46281 to 0.46096, saving model to /content/drive/My Drive/Work Project/Models/best_model_v2.h5
Epoch 9/10
Epoch 9: val_mae improved from 0.46096 to 0.45862, saving model to /content/drive/My Drive/Work Pr

In [None]:
# After training, save the model to Google Drive with a custom name.
# NOTE(review): this persists whatever weights regression_model holds at this point.
# Presumably those are the best-epoch weights only if EarlyStopping actually stopped
# the run and restored them - confirm, or load the ModelCheckpoint file first.
regression_model.save('/content/drive/My Drive/Work Project/Models/BERT_Cont_Rating_Regressor_MultiFeature')

In [None]:
# Load the model from Google Drive (same path the model was saved to above);
# this rebinds regression_model to the reloaded copy
model_path = '/content/drive/My Drive/Work Project/Models/BERT_Cont_Rating_Regressor_MultiFeature'
regression_model = load_model(model_path)

In [None]:
# Evaluate the model on the test set.
# The two-value unpacking relies on the model reporting exactly one loss and one metric.
test_loss, test_mae = regression_model.evaluate(test_ds)

# Print the test loss (MSE) and MAE
print(f'Test Loss (MSE): {test_loss:.4f}')
print(f'Test Mean Absolute Error (MAE): {test_mae:.4f}')

Test Loss (MSE): 0.3853
Test Mean Absolute Error (MAE): 0.4593


Analyzing Test Set

In [None]:
# Generate predictions for the test set
y_pred_test_probs = regression_model.predict(test_ds)  # Raw regression outputs (not probabilities, despite the variable name)
y_pred_test = y_pred_test_probs.flatten()  # Flatten the predictions to 1-D



In [None]:
# Extract true values for the test set, in the order test_ds yields them
y_true_test = np.concatenate([labels.numpy() for _, labels in test_ds])

# Align the test set with sentiment_group and length_bucket.
# test_ds is built unshuffled from test_set, so prediction i belongs to test_set row i.
# Selecting through df.index[df.index.isin(test_set.index)] would return the rows in
# df order rather than test_set's shuffled order and attach the wrong group to each
# prediction, so the metadata is read straight from test_set.
test_indices = test_set.index

# Refuse to build the table from mismatched arrays instead of truncating silently
if not (len(y_true_test) == len(y_pred_test) == len(test_set)):
    raise ValueError(
        f"Length mismatch: y_true={len(y_true_test)}, y_pred={len(y_pred_test)}, rows={len(test_set)}"
    )

# Create the DataFrame for test set analysis
results_test_df = pd.DataFrame({
    'y_true': y_true_test,
    'y_pred': y_pred_test,
    'length_bucket': test_set['length_bucket'].values,
    'sentiment_group': test_set['sentiment_group'].values
})

# Calculate MAE by review length for test set
mae_by_length_test = results_test_df.groupby('length_bucket', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Calculate MAE by sentiment polarity group for test set
mae_by_sentiment_test = results_test_df.groupby('sentiment_group', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Calculate MAE by both length bucket and sentiment group for test set
mae_by_length_and_sentiment_test = results_test_df.groupby(['length_bucket', 'sentiment_group'], observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Print the results
print("MAE by Review Length (Test Set):")
print(mae_by_length_test)

print("\nMAE by Sentiment Group (Test Set):")
print(mae_by_sentiment_test)

print("\nMAE by Review Length and Sentiment Group (Test Set):")
print(mae_by_length_and_sentiment_test)

MAE by Review Length (Test Set):
length_bucket
short     0.460541
medium    0.457861
long      0.461717
dtype: float64

MAE by Sentiment Group (Test Set):
sentiment_group
neutral_or_moderate    0.459306
strong                 0.459307
dtype: float64

MAE by Review Length and Sentiment Group (Test Set):
length_bucket  sentiment_group    
short          neutral_or_moderate    0.459744
               strong                 0.468997
medium         neutral_or_moderate    0.458175
               strong                 0.449076
long           neutral_or_moderate    0.462100
               strong                 0.445764
dtype: float64


Analyzing Validation Set

In [None]:
# Generate predictions for the validation set
y_pred_val_probs = regression_model.predict(val_ds)  # Raw regression outputs (not probabilities, despite the variable name)
y_pred_val = y_pred_val_probs.flatten()  # Flatten the predictions to 1-D



In [None]:
# Extract true values for the validation set, in the order val_ds yields them
y_true_val = np.concatenate([labels.numpy() for _, labels in val_ds])

# Align the validation set with sentiment_group and length_bucket.
# val_ds is built unshuffled from val_set, so prediction i belongs to val_set row i.
# Selecting through df.index[df.index.isin(val_set.index)] would return the rows in
# df order rather than val_set's shuffled order and attach the wrong group to each
# prediction, so the metadata is read straight from val_set.
val_indices = val_set.index

# Refuse to build the table from mismatched arrays instead of truncating silently
if not (len(y_true_val) == len(y_pred_val) == len(val_set)):
    raise ValueError(
        f"Length mismatch: y_true={len(y_true_val)}, y_pred={len(y_pred_val)}, rows={len(val_set)}"
    )

# Create the DataFrame for validation set analysis
results_val_df = pd.DataFrame({
    'y_true': y_true_val,
    'y_pred': y_pred_val,
    'length_bucket': val_set['length_bucket'].values,
    'sentiment_group': val_set['sentiment_group'].values
})

# Calculate MAE by review length for validation set
mae_by_length_val = results_val_df.groupby('length_bucket', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Calculate MAE by sentiment polarity group for validation set
mae_by_sentiment_val = results_val_df.groupby('sentiment_group', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Calculate MAE by both length bucket and sentiment group for validation set
mae_by_length_and_sentiment_val = results_val_df.groupby(['length_bucket', 'sentiment_group'], observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Print the results
print("MAE by Review Length (Validation Set):")
print(mae_by_length_val)

print("\nMAE by Sentiment Group (Validation Set):")
print(mae_by_sentiment_val)

print("\nMAE by Review Length and Sentiment Group (Validation Set):")
print(mae_by_length_and_sentiment_val)

MAE by Review Length (Validation Set):
length_bucket
short     0.453837
medium    0.457553
long      0.457545
dtype: float64

MAE by Sentiment Group (Validation Set):
sentiment_group
neutral_or_moderate    0.456727
strong                 0.451213
dtype: float64

MAE by Review Length and Sentiment Group (Validation Set):
length_bucket  sentiment_group    
short          neutral_or_moderate    0.454250
               strong                 0.449449
medium         neutral_or_moderate    0.457366
               strong                 0.462764
long           neutral_or_moderate    0.458712
               strong                 0.409211
dtype: float64


# 6 BERT Base Multi-Feature

In [None]:
# Load the BERT Base preprocessing and encoder from TensorFlow Hub
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3", trainable=True)

# Define a function to build the regression model with additional features
def build_bert_regression_model(num_publishers=1620, num_critics=6821, learning_rate=3e-5):
    """Build and compile the multi-input BERT rating regressor.

    Uses the module-level ``bert_preprocess`` and ``bert_encoder`` hub layers. The
    model takes five named inputs ('review_content', 'publisher_name', 'critic_name',
    'word_count_scaled', 'sentiment_polarity'), concatenates BERT's pooled output
    with the two id embeddings and the two scalar features, and regresses a single
    value through one hidden Dense layer.

    Args:
        num_publishers: Number of rows in the publisher embedding table; must be
            greater than the largest encoded publisher id. Defaults to the value
            that was previously hard-coded.
        num_critics: Number of rows in the critic embedding table; must be greater
            than the largest encoded critic id. Defaults to the value that was
            previously hard-coded.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled Keras ``Model`` (MSE loss, MAE metric) with one linear output unit.
    """
    # Input 1: Review Content (text)
    text_input = layers.Input(shape=(), dtype=tf.string, name='review_content')
    preprocessed_text = bert_preprocess(text_input)
    bert_output = bert_encoder(preprocessed_text)
    pooled_output = bert_output['pooled_output']  # Sequence-level representation

    # Input 2: Publisher (encoded as integers, to be embedded)
    publisher_input = layers.Input(shape=(1,), dtype=tf.int32, name='publisher_name')
    publisher_embedding = layers.Embedding(input_dim=num_publishers, output_dim=16)(publisher_input)
    publisher_embedding = layers.Flatten()(publisher_embedding)  # (batch, 1, 16) -> (batch, 16)

    # Input 3: Critic (encoded as integers, to be embedded)
    critic_input = layers.Input(shape=(1,), dtype=tf.int32, name='critic_name')
    critic_embedding = layers.Embedding(input_dim=num_critics, output_dim=32)(critic_input)
    critic_embedding = layers.Flatten()(critic_embedding)  # (batch, 1, 32) -> (batch, 32)

    # Input 4: Word Count (scaled)
    word_count_input = layers.Input(shape=(1,), dtype=tf.float32, name='word_count_scaled')

    # Input 5: Sentiment Polarity
    sentiment_polarity_input = layers.Input(shape=(1,), dtype=tf.float32, name='sentiment_polarity')

    # Concatenate all inputs
    concatenated = layers.Concatenate()([pooled_output, publisher_embedding, critic_embedding, word_count_input, sentiment_polarity_input])

    # Add a dense layer with dropout for regularization
    x = layers.Dense(128, activation='relu')(concatenated)
    x = layers.Dropout(0.3)(x)

    # Output layer for regression (predicting a single value, no activation)
    output = layers.Dense(1, name='output')(x)

    # Define the model
    model = Model(inputs=[text_input, publisher_input, critic_input, word_count_input, sentiment_polarity_input], outputs=output)

    # Compile the model for regression
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mae'])

    return model

# Build and display the model architecture.
# Size the embedding tables from the fitted encoders so every encoded id has a row,
# instead of relying on hand-maintained constants.
regression_model = build_bert_regression_model(
    num_publishers=len(publisher_encoder.classes_),
    num_critics=len(critic_encoder.classes_),
)
regression_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 review_content (InputLayer  [(None,)]                    0         []                            
 )                                                                                                
                                                                                                  
 publisher_name (InputLayer  [(None, 1)]                  0         []                            
 )                                                                                                
                                                                                                  
 critic_name (InputLayer)    [(None, 1)]                  0         []                            
                                                                                              

In [None]:
# Stop when validation MAE has not improved for 3 consecutive epochs
early_stopping = EarlyStopping(monitor='val_mae', patience=3, restore_best_weights=True)

# Keep only the best-scoring model (lowest validation MAE) on Drive
checkpoint = ModelCheckpoint(
    filepath='/content/drive/My Drive/Work Project/Models/best_model_v3.h5',
    monitor='val_mae',
    mode='min',
    save_best_only=True,
    verbose=1,
)

# Train for at most 10 epochs with both callbacks attached, pinned to the first GPU
with tf.device('/GPU:0'):
    history = regression_model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=10,
        callbacks=[early_stopping, checkpoint],
    )

Epoch 1/10
Epoch 1: val_mae improved from inf to 0.45461, saving model to /content/drive/My Drive/Work Project/Models/best_model_v3.h5


  saving_api.save_model(


Epoch 2/10
Epoch 2: val_mae improved from 0.45461 to 0.43490, saving model to /content/drive/My Drive/Work Project/Models/best_model_v3.h5
Epoch 3/10
Epoch 3: val_mae improved from 0.43490 to 0.42992, saving model to /content/drive/My Drive/Work Project/Models/best_model_v3.h5
Epoch 4/10
Epoch 4: val_mae improved from 0.42992 to 0.42387, saving model to /content/drive/My Drive/Work Project/Models/best_model_v3.h5
Epoch 5/10
Epoch 5: val_mae did not improve from 0.42387
Epoch 6/10
Epoch 6: val_mae did not improve from 0.42387
Epoch 7/10
Epoch 7: val_mae did not improve from 0.42387


In [None]:
# Bind `best_model` explicitly so this section runs on a fresh kernel: the name is
# used by every following cell but is not assigned anywhere in this section.
# regression_model is the BERT Base model trained above; when EarlyStopping halts the
# run with restore_best_weights=True it puts the best-epoch weights back on the model.
best_model = regression_model

# Evaluate the model on the test set to check performance
test_loss, test_mae = best_model.evaluate(test_ds)
print(f"Test MAE: {test_mae}")

Test MAE: 0.42771288752555847


In [None]:
# Persist the evaluated BERT Base model to Google Drive under a descriptive name
best_model.save('/content/drive/My Drive/Work Project/Models/BERTBase_Cont_Rating_Regressor_MultiFeature')

Analyzing Test Set

In [None]:
# Generate predictions for the test set
y_pred_test_probs = best_model.predict(test_ds)  # Raw regression outputs (not probabilities, despite the variable name)
y_pred_test = y_pred_test_probs.flatten()  # Flatten the predictions to 1-D



In [None]:
# Extract true values for the test set, in the order test_ds yields them
y_true_test = np.concatenate([labels.numpy() for _, labels in test_ds])

# Align the test set with sentiment_group and length_bucket.
# test_ds is built unshuffled from test_set, so prediction i belongs to test_set row i.
# Selecting through df.index[df.index.isin(test_set.index)] would return the rows in
# df order rather than test_set's shuffled order and attach the wrong group to each
# prediction, so the metadata is read straight from test_set.
test_indices = test_set.index

# Refuse to build the table from mismatched arrays instead of truncating silently
if not (len(y_true_test) == len(y_pred_test) == len(test_set)):
    raise ValueError(
        f"Length mismatch: y_true={len(y_true_test)}, y_pred={len(y_pred_test)}, rows={len(test_set)}"
    )

# Create the DataFrame for test set analysis
results_test_df = pd.DataFrame({
    'y_true': y_true_test,
    'y_pred': y_pred_test,
    'length_bucket': test_set['length_bucket'].values,
    'sentiment_group': test_set['sentiment_group'].values
})

# Calculate MAE by review length for test set
mae_by_length_test = results_test_df.groupby('length_bucket', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Calculate MAE by sentiment polarity group for test set
mae_by_sentiment_test = results_test_df.groupby('sentiment_group', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Calculate MAE by both length bucket and sentiment group for test set
mae_by_length_and_sentiment_test = results_test_df.groupby(['length_bucket', 'sentiment_group'], observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Print the results
print("MAE by Review Length (Test Set):")
print(mae_by_length_test)

print("\nMAE by Sentiment Group (Test Set):")
print(mae_by_sentiment_test)

print("\nMAE by Review Length and Sentiment Group (Test Set):")
print(mae_by_length_and_sentiment_test)

MAE by Review Length (Test Set):
length_bucket
short     0.430585
medium    0.427493
long      0.423497
dtype: float64

MAE by Sentiment Group (Test Set):
sentiment_group
neutral_or_moderate    0.427497
strong                 0.432013
dtype: float64

MAE by Review Length and Sentiment Group (Test Set):
length_bucket  sentiment_group    
short          neutral_or_moderate    0.428913
               strong                 0.448339
medium         neutral_or_moderate    0.427901
               strong                 0.416090
long           neutral_or_moderate    0.423986
               strong                 0.403149
dtype: float64


Analyzing Validation Set

In [None]:
# Generate predictions for the validation set
y_pred_val_probs = best_model.predict(val_ds)  # Raw regression outputs (not probabilities, despite the variable name)
y_pred_val = y_pred_val_probs.flatten()  # Flatten the predictions to 1-D



In [None]:
# Extract true values for the validation set, in the order val_ds yields them
y_true_val = np.concatenate([labels.numpy() for _, labels in val_ds])

# Align the validation set with sentiment_group and length_bucket.
# val_ds is built unshuffled from val_set, so prediction i belongs to val_set row i.
# Selecting through df.index[df.index.isin(val_set.index)] would return the rows in
# df order rather than val_set's shuffled order and attach the wrong group to each
# prediction, so the metadata is read straight from val_set.
val_indices = val_set.index

# Refuse to build the table from mismatched arrays instead of truncating silently
if not (len(y_true_val) == len(y_pred_val) == len(val_set)):
    raise ValueError(
        f"Length mismatch: y_true={len(y_true_val)}, y_pred={len(y_pred_val)}, rows={len(val_set)}"
    )

# Create the DataFrame for validation set analysis
results_val_df = pd.DataFrame({
    'y_true': y_true_val,
    'y_pred': y_pred_val,
    'length_bucket': val_set['length_bucket'].values,
    'sentiment_group': val_set['sentiment_group'].values
})

# Calculate MAE by review length for validation set
mae_by_length_val = results_val_df.groupby('length_bucket', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Calculate MAE by sentiment polarity group for validation set
mae_by_sentiment_val = results_val_df.groupby('sentiment_group', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Calculate MAE by both length bucket and sentiment group for validation set
mae_by_length_and_sentiment_val = results_val_df.groupby(['length_bucket', 'sentiment_group'], observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Print the results
print("MAE by Review Length (Validation Set):")
print(mae_by_length_val)

print("\nMAE by Sentiment Group (Validation Set):")
print(mae_by_sentiment_val)

print("\nMAE by Review Length and Sentiment Group (Validation Set):")
print(mae_by_length_and_sentiment_val)

MAE by Review Length (Validation Set):
length_bucket
short     0.422864
medium    0.423523
long      0.426646
dtype: float64

MAE by Sentiment Group (Validation Set):
sentiment_group
neutral_or_moderate    0.423794
strong                 0.425279
dtype: float64

MAE by Review Length and Sentiment Group (Validation Set):
length_bucket  sentiment_group    
short          neutral_or_moderate    0.423161
               strong                 0.419713
medium         neutral_or_moderate    0.423120
               strong                 0.434754
long           neutral_or_moderate    0.426890
               strong                 0.416535
dtype: float64


In [None]:
# Assuming you have val_ds, and test_ds
# Chain the two held-out splits into one dataset: test batches come first, then validation batches
full_dataset = test_ds.concatenate(val_ds)

# Get predictions on the full dataset (raw regression outputs, flattened to 1-D)
y_pred_full_probs = best_model.predict(full_dataset)
y_pred_full = y_pred_full_probs.flatten()



In [None]:
# Step 1: Extract true values from the full dataset (test + validation), in yield order
y_true_full = np.concatenate([labels.numpy() for _, labels in full_dataset])

# Step 2: Align the full dataset with sentiment_group and length_bucket.
# full_dataset is test_ds followed by val_ds, each built unshuffled from its split,
# so the matching metadata is test_set's rows followed by val_set's rows. Selecting
# through df.index.isin(...) would return the rows in df order and attach the wrong
# group to each prediction.
full_set = pd.concat([test_set, val_set])
full_indices = full_set.index

# Refuse to build the table from mismatched arrays instead of truncating silently
if not (len(y_true_full) == len(y_pred_full) == len(full_set)):
    raise ValueError(
        f"Length mismatch: y_true={len(y_true_full)}, y_pred={len(y_pred_full)}, rows={len(full_set)}"
    )

# Step 3: Create DataFrame for full dataset analysis
results_full_df = pd.DataFrame({
    'y_true': y_true_full,
    'y_pred': y_pred_full,
    'length_bucket': full_set['length_bucket'].values,
    'sentiment_group': full_set['sentiment_group'].values
})

# Step 4: Calculate MAE by sentiment group for the full set
# (observed / include_groups are passed explicitly, matching the per-split cells and
# avoiding the pandas deprecation warnings the implicit defaults raise)
mae_by_sentiment_full = results_full_df.groupby('sentiment_group', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Print the results for MAE by sentiment group
print("MAE by Sentiment Group (Full Dataset - Test + Validation):")
print(mae_by_sentiment_full)

# Step 5: Calculate MAE by review length for the full dataset
mae_by_length_full = results_full_df.groupby('length_bucket', observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Print the results for MAE by review length
print("\nMAE by Review Length (Full Dataset - Test + Validation):")
print(mae_by_length_full)

# Step 6: Calculate MAE by both review length and sentiment group for the full dataset
mae_by_length_and_sentiment_full = results_full_df.groupby(['length_bucket', 'sentiment_group'], observed=True).apply(
    lambda group: mean_absolute_error(group['y_true'], group['y_pred']),
    include_groups=False
)

# Print the results for MAE by both review length and sentiment group
print("\nMAE by Review Length and Sentiment Group (Full Dataset - Test + Validation):")
print(mae_by_length_and_sentiment_full)

MAE by Sentiment Group (Full Dataset - Test + Validation):
sentiment_group
neutral_or_moderate    0.425866
strong                 0.424252
dtype: float64

MAE by Review Length (Full Dataset - Test + Validation):
length_bucket
short     0.423862
medium    0.425902
long      0.428727
dtype: float64

MAE by Review Length and Sentiment Group (Full Dataset - Test + Validation):
length_bucket  sentiment_group    
short          neutral_or_moderate    0.423269
               strong                 0.430152
medium         neutral_or_moderate    0.426149
               strong                 0.419024
long           neutral_or_moderate    0.429144
               strong                 0.411409
dtype: float64


  mae_by_sentiment_full = results_full_df.groupby('sentiment_group').apply(
  mae_by_length_full = results_full_df.groupby('length_bucket').apply(
  mae_by_length_full = results_full_df.groupby('length_bucket').apply(
  mae_by_length_and_sentiment_full = results_full_df.groupby(['length_bucket', 'sentiment_group']).apply(
  mae_by_length_and_sentiment_full = results_full_df.groupby(['length_bucket', 'sentiment_group']).apply(
