In [1]:
from pathlib import Path

import joblib
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Path of the joblib file holding the processed reviews DataFrame.
fpath = 'Data-NLP/processed_data.joblib'
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
df = joblib.load(fpath)

# Print column dtypes / non-null counts, then preview the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8650 entries, 0 to 8649
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   review_id       8650 non-null   object 
 1   movie_id        8650 non-null   int64  
 2   imdb_id         8650 non-null   object 
 3   original_title  8650 non-null   object 
 4   review          8650 non-null   object 
 5   rating          7454 non-null   float64
 6   target          8650 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 473.2+ KB


Unnamed: 0,review_id,movie_id,imdb_id,original_title,review,rating,target
0,64ecc16e83901800af821d50,843,tt0118694,花樣年華,This is a fine piece of cinema from Wong Kar-W...,7.0,exclude
1,57086ff5c3a3681d29001512,7443,tt0120630,Chicken Run,"A guilty pleasure for me personally, as I love...",9.0,high-rating
2,5bb5ac829251410dcb00810c,7443,tt0120630,Chicken Run,Made my roommate who hates stop-motion animati...,6.0,exclude
3,5f0c53a013a32000357ec505,7443,tt0120630,Chicken Run,A very good stop-motion animation!\r\n\r\n<em>...,8.0,exclude
4,64ecc027594c9400ffe77c91,7443,tt0120630,Chicken Run,"Ok, there is an huge temptation to riddle this...",7.0,exclude


In [3]:
# Count reviews with no rating (they are dropped in the following cell)
df['rating'].isna().sum()

1196

In [4]:
# Keep only the reviews that carry a rating. Rebinding `df` yields the same
# frame as an in-place drop, without mutating the loaded object.
df = df.dropna(subset=['rating'])

In [5]:
# Sanity check: no missing ratings should remain after the drop
df['rating'].isna().sum()

0

In [6]:
# Features: the raw review text. Labels: the string `target` column.
X = df['review']  
# NOTE(review): `target` still contains the 'exclude' label, so it is modelled as
# a class of its own — confirm these rows were not meant to be filtered out first.
y = df['target']

In [7]:
# Initialize the TfidfVectorizer
# NOTE(review): `vectorizer` is not referenced by any later cell shown here; the
# pipelines below construct their own TfidfVectorizer instances.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))

In [8]:
def _tfidf_pipeline(classifier):
    """Return a pipeline: TF-IDF (English stop words, 1- and 2-grams) -> `classifier`."""
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
        ('classifier', classifier),
    ])


# Hold out 30% of the reviews as the test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Two baselines that share the same text features and differ only in the classifier
pipeline_nb = _tfidf_pipeline(MultinomialNB())
pipeline_lr = _tfidf_pipeline(LogisticRegression(max_iter=1000))


In [9]:
# Fit each baseline on the training split and report per-class metrics on the
# test split. `classification_report` is imported with the other imports at the
# top of the notebook.
#
# zero_division=0 reports 0.0 for a class that is never predicted (the same
# value the default setting produces) without emitting an
# UndefinedMetricWarning for every such class.

# Fit the MultinomialNB pipeline on the training data
pipeline_nb.fit(X_train, y_train)

# Predict on the test data
y_pred_nb = pipeline_nb.predict(X_test)

# Evaluate the MultinomialNB model
print("MultinomialNB Model Performance:")
print(classification_report(y_test, y_pred_nb, zero_division=0))

# Fit the LogisticRegression pipeline on the training data
pipeline_lr.fit(X_train, y_train)

# Predict on the test data
y_pred_lr = pipeline_lr.predict(X_test)

# Evaluate the LogisticRegression model
print("LogisticRegression Model Performance:")
print(classification_report(y_test, y_pred_lr, zero_division=0))

MultinomialNB Model Performance:
              precision    recall  f1-score   support

     exclude       0.68      1.00      0.81      1522
 high-rating       0.00      0.00      0.00       365
  low-rating       1.00      0.01      0.01       350

    accuracy                           0.68      2237
   macro avg       0.56      0.34      0.27      2237
weighted avg       0.62      0.68      0.55      2237



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


LogisticRegression Model Performance:
              precision    recall  f1-score   support

     exclude       0.71      1.00      0.83      1522
 high-rating       0.83      0.07      0.13       365
  low-rating       0.97      0.17      0.29       350

    accuracy                           0.72      2237
   macro avg       0.84      0.41      0.41      2237
weighted avg       0.77      0.72      0.63      2237



It appears that the logistic regression model is performing better than the MultinomialNB model. Next, I will run a grid search to try to optimize it.

In [10]:
# Tunable pipeline: TF-IDF features feeding a logistic-regression classifier.
# (CountVectorizer could be swapped in for the 'vectorizer' step.)
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Search space over the vectorizer settings: 2 * 2 * 3 * 3 = 36 combinations.
param_grid = dict(
    # Keep all words, or remove English stop words
    vectorizer__stop_words=[None, 'english'],
    # (1, 1) = unigrams only, (1, 2) = unigrams plus bigrams
    vectorizer__ngram_range=[(1, 1), (1, 2)],
    # Minimum document frequency (absolute document counts)
    vectorizer__min_df=[1, 2, 5],
    # Maximum document frequency (fraction of documents)
    vectorizer__max_df=[0.9, 0.95, 1.0],
)

In [11]:
# Exhaustive search over `param_grid`, using GridSearchCV's default
# cross-validation and scoring; n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [12]:
# Display the best parameters
print("Best parameters set:")
print(grid_search.best_params_)

# Predict with the best found model
y_pred = grid_search.predict(X_test)

# Evaluate the model on the held-out test split.
# `classification_report` comes from the import block at the top of the
# notebook; zero_division=0 matches the baseline evaluation cell and avoids
# warnings if a class is never predicted.
print("Classification report:")
print(classification_report(y_test, y_pred, zero_division=0))

Best parameters set:
{'vectorizer__max_df': 0.9, 'vectorizer__min_df': 5, 'vectorizer__ngram_range': (1, 1), 'vectorizer__stop_words': None}
Classification report:
              precision    recall  f1-score   support

     exclude       0.74      0.97      0.84      1522
 high-rating       0.72      0.21      0.33       365
  low-rating       0.88      0.29      0.44       350

    accuracy                           0.74      2237
   macro avg       0.78      0.49      0.54      2237
weighted avg       0.76      0.74      0.69      2237



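The model has an overall accuracy of 74%, which means it correctly predicts the rating category 74% of the time across the test dataset. For context, always predicting the majority "exclude" class would already score about 68% (1522 of 2237 test reviews), so the gain over that baseline is modest.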
For the "exclude" category, the model shows a high precision and recall, leading to an F1-score of 0.84, indicating strong performance in identifying this class.
The "high-rating" and "low-rating" categories, however, have lower F1-scores (0.33 and 0.44, respectively), suggesting that the model struggles more with these classifications. The low recall values indicate that the model misses a significant portion of these classes, whereas the precision values suggest the predictions it does make are reasonably reliable (more so for "low-rating").

## RNN (NLP Analysis)

In [13]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit label encoder and return encoded labels
y_integers = label_encoder.fit_transform(y)

# Convert the integer encoded labels to one-hot encodings
y_onehot = tf.keras.utils.to_categorical(y_integers)

In [14]:
def _as_batches(texts, labels, batch_size=32):
    """Wrap a (texts, labels) pair in a tf.data.Dataset split into batches."""
    return tf.data.Dataset.from_tensor_slices((texts, labels)).batch(batch_size)


# 70% train; the remaining 30% is halved into validation and test.
# These assignments rebind X_train / X_test / y_train / y_test from the
# scikit-learn cells above; the labels are now one-hot rows.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_onehot, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Create TensorFlow datasets
train_dataset = _as_batches(X_train, y_train)
val_dataset = _as_batches(X_val, y_val)
test_dataset = _as_batches(X_test, y_test)

In [15]:
# Create a TextVectorization layer
max_features = 10000  # Size of the vocabulary
sequence_length = 250  # Maximum length of the sequence

# Maps raw strings to integer token ids, capped at `max_features` tokens, with
# every output sequence sized to `sequence_length`.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

# Fit the TextVectorization layer on the training texts
# (training split only, so validation/test text does not shape the vocabulary)
vectorize_layer.adapt(np.array(X_train))

In [16]:
# Report how many tokens the adapted layer holds.
# NOTE(review): `vocab_size` is not used by the model cell below, which sizes
# the embedding from `max_features` instead.
vocab_size = len(vectorize_layer.get_vocabulary())
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 10000


In [17]:
# RNN classifier: raw text -> token ids -> embeddings -> LSTM -> softmax over classes.
# NOTE(review): the vocabulary size printed above equals `max_features`, so
# `max_features + 1` allocates one embedding row more than the vocabulary needs —
# harmless, but confirm it is intended.
model = models.Sequential([
    layers.InputLayer(input_shape=(1,), dtype=tf.string),
    vectorize_layer,
    layers.Embedding(input_dim=max_features + 1, output_dim=64, mask_zero=True),
    layers.LSTM(64),
    layers.Dense(units=y_onehot.shape[1], activation='softmax'),
])

# One-hot labels, hence categorical (not sparse) cross-entropy
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 250)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 250, 64)           640064    
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 3)                 195       
                                                                 
Total params: 673,283
Trainable params: 673,283
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train for 5 epochs on the training batches, evaluating on the validation
# batches as well; the return value of fit() is kept in `history`.
history = model.fit(train_dataset, validation_data=val_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# Score the model on the same batches it was trained on; only meaningful when
# compared with the test score to gauge overfitting.
train_loss, train_accuracy = model.evaluate(train_dataset)
print(f"Training Accuracy: {train_accuracy}")

Training Accuracy: 0.9664558172225952


In [20]:
# Score the model on the held-out test batches, which were not passed to fit().
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.6005361676216125


After fitting and evaluating the RNN model, the results show a significant discrepancy between the training and test accuracies. The training accuracy is impressively high at approximately 96.65%, while the test accuracy is notably lower at around 60.05%, indicating likely overfitting to the training data.

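The RNN model shows a high degree of accuracy on the training set, which might surpass the training performance of the ML models (their training accuracy was not measured here). However, the true measure of a model's effectiveness is its performance on unseen data (the test set), where the RNN model underperforms: it reaches roughly 60% test accuracy, compared with 74% for the tuned logistic regression model. The ML models therefore appear to generalize better, with the caveat that the two test sets are not identical (the RNN's test set is half of the 30% hold-out used for the ML models), so the comparison is approximate.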

# Save Model and Data

In [21]:
# Persist the trained models and the data needed to evaluate them later.
# `joblib` and `Path` come from the import block at the top of the notebook.

# Make sure the output folders exist so the dumps below do not fail with
# FileNotFoundError on a fresh checkout.
for out_dir in ('models', 'data'):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Save the models
# NOTE(review): this persists the baseline `pipeline_lr`, not the tuned
# `grid_search.best_estimator_` — confirm which one downstream code expects.
joblib.dump(pipeline_lr, 'models/ml_model.joblib')
model.save('models/nlp_model', save_format='tf')

# X_train / y_train / X_test / y_test are the splits created in the RNN section
# (one-hot labels); the fitted label encoder is saved alongside so the class
# names can be recovered.
joblib.dump((X_train, y_train), 'data/training_data.joblib')
joblib.dump((X_test, y_test), 'data/test_data.joblib')
joblib.dump(label_encoder, 'data/label_encoder.joblib')



INFO:tensorflow:Assets written to: models/nlp_model\assets


INFO:tensorflow:Assets written to: models/nlp_model\assets


['data/label_encoder.joblib']