<a href="https://www.kaggle.com/code/awiksshiithnarang/movie-review-sentiment-analysis?scriptVersionId=136627159" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

**Importing required libraries:**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds
import tensorflow as tf
import logging

# Raise the "tensorflow" logger's threshold to ERROR so lower-severity
# messages from that logger are not shown in the cell outputs.
logging.getLogger( "tensorflow" ).setLevel( logging.ERROR )

**Loading the required dataset and preparing the data:**

In [None]:
# Read the review CSV once, then pull the "text" and "label" columns out as
# plain Python lists (in that order) for the tokenization step below.
dataset = pd.read_csv( "/kaggle/input/imdb-movie-ratings-sentiment-analysis/movie.csv" )
reviews, sentiments = ( dataset[ column ].tolist() for column in ( "text", "label" ) )

**Subword tokenization, padding the reviews, and splitting the data:**

In [None]:
# Hyperparameters shared by the tokenizer and every model defined below.
target_vocab_size = 500  # size requested from the subword encoder (a target, not a guarantee)
embedding_size = 16
max_length = 50

# Learn a subword vocabulary from the raw review texts.
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus( reviews, target_vocab_size, max_subword_length = 5 )

# Size the Embedding layers from the vocabulary that was actually built: the
# encoder only aims for the requested size, and an id outside the Embedding's
# input_dim is an invalid lookup.
vocab_size = tokenizer.vocab_size

# Encode into a new list instead of overwriting `reviews`, so the raw texts
# survive and this cell can be re-run without feeding token ids back into the
# tokenizer.
encoded_reviews = [ tokenizer.encode( review ) for review in reviews ]

# Pad / truncate every review at the end to exactly max_length token ids.
padded_reviews = tf.keras.preprocessing.sequence.pad_sequences( encoded_reviews, maxlen = max_length, padding = "post", truncating = "post" )

# First 80 % of the rows are used for training, the rest for validation.
# NOTE(review): the split is positional (no shuffle) -- confirm the CSV rows
# are not ordered by label.
training_size = int( len( reviews ) * 0.8 )
training_reviews = padded_reviews[ : training_size ]
validation_reviews = padded_reviews[ training_size : ]
training_sentiments = np.array( sentiments[ : training_size ] )
validation_sentiments = np.array( sentiments[ training_size : ] )

**Creating, compiling, and training different models:**

In [None]:
# Baseline model: embedding -> dropout -> average over the sequence -> small dense head.
# Reset Keras' global state *before* building, so the new model's layers are
# created in the fresh session rather than clearing state after the fact.
tf.keras.backend.clear_session()
plain_model = tf.keras.Sequential( [
    tf.keras.layers.Embedding( vocab_size, embedding_size, input_length = max_length ),
    tf.keras.layers.Dropout( 0.5 ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense( 6, activation = "relu" ),
    tf.keras.layers.Dense( 1, activation = "sigmoid" ),
] )
plain_model.compile( loss = "binary_crossentropy", optimizer = "adam", metrics = [ "accuracy" ] )
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
plain_model.summary()

# Train for up to `epochs` epochs; EarlyStopping ends the run once its
# monitored metric has not improved for 50 consecutive epochs.
epochs = 100
early_stopping = tf.keras.callbacks.EarlyStopping( patience = 50 )
plain_history = plain_model.fit( training_reviews, training_sentiments, epochs = epochs, validation_data = ( validation_reviews, validation_sentiments ), callbacks = [ early_stopping ] )

In [None]:
# Convolutional model: embedding -> Conv1D (128 filters, width 5) -> dropout
# -> max over the sequence -> small dense head.
# Reset Keras' global state *before* building, so the new model's layers are
# created in the fresh session rather than clearing state after the fact.
tf.keras.backend.clear_session()
cnn_model = tf.keras.Sequential( [
    tf.keras.layers.Embedding( vocab_size, embedding_size, input_length = max_length ),
    tf.keras.layers.Conv1D( 128, 5, activation = "relu" ),
    tf.keras.layers.Dropout( 0.5 ),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense( 6, activation = "relu" ),
    tf.keras.layers.Dense( 1, activation = "sigmoid" ),
] )
cnn_model.compile( loss = "binary_crossentropy", optimizer = "adam", metrics = [ "accuracy" ] )
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
cnn_model.summary()

# Train for up to `epochs` epochs; EarlyStopping ends the run once its
# monitored metric has not improved for 50 consecutive epochs.
epochs = 100
early_stopping = tf.keras.callbacks.EarlyStopping( patience = 50 )
cnn_history = cnn_model.fit( training_reviews, training_sentiments, epochs = epochs, validation_data = ( validation_reviews, validation_sentiments ), callbacks = [ early_stopping ] )

In [None]:
# Recurrent model: embedding -> bidirectional GRU -> dropout -> small dense head.
# Reset Keras' global state *before* building, so the new model's layers are
# created in the fresh session rather than clearing state after the fact.
tf.keras.backend.clear_session()
gru_model = tf.keras.Sequential( [
    tf.keras.layers.Embedding( vocab_size, embedding_size, input_length = max_length ),
    tf.keras.layers.Bidirectional( tf.keras.layers.GRU( embedding_size ) ),
    tf.keras.layers.Dropout( 0.5 ),
    tf.keras.layers.Dense( 6, activation = "relu" ),
    tf.keras.layers.Dense( 1, activation = "sigmoid" ),
] )
gru_model.compile( loss = "binary_crossentropy", optimizer = "adam", metrics = [ "accuracy" ] )
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
gru_model.summary()

# Train for up to `epochs` epochs; EarlyStopping ends the run once its
# monitored metric has not improved for 50 consecutive epochs.
epochs = 100
early_stopping = tf.keras.callbacks.EarlyStopping( patience = 50 )
gru_history = gru_model.fit( training_reviews, training_sentiments, epochs = epochs, validation_data = ( validation_reviews, validation_sentiments ), callbacks = [ early_stopping ] )

In [None]:
# Recurrent model: embedding -> bidirectional LSTM -> dropout -> small dense head.
# Reset Keras' global state *before* building, so the new model's layers are
# created in the fresh session rather than clearing state after the fact.
tf.keras.backend.clear_session()
lstm_model = tf.keras.Sequential( [
    tf.keras.layers.Embedding( vocab_size, embedding_size, input_length = max_length ),
    tf.keras.layers.Bidirectional( tf.keras.layers.LSTM( embedding_size ) ),
    tf.keras.layers.Dropout( 0.5 ),
    tf.keras.layers.Dense( 6, activation = "relu" ),
    tf.keras.layers.Dense( 1, activation = "sigmoid" ),
] )
lstm_model.compile( loss = "binary_crossentropy", optimizer = "adam", metrics = [ "accuracy" ] )
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
lstm_model.summary()

# Train for up to `epochs` epochs; EarlyStopping ends the run once its
# monitored metric has not improved for 50 consecutive epochs.
epochs = 100
early_stopping = tf.keras.callbacks.EarlyStopping( patience = 50 )
lstm_history = lstm_model.fit( training_reviews, training_sentiments, epochs = epochs, validation_data = ( validation_reviews, validation_sentiments ), callbacks = [ early_stopping ] )

In [None]:
# Stacked recurrent model: embedding -> two bidirectional LSTM layers
# -> dropout -> small dense head.
# Reset Keras' global state *before* building, so the new model's layers are
# created in the fresh session rather than clearing state after the fact.
tf.keras.backend.clear_session()
stacked_lstm_model = tf.keras.Sequential( [
    tf.keras.layers.Embedding( vocab_size, embedding_size, input_length = max_length ),
    # return_sequences = True hands the full sequence to the second LSTM layer.
    tf.keras.layers.Bidirectional( tf.keras.layers.LSTM( embedding_size, return_sequences = True ) ),
    tf.keras.layers.Bidirectional( tf.keras.layers.LSTM( embedding_size ) ),
    tf.keras.layers.Dropout( 0.5 ),
    tf.keras.layers.Dense( 6, activation = "relu" ),
    tf.keras.layers.Dense( 1, activation = "sigmoid" ),
] )
stacked_lstm_model.compile( loss = "binary_crossentropy", optimizer = "adam", metrics = [ "accuracy" ] )
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
stacked_lstm_model.summary()

# Train for up to `epochs` epochs; EarlyStopping ends the run once its
# monitored metric has not improved for 50 consecutive epochs.
epochs = 100
early_stopping = tf.keras.callbacks.EarlyStopping( patience = 50 )
stacked_lstm_history = stacked_lstm_model.fit( training_reviews, training_sentiments, epochs = epochs, validation_data = ( validation_reviews, validation_sentiments ), callbacks = [ early_stopping ] )

**Comparing models used:**

In [None]:
def plot_graphs( history, epochs = None ):
    """Plot training vs. validation accuracy and loss for one training run.

    Creates a new figure with two side-by-side panels: accuracy on the left,
    loss on the right. The figure is not shown here; the caller decides when
    to call ``plt.show()``.

    Args:
        history: Object whose ``history`` attribute is a dict containing the
            "accuracy", "val_accuracy", "loss" and "val_loss" sequences.
        epochs: Optional x-axis values, one per recorded epoch. When omitted,
            ``range( 1, n + 1 )`` is used, where ``n`` is the number of
            recorded loss values, so the axis always matches the run length
            even when training stopped early.
    """
    acc = history.history[ "accuracy" ]
    val_acc = history.history[ "val_accuracy" ]
    loss = history.history[ "loss" ]
    val_loss = history.history[ "val_loss" ]

    # Derive the x-axis from the data unless the caller supplied one explicitly.
    if epochs is None:
        epochs = range( 1, len( loss ) + 1 )

    plt.figure( figsize = ( 8, 8 ) )

    # Left panel: accuracy curves.
    plt.subplot( 1, 2, 1 )
    plt.plot( epochs, acc, label = "Training accuracy" )
    plt.plot( epochs, val_acc, label = "Validation accuracy" )
    plt.title( "Accuracy" )
    plt.xlabel( "Epoch" )
    plt.legend( loc = "lower right" )

    # Right panel: loss curves.
    plt.subplot( 1, 2, 2 )
    plt.plot( epochs, loss, label = "Training loss" )
    plt.plot( epochs, val_loss, label = "Validation loss" )
    plt.title( "Loss" )
    plt.xlabel( "Epoch" )
    plt.legend( loc = "upper right" )
    
# One figure per trained model. The x-axis is derived from how many epochs
# each run actually recorded: early stopping makes that number vary between
# runs, so a hard-coded range would raise a length mismatch on a fresh run.
for history in ( plain_history, cnn_history, gru_history, lstm_history, stacked_lstm_history ):
    plot_graphs( history, range( 1, len( history.history[ "loss" ] ) + 1 ) )
plt.show()

**Predictions of sentiments for new reviews:**

In [None]:
# Hand-written sample reviews used to try out the trained model, in the order
# they are reported below.
new_reviews = [
    "I loved this movie",
    "that was the worst movie I've ever seen",
    "too much violence even for a Bond film",
    "a captivating recounting of a cherished myth",
    "I saw this movie yesterday and I was feeling low to start with, but it was such a wonderful movie that it lifted my spirits and brightened my day, you can\'t go wrong with a movie with Whoopi Goldberg in it.",
    "I don\'t understand why it received an oscar recommendation for best movie, it was long and boring",
    "the scenery was magnificent, the CGI of the dogs was so realistic I thought they were played by real dogs even though they talked!",
    "The ending was so sad and yet so uplifting at the same time. I'm looking for an excuse to see it again",
    "I had expected so much more from a movie made by the director who made my most favorite movie ever, I was very disappointed in the tedious story",
    "I wish I could watch this movie every day for the rest of my life",
]

# Keep the individual names available as well, one per list entry.
( review1, review2, review3, review4, review5,
  review6, review7, review8, review9, review10 ) = new_reviews

def predict_and_print( model, reviews, encoder = None, maxlen = None ):
    """Encode raw review texts and return the model's predictions for them.

    The texts are encoded with the same subword encoder that produced the
    training data. Building a new encoder from the texts being predicted
    would assign unrelated ids to the subwords, so the model would receive
    inputs that do not match what it was trained on.

    Args:
        model: Object exposing ``predict``; called with the padded id matrix.
        reviews: Sequence of raw review strings. It is not modified.
        encoder: Object exposing ``encode( text )``. Defaults to the
            notebook-level ``tokenizer`` built in the tokenization cell.
        maxlen: Length every encoded review is padded / truncated to.
            Defaults to the notebook-level ``max_length``.

    Returns:
        Whatever ``model.predict`` returns for the padded batch.
    """
    if encoder is None:
        encoder = tokenizer
    if maxlen is None:
        maxlen = max_length

    # Encode into a new list so the caller's texts are left untouched.
    encoded_reviews = [ encoder.encode( review ) for review in reviews ]

    # Same padding / truncation settings as the training data.
    padded_reviews = tf.keras.preprocessing.sequence.pad_sequences( encoded_reviews, maxlen = maxlen, padding = "post", truncating = "post" )
    return model.predict( padded_reviews )

print( "On comparing all the above models that we've trained, the model which contains CNN is the best." )
print( "So, we'll use this for our predictions." )
print( "----------" )

# Echo the sample reviews, numbered from 1.
for number, review in enumerate( new_reviews, start = 1 ):
    print( "Review {}: {}".format( number, review ) )

# Pass a shallow copy so new_reviews keeps its original strings even if the
# callee overwrites list items; this keeps the cell safe to re-run.
preds = predict_and_print( cnn_model, list( new_reviews ) )

# Report the model output for each review, numbered to match the list above.
for number, pred in enumerate( preds, start = 1 ):
    print( "Prediction of sentiment of review {}: {}".format( number, pred ) )