In [36]:
import tensorflow_datasets as tfds

In [37]:
# Load the IMDB reviews train and test splits from TensorFlow Datasets.
# as_supervised=True yields (review, label) pairs; with_info=True also
# returns the dataset metadata object.
dataset, info = tfds.load(
    "imdb_reviews",
    split=["train", "test"],
    as_supervised=True,
    with_info=True,
)

In [38]:
# Pull every example of each split into memory as a list of
# (review, label) pairs.
train_data = [example for example in dataset[0]]
test_data = [example for example in dataset[1]]

# Unzip the pairs into parallel tuples of reviews and labels.
X_train, y_train = zip(*train_data)
X_test, y_test = zip(*test_data)

In [39]:
# Convert TensorFlow tensors to plain Python values: reviews become UTF-8
# decoded strings and labels become ints, so later cells work with ordinary
# Python objects rather than tensor objects.
# The names are rebound in place, so the isinstance guard (reviews) and
# int() (labels) keep this cell safe to re-run: on a second run the reviews
# are already str, which has no .numpy() method.
X_train = [x if isinstance(x, str) else x.numpy().decode("utf-8") for x in X_train]
X_test = [x if isinstance(x, str) else x.numpy().decode("utf-8") for x in X_test]
y_train = [int(y) for y in y_train]
y_test = [int(y) for y in y_test]

In [40]:
# Show the first training review together with its label.
print(
    f"Sample Review: {X_train[0]}",
    f"Sentiment: {y_train[0]}",
    sep="\n",
)

Sample Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.
Sentiment: 0


**Text Preprocessing**

In [41]:
import nltk

# Fetch the NLTK resources that the preprocessing cells below rely on:
#   punkt / punkt_tab  - tokenizer models, presumably what word_tokenize loads
#   stopwords          - the corpus behind stopwords.words('english')
#   wordnet / omw-1.4  - WordNet data, presumably used by WordNetLemmatizer
# Kept as five separate calls: the value of the final call is what the cell
# displays as its output.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [42]:
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [43]:
# Shared preprocessing resources, built once and reused for every review.
lemmatizer = WordNetLemmatizer()
# Held as a set so the per-token membership test during filtering is cheap.
stop_words = set(stopwords.words("english"))


In [44]:
#text preprocessing
def preprocess_text(text):
    """Normalise a raw review into a space-separated string of lemmatised tokens.

    Steps, in order:
      1. lower-case the text;
      2. drop HTML line-break tags (``<br>``, ``<br/>``, ``<br />``);
      3. replace every non-word character with a space;
      4. delete runs of digits;
      5. tokenise with NLTK's ``word_tokenize``;
      6. discard tokens present in ``stop_words`` and lemmatise the rest.

    Args:
        text: Review text as a ``str``.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        no token survives the filtering).
    """
    text = text.lower()
    # Review text taken from web pages commonly carries <br /> markup. It has
    # to be removed before the \W substitution below, which would otherwise
    # strip only the angle brackets and slash and leave a meaningless "br"
    # token in the output.
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)  # Converting list back to string


In [45]:
# Clean every review in both splits with preprocess_text.
X_train_clean = list(map(preprocess_text, X_train))
X_test_clean = list(map(preprocess_text, X_test))

# Show the first review before and after cleaning for a quick sanity check.
print(f"Original Review: {X_train[0]}\n")
print(f"Processed Review: {X_train_clean[0]}")

Original Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.

Processed Review: absolutely terrible movie lured christopher walken michael ironside great actor must simply worst role history even great acting could redeem movie ridiculous storyline movie early ninety u propaganda piece pathetic scene columbian rebel making case revo

**Feature Engineering (TF-IDF Vectorization)**

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer. max_features=5000 caps the vocabulary at the 5000 terms
# with the highest term frequency across the training corpus.
vectorizer = TfidfVectorizer(
    max_features=5000,
)

# The vocabulary and IDF weights are learned from the training reviews only;
# the test reviews are projected onto that same fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train_clean)
X_test_tfidf = vectorizer.transform(X_test_clean)

print(f"TF-IDF Matrix Shape (Train): {X_train_tfidf.shape}")
print(f"TF-IDF Matrix Shape (Test): {X_test_tfidf.shape}")

TF-IDF Matrix Shape (Train): (25000, 5000)
TF-IDF Matrix Shape (Test): (25000, 5000)


**Model Training (Logistic Regression)**

In [47]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier trained on the TF-IDF features.
# max_iter=500 raises the solver's iteration cap above scikit-learn's
# default of 100.
model = LogisticRegression(
    max_iter=500,
)
model.fit(X_train_tfidf, y_train)

**Model Evaluation**

In [48]:
from sklearn.metrics import accuracy_score, classification_report


# Predict a label for every test review.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy, reported to four decimal places.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 table, printed under its heading.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred),
    sep="\n",
)

Model Accuracy: 0.8790

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



**Model accuracy on the test set: 87.9%**

**Testing with Sample Reviews**

In [49]:
def predict_sentiment(review):
    """Classify one raw review as ``"Positive"`` or ``"Negative"``.

    The review is cleaned with ``preprocess_text``, turned into TF-IDF
    features by the already-fitted ``vectorizer`` and scored by the trained
    ``model``.

    Args:
        review: Raw review text.

    Returns:
        ``"Positive"`` when the model predicts label 1, otherwise
        ``"Negative"``.
    """
    cleaned = preprocess_text(review)
    # transform() takes a collection of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"


In [50]:
# A handful of viewer reviews of Reacher (Prime Video) used to spot-check
# the classifier. Long reviews are wrapped with implicit string
# concatenation; the resulting values are unchanged.
sample_reviews = [
    "True to the book. True to the character. Entertaining and fun.",
    "Absolutely terrible. I regret wasting my time on this nonsense.",
    (
        "I am sure there will be a season 2 and I hope it will be stronger "
        "than many of other Prime season 2 series"
    ),
    (
        "My only disappointment, after watching the opening scenes at least "
        "a dozen times now, is that bite of peach pie never reaches Reacher's "
        "mouth when the local cops enter the cafe."
    ),
]


In [51]:
# Print each sample review followed by the sentiment the model assigns to it.
for review in sample_reviews:
    print(f"Review: {review}")
    verdict = predict_sentiment(review)
    print(f"Predicted Sentiment: {verdict}\n")

Review: True to the book. True to the character. Entertaining and fun.
Predicted Sentiment: Positive

Review: Absolutely terrible. I regret wasting my time on this nonsense.
Predicted Sentiment: Negative

Review: I am sure there will be a season 2 and I hope it will be stronger than many of other Prime season 2 series
Predicted Sentiment: Positive

Review: My only disappointment, after watching the opening scenes at least a dozen times now, is that bite of peach pie never reaches Reacher's mouth when the local cops enter the cafe.
Predicted Sentiment: Negative

