In [9]:
# 1.1 Import Libraries
import pandas as pd
import numpy as np
import re # Regular expressions for text cleaning
import nltk # Natural Language Toolkit
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # Optional: for stemming
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib # To save/load models and vectorizer (optional)
import csv

print("Libraries imported successfully.")

Libraries imported successfully.


In [11]:
# Load the CSV file (after downloading from Kaggle)
file_path = "IMDB Dataset.csv"  # Adjust path as needed
df = pd.read_csv(
    "IMDB Dataset.csv",
    quoting=csv.QUOTE_NONE,
    encoding='utf-8',
    on_bad_lines='skip'  # <-- updated argument for pandas 2.0+
)
# Display basic info and first few rows
print("\nDataset Info:")
df.info()

print("\nFirst 5 rows:")
print(df.head())

# Map sentiment labels to numerical values (positive: 1, negative: 0)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

print("\nSentiment value counts (1: positive, 0: negative):")
print(df['sentiment'].value_counts())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 35723 entries, ('"One of the other reviewers has mentioned that after watching just 1 Oz episode you\'ll be hooked. They are right', ' as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence', ' which set in right from the word GO. Trust me', ' this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs', ' sex or violence. Its is hardcore', ' in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City', ' an experimental section of the prison where all the cells have glass fronts and face inwards', ' so privacy is not high on the agenda. Em City is home to many..Aryans', ' Muslims', ' gangstas', ' Latinos', ' Christians', ' Italians', ' Irish and more....so scuffles', ' death stares

In [14]:


# Download stopwords if not already downloaded
nltk.download('stopwords')

# Set of English stopwords
stop_words = set(stopwords.words('english'))


# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Optional: Apply stemming
    # words = [ps.stem(word) for word in words]
    return ' '.join(words)

# Drop rows with missing reviews
df = df.dropna(subset=['review'])

# Apply preprocessing
print("\nPreprocessing text data... (This may take a few minutes)")
df['cleaned_review'] = df['review'].apply(preprocess_text)
print("Text preprocessing complete.")

# Show an example
print("\nExample Preprocessing:")
print("Original:", df['review'].iloc[0][:200] + "...")
print("Cleaned:", df['cleaned_review'].iloc[0][:200] + "...")



Preprocessing text data... (This may take a few minutes)
Text preprocessing complete.

Example Preprocessing:
Original:  you may become comfortable with what is uncomfortable viewing....thats if you can get in touch with your darker side."...
Cleaned: may become comfortable uncomfortable viewingthats get touch darker side...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_review'] = df['review'].apply(preprocess_text)


In [16]:
df = df.dropna(subset=['review', 'sentiment'])

# 3.1 Split Data into Training and Testing Sets
X = df['cleaned_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
# stratify=y ensures the proportion of positive/negative reviews is similar in train and test sets

print(f"\nData Split:")
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# 3.2 TF-IDF Vectorization
# Initialize TF-IDF Vectorizer
# max_features limits the vocabulary size to the most frequent terms, useful for large datasets
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # You can tune max_features

# Fit the vectorizer on the training data and transform the training data
print("\nFitting TF-IDF Vectorizer and transforming training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test data using the *same* fitted vectorizer
print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("TF-IDF transformation complete.")
print(f"Shape of TF-IDF matrix (Train): {X_train_tfidf.shape}") # (num_samples, num_features)
print(f"Shape of TF-IDF matrix (Test): {X_test_tfidf.shape}")


Data Split:
Training set size: 204 samples
Testing set size: 68 samples

Fitting TF-IDF Vectorizer and transforming training data...
Transforming test data...
TF-IDF transformation complete.
Shape of TF-IDF matrix (Train): (204, 1092)
Shape of TF-IDF matrix (Test): (68, 1092)


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Step 1: Define the DNN model
dnn_model = Sequential()
dnn_model.add(Dense(128, activation='relu', input_shape=(X_train_tfidf.shape[1],)))
dnn_model.add(Dropout(0.3))
dnn_model.add(Dense(64, activation='relu'))
dnn_model.add(Dropout(0.3))
dnn_model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Step 2: Compile the model
dnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Step 3: Train the model
print("\nTraining Deep Neural Network model...")
history = dnn_model.fit(
    X_train_tfidf.toarray(), y_train,  # TF-IDF is usually sparse, convert to dense
    epochs=10,
    batch_size=512,
    validation_split=0.2,
    verbose=1
)
print("Model training complete.")

# Step 4: Evaluate on test set
loss, accuracy = dnn_model.evaluate(X_test_tfidf.toarray(), y_test)
print(f"\nTest Accuracy: {accuracy:.4f}")



Training Deep Neural Network model...
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.4233 - loss: 0.6938 - val_accuracy: 0.3171 - val_loss: 0.7002
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step - accuracy: 0.5890 - loss: 0.6886 - val_accuracy: 0.3171 - val_loss: 0.7008
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 93ms/step - accuracy: 0.6380 - loss: 0.6868 - val_accuracy: 0.3415 - val_loss: 0.7011
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step - accuracy: 0.6380 - loss: 0.6833 - val_accuracy: 0.3415 - val_loss: 0.7016
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - accuracy: 0.6810 - loss: 0.6810 - val_accuracy: 0.3415 - val_loss: 0.7022
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 97ms/step - accuracy: 0.7546 - loss: 0.6745 - val_accuracy: 0.3659 - val_loss: 0.7029
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Predict probabilities on the test set
print("\nEvaluating DNN model on the test set...")
y_pred_prob = dnn_model.predict(X_test_tfidf.toarray())

# Convert probabilities to binary predictions (0 or 1)
y_pred = (y_pred_prob > 0.5).astype("int32").flatten()

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Display confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
# Format:
# [[TN, FP],
#  [FN, TP]]

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative (0)', 'Positive (1)']))


Evaluating DNN model on the test set...
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step

Accuracy: 0.4559

Confusion Matrix:
[[30  2]
 [35  1]]

Classification Report:
              precision    recall  f1-score   support

Negative (0)       0.46      0.94      0.62        32
Positive (1)       0.33      0.03      0.05        36

    accuracy                           0.46        68
   macro avg       0.40      0.48      0.33        68
weighted avg       0.39      0.46      0.32        68



In [19]:
from sklearn.preprocessing import FunctionTransformer
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Sentiment mapping
sentiment_labels = {1: 'Positive', 0: 'Negative'}

# New review examples
new_reviews = [
    "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged throughout.",
    "What a waste of time. The plot was predictable and the characters were incredibly boring. I would not recommend this film.",
    "It was an okay movie, not great but not terrible either. Some good moments but overall quite average."
]

print("\n--- Testing on New Reviews ---")

# Step 1: Preprocess the new reviews
cleaned_new_reviews = [preprocess_text(review) for review in new_reviews]
print("Cleaned Reviews:", cleaned_new_reviews)

# Step 2: Transform to TF-IDF (use the same vectorizer you used for training)
new_reviews_tfidf = tfidf_vectorizer.transform(cleaned_new_reviews)
print("Shape of TF-IDF for new reviews:", new_reviews_tfidf.shape)

# Step 3: Predict using the DNN model
new_predictions_prob = dnn_model.predict(new_reviews_tfidf.toarray())
new_predictions = (new_predictions_prob > 0.5).astype("int32").flatten()

# Step 4: Print results
for review, pred, prob in zip(new_reviews, new_predictions, new_predictions_prob):
    print(f"\nReview: \"{review[:100]}...\"")
    print(f"Predicted Sentiment: {sentiment_labels[pred]} ({pred}) with confidence: {prob[0]:.4f}")



--- Testing on New Reviews ---
Cleaned Reviews: ['movie absolutely fantastic acting superb storyline kept engaged throughout', 'waste time plot predictable characters incredibly boring would recommend film', 'okay movie great terrible either good moments overall quite average']
Shape of TF-IDF for new reviews: (3, 1092)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step

Review: "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged through..."
Predicted Sentiment: Negative (0) with confidence: 0.4836

Review: "What a waste of time. The plot was predictable and the characters were incredibly boring. I would no..."
Predicted Sentiment: Negative (0) with confidence: 0.4699

Review: "It was an okay movie, not great but not terrible either. Some good moments but overall quite average..."
Predicted Sentiment: Negative (0) with confidence: 0.4962


In [20]:
extra_reviews = [
    "Honestly, I fell asleep halfway through. That should tell you enough.",
    "I've never laughed so hard in my life. 10/10 comedy!",
    "This was the best worst movie I've ever seen. So bad it's actually good.",
    "Meh. It exists. That's about the best thing I can say.",
    "Wow. Just wow. I didn't expect much, and yet I was still disappointed.",
    "What a cinematic masterpiece. A pure work of art!",
    "Two hours of my life I can never get back. Thanks a lot.",
    "It was alright, I guess. Not my favorite but watchable.",
    "Acting? What acting? Felt like a school play on a low budget.",
    "Not terrible, not great. Just average in every way."
]
# Clean and vectorize
cleaned_extra_reviews = [preprocess_text(review) for review in extra_reviews]
extra_reviews_tfidf = tfidf_vectorizer.transform(cleaned_extra_reviews)

# Predict using the DNN model
extra_predictions_prob = dnn_model.predict(extra_reviews_tfidf.toarray())
extra_predictions = (extra_predictions_prob > 0.5).astype("int32").flatten()

# Output predictions
print("\n--- Testing on Extra Reviews ---")
for review, pred, prob in zip(extra_reviews, extra_predictions, extra_predictions_prob):
    print(f"\nReview: \"{review[:100]}...\"")
    print(f"Predicted Sentiment: {'😊 Positive' if pred == 1 else '😠 Negative'} ({prob[0]:.4f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step

--- Testing on Extra Reviews ---

Review: "Honestly, I fell asleep halfway through. That should tell you enough...."
Predicted Sentiment: 😠 Negative (0.4814)

Review: "I've never laughed so hard in my life. 10/10 comedy!..."
Predicted Sentiment: 😠 Negative (0.4896)

Review: "This was the best worst movie I've ever seen. So bad it's actually good...."
Predicted Sentiment: 😊 Positive (0.5026)

Review: "Meh. It exists. That's about the best thing I can say...."
Predicted Sentiment: 😊 Positive (0.5201)

Review: "Wow. Just wow. I didn't expect much, and yet I was still disappointed...."
Predicted Sentiment: 😠 Negative (0.4966)

Review: "What a cinematic masterpiece. A pure work of art!..."
Predicted Sentiment: 😊 Positive (0.5023)

Review: "Two hours of my life I can never get back. Thanks a lot...."
Predicted Sentiment: 😠 Negative (0.4830)

Review: "It was alright, I guess. Not my favorite but watchable...."
Predicted 