# Look at Big Picture

IMDB dataset for sentiment analysis -> 0 for negative, 1 for positive

In [1]:
import numpy as np 
import pandas as pd
import matplotlib as mpl 
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(42)

In [2]:
# Deep-learning stack: TensorFlow and the Keras API it exposes
import tensorflow as tf
from tensorflow import keras

# Seed TensorFlow's global random generator so runs are repeatable
tf.random.set_seed(42)

ModuleNotFoundError: No module named 'tensorflow'

# Get the Data

## Download the Data

In [None]:
# downloaded from kaggle

## Take a Quick Look at Data

In [None]:
# Load the reviews CSV into a DataFrame (later cells read its 'text' and 'label' columns)
dataset = pd.read_csv('movie.csv')

In [None]:
# Preview the first rows of the dataset
dataset.head()

In [None]:
# Summarize the columns, their dtypes and non-null counts
dataset.info()

In [None]:
# Count missing values in each column
dataset.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row
dataset.duplicated().sum()

In [None]:
# Remove duplicated rows (pandas keeps the first occurrence by default);
# the original row index is left untouched, so it may contain gaps afterwards.
dataset = dataset.drop_duplicates()

In [None]:
# Show each distinct label value together with how many rows carry it
dataset['label'].value_counts()

In [None]:
# The two classes are nearly balanced, so accuracy is a suitable evaluation metric.

## Create Train, Validation and Test Sets

In [6]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are the 0/1 sentiment labels.
X, y = dataset['text'], dataset['label']

# Two-stage split: hold out 40% of the rows first, then cut that hold-out in
# half to obtain the validation and test sets (60% / 20% / 20% overall).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

NameError: name 'dataset' is not defined

In [None]:
# Size of the training split
X_train.shape

In [None]:
# Size of the validation split
X_valid.shape

In [None]:
# Size of the test split
X_test.shape

# Discovering & Visualizing Data

## Visualizing Data

In [None]:
# Count plot showing how many training examples carry each label
label_ax = sns.countplot(x=y_train)
label_ax.set_title("Label Distribution in Training Set")
label_ax.set_xlabel("Sentiment (0 = Negative, 1 = Positive)")
label_ax.set_ylabel("Count")
plt.show()

In [None]:
import re
from collections import Counter

# A token is a maximal run of lowercase ASCII letters delimited by word boundaries
word_pattern = re.compile(r'\b[a-z]+\b')

# Merge every training review into one space-separated string
all_text = ' '.join(X_train)

# Lowercase first so the pattern sees every alphabetic word, then tokenize
words = word_pattern.findall(all_text.lower())

# Tally how often each token occurs
word_freq = Counter(words)

In [None]:
# Number of words shown in the frequency plots, and the matching
# (word, count) pairs ordered from most to least frequent
words_rep = 20
most_common_words = word_freq.most_common(n=words_rep)

In [None]:
# Bar chart of the most frequent words in the training text.
# Split the (word, count) pairs into parallel tuples for plotting.
words_bar, freqs_bar = zip(*most_common_words)

plt.figure(figsize=(12, 6))
plt.bar(words_bar, freqs_bar, color='skyblue')
plt.xticks(rotation=45)
# Take the count from the data actually plotted so the title stays correct
# when `words_rep` is changed or fewer distinct words are available.
plt.title(f'Top {len(words_bar)} Most Frequent Words in X_train')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Word cloud sized by token frequency
from wordcloud import WordCloud

cloud_canvas = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_canvas.generate_from_frequencies(word_freq)

plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # axes carry no information for an image
plt.title('Word Cloud of X_train')
plt.show()

In [None]:
# Repeat the frequency analysis with English stopwords removed
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus (needed before `stopwords.words` can be used)
nltk.download('stopwords')

# A set gives constant-time membership tests while filtering
stop_words = set(stopwords.words('english'))

filtered_words = [token for token in words if token not in stop_words]
# NOTE: this rebinds `word_freq`, replacing the unfiltered counts
word_freq = Counter(filtered_words)

In [None]:
# Recompute the top `words_rep` (word, count) pairs from the current `word_freq`
most_common_words = word_freq.most_common(words_rep)

In [None]:
# Bar chart of the most frequent non-stopword tokens in the training text.
# Split the (word, count) pairs into parallel tuples for plotting.
words_bar, freqs_bar = zip(*most_common_words)

plt.figure(figsize=(12, 6))
plt.bar(words_bar, freqs_bar, color='salmon')
plt.xticks(rotation=45)
# Take the count from the data actually plotted so the title stays correct
# when `words_rep` is changed or fewer distinct words are available.
plt.title(f'Top {len(words_bar)} Most Frequent Words in X_train (No Stopwords)')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Word cloud built from the current (stopword-free) frequencies
cloud_canvas = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_canvas.generate_from_frequencies(word_freq)

plt.figure(figsize=(15, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # axes carry no information for an image
plt.title('Word Cloud of X_train (No Stopwords)')
plt.show()

# Prepare Data for ML Algorithm

## Data Cleaning

In [None]:
# No further data cleaning is required at this stage.

## Handling Text Attributes

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from bs4 import BeautifulSoup
import re

class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer that normalizes raw review text.

    Each document has its HTML markup stripped, is lower-cased, and has
    digits, punctuation and redundant whitespace removed.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a cleaned version of every document in ``X``.

        An object exposing ``apply`` (e.g. a pandas Series) is cleaned through
        ``apply`` and comes back as the same kind of object; any other
        iterable of strings is returned as a list of cleaned strings.
        """
        if hasattr(X, "apply"):
            return X.apply(self._clean_text)
        return [self._clean_text(text) for text in X]

    def _clean_text(self, text):
        """Clean a single document and return the resulting string."""
        # Remove HTML. The space separator keeps the words on either side of
        # a tag apart; without it "end.<br /><br />The" collapses to "end.The"
        # and, once punctuation is stripped, to the bogus token "endthe".
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        # Lowercase
        text = text.lower()
        # Remove numbers
        text = re.sub(r'\d+', '', text)
        # Remove punctuation (anything that is neither a word character nor whitespace)
        text = re.sub(r'[^\w\s]', '', text)
        # Collapse runs of whitespace and trim both ends
        text = re.sub(r'\s+', ' ', text).strip()

        return text

In [None]:
cleaner = TextCleaner()

# Run the same cleaning over each of the three splits
X_train_clean, X_valid_clean, X_test_clean = (
    cleaner.transform(split) for split in (X_train, X_valid, X_test)
)

In [None]:
# After cleaning, the text is tokenized and padded so that it can be fed to an embedding layer.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Cap the vocabulary via num_words; words outside it map to the "<OOV>" token
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
# Learn the vocabulary from the *cleaned* training text, i.e. the same text
# that is converted to sequences below. Fitting on the raw reviews would let
# HTML remnants and digit tokens take up vocabulary slots that the cleaned
# sequences never use.
tokenizer.fit_on_texts(X_train_clean)

# Convert each cleaned document to a sequence of integer word indices
X_train_seq = tokenizer.texts_to_sequences(X_train_clean)
X_valid_seq = tokenizer.texts_to_sequences(X_valid_clean)
X_test_seq  = tokenizer.texts_to_sequences(X_test_clean)

In [None]:
# Bring every sequence to one fixed length
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 200
# Shared settings: pad and truncate at the end of each sequence
pad_options = dict(maxlen=max_len, padding='post', truncating='post')

X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_valid_pad = pad_sequences(X_valid_seq, **pad_options)
X_test_pad  = pad_sequences(X_test_seq, **pad_options)

In [None]:
# Walk one hand-written review through the whole preprocessing pipeline
sample = 'i like the movie so much <br> it is excallent'

# Step 1: Clean the text (positional .iloc[0] picks the single element
# regardless of how the Series happens to be indexed)
sample_cleaned = cleaner.transform(pd.Series([sample])).iloc[0]

# Step 2: Tokenize
sample_seq = tokenizer.texts_to_sequences([sample_cleaned])[0]

# Step 3: Pad to the same length as the training data (`max_len`) rather than
# repeating the literal, so the demo cannot drift from the real pipeline
sample_pad = pad_sequences([sample_seq], maxlen=max_len, padding='post', truncating='post')[0]

# Output
print("🔹 Original:", sample)
print("🔹 Cleaned:", sample_cleaned)
print("🔹 Tokenized:", sample_seq)
print("🔹 Padded:", sample_pad)

## Feature Scaling

In [None]:
# no feature scaling required

## Final Data

In [None]:
# final data for ml model is in X_train_pad, X_valid_pad, X_test_pad

# Train a Model

## Create Embedding Layer

In [None]:
# Read the pre-trained GloVe vectors (glove.6B.50d.txt, 50 values per word)
# into a dict mapping each word to its float32 vector.

embedding_index = {}
with open('glove.6B.50d.txt', encoding='utf8') as glove_file:
    for row in glove_file:
        # First whitespace-separated field is the word, the rest its coefficients
        fields = row.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [None]:
# Embedding matrix: row i holds the GloVe vector of the word whose tokenizer
# index is i; rows for words without a GloVe vector stay all-zero.
embedding_dim = 50
word_index = tokenizer.word_index
# Read the vocabulary cap from the tokenizer instead of repeating the literal,
# so the matrix size cannot drift from the tokenizer configuration.
num_words = min(tokenizer.num_words, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= num_words:
        # Index lies outside the capped vocabulary
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
from keras.initializers import Constant
from keras.layers import Embedding

# Embedding layer initialized with the pre-trained GloVe matrix.
# A Constant initializer is used instead of the legacy `weights=[...]`
# constructor argument, which some Keras 3 releases reject, and the
# deprecated `input_length` argument is dropped: the layer takes the
# sequence length from its input.
embeddings = Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
)

## Create & Train a Model

In [None]:
# Reset Keras' global state, then re-seed NumPy and TensorFlow so the model
# built next starts from a repeatable state
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Embedding -> LSTM -> two ELU dense layers -> sigmoid output,
# with dropout after the LSTM and after the dense stack
layer_stack = [
    embeddings,
    LSTM(100, return_sequences=False),
    Dropout(0.3),
    Dense(100, activation='elu', kernel_initializer='he_normal'),
    Dense(100, activation='elu', kernel_initializer='he_normal'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
]
model = Sequential(layer_stack)

# Keep the embedding layer (first layer) fixed during the initial training
model.layers[0].trainable = False

In [None]:
# Binary cross-entropy loss, Nadam optimizer, accuracy as the reported metric
nadam = keras.optimizers.Nadam(learning_rate=1e-4)
model.compile(optimizer=nadam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop when val_loss has not improved for `patience` epochs. The patience must
# be smaller than the number of epochs in a run for the callback to have any
# effect: the previous value of 10 could never trigger in a 10-epoch run.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Write the model to disk whenever val_loss improves
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)

# Train
history = model.fit(X_train_pad, y_train,
                    validation_data=(X_valid_pad, y_valid),
                    epochs=10, batch_size=32,
                    callbacks=[early_stop, checkpoint])

In [None]:
# Continue training for up to 20 more epochs with the same callbacks
history = model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_valid_pad, y_valid),
    epochs=20,
    batch_size=32,
    callbacks=[early_stop, checkpoint],
)

In [None]:
from keras.models import load_model

# Reload the model saved by the ModelCheckpoint callback
model = load_model('best_model.keras')

In [None]:
# Unfreeze the embedding layer so it is updated during the following training runs
model.layers[0].trainable = True

In [None]:
# Compile again after changing the layer's `trainable` flag
nadam = keras.optimizers.Nadam(learning_rate=1e-4)
model.compile(optimizer=nadam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:

# Train with the embedding layer unfrozen
history = model.fit(
    x=X_train_pad,
    y=y_train,
    validation_data=(X_valid_pad, y_valid),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop, checkpoint],
)

In [None]:
# lets train our final model for other 10 epochs
history = model.fit(X_train_pad, y_train,
                      validation_data=(X_valid_pad, y_valid),
                      epochs=10, batch_size=32,
                      callbacks=[early_stop, checkpoint])

In [None]:
# load best model
# Reload the checkpointed model from disk before evaluating
model = load_model('best_model.keras')

# Evaluate Test Set

In [None]:
# Score the model on the held-out test split; evaluate() yields the loss
# followed by the compiled metric (accuracy)
test_metrics = model.evaluate(X_test_pad, y_test, verbose=1)
test_loss, test_accuracy = test_metrics

print(f"\n Test Accuracy: {test_accuracy:.4f}")
print(f" Test Loss: {test_loss:.4f}")