In [197]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [198]:
df = pd.read_csv("IMDB Dataset.csv")

In [199]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [200]:
df.shape

(50000, 2)

## Step 2: Preprocessing (Tokenization & Vocabulary Building)

- We'll convert text into numerical data.

In [201]:
from torch.nn.utils.rnn import pad_sequence

In [202]:
## we can also use the above library but we will use the basics of NLP for text preprocessing

In [203]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [204]:
# Download required resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [205]:
# Initialize stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [206]:
# tokenize
# to understand this function visit the another notebook
def tokenize(text):
  text = text.lower()
  text = text.replace('?','')
  text = text.replace("'","")
  return text.split()


def preprocess(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove non-alphabetic characters (punctuation, etc.)
    text = re.sub(r'\W', ' ', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords and apply stemming
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    
    return tokens

In [207]:
# Apply preprocessing to the review column
df['tokens'] = df['review'].apply(preprocess)

In [208]:
df.head()

Unnamed: 0,review,sentiment,tokens
0,One of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, 1, oz, episod, h..."
1,A wonderful little production. <br /><br />The...,positive,"[wonder, littl, product, br, br, film, techniq..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonder, way, spend, time, hot, summe..."
3,Basically there's a family where a little boy ...,negative,"[basic, famili, littl, boy, jake, think, zombi..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visual, st..."


In [209]:
## Building the Vocabulary

""" 
'<UNK>' (short for Unknown) is a special token used to represent out-of-vocabulary (OOV) 
words—words that are not found in the training vocabulary.

The dictionary initializes '<UNK>' with an index of 0. This means that any word that is not in the vocabulary
will be replaced with this special token and assigned index 0.

"""

vocab = {'<UNK>': 0}  # Add <UNK> for unknown words

# Function to build the vocabulary
def build_vocab(row):
    tokens = row['tokens']# Get tokenized review
    
    for token in tokens:  # Iterate through tokens in the review
        if token not in vocab:  # If token is not already in vocab
            vocab[token] = len(vocab)  # Assign it a unique index

In [210]:
# Apply the function to each row of the dataframe
df.apply(build_vocab, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
49995    None
49996    None
49997    None
49998    None
49999    None
Length: 50000, dtype: object

In [211]:
vocab

{'<UNK>': 0,
 'one': 1,
 'review': 2,
 'mention': 3,
 'watch': 4,
 '1': 5,
 'oz': 6,
 'episod': 7,
 'hook': 8,
 'right': 9,
 'exactli': 10,
 'happen': 11,
 'br': 12,
 'first': 13,
 'thing': 14,
 'struck': 15,
 'brutal': 16,
 'unflinch': 17,
 'scene': 18,
 'violenc': 19,
 'set': 20,
 'word': 21,
 'go': 22,
 'trust': 23,
 'show': 24,
 'faint': 25,
 'heart': 26,
 'timid': 27,
 'pull': 28,
 'punch': 29,
 'regard': 30,
 'drug': 31,
 'sex': 32,
 'hardcor': 33,
 'classic': 34,
 'use': 35,
 'call': 36,
 'nicknam': 37,
 'given': 38,
 'oswald': 39,
 'maximum': 40,
 'secur': 41,
 'state': 42,
 'penitentari': 43,
 'focus': 44,
 'mainli': 45,
 'emerald': 46,
 'citi': 47,
 'experiment': 48,
 'section': 49,
 'prison': 50,
 'cell': 51,
 'glass': 52,
 'front': 53,
 'face': 54,
 'inward': 55,
 'privaci': 56,
 'high': 57,
 'agenda': 58,
 'em': 59,
 'home': 60,
 'mani': 61,
 'aryan': 62,
 'muslim': 63,
 'gangsta': 64,
 'latino': 65,
 'christian': 66,
 'italian': 67,
 'irish': 68,
 'scuffl': 69,
 'death': 

In [212]:
len(vocab)

71513

In [213]:
## Convert Text to Indices Using

def text_to_indices(text, vocab):
    indexed_text = []  # Initialize an empty list to store numerical indices

    # Tokenize the input text
    for token in preprocess(text):  # Iterate over each token (word) in the tokenized text
        if token in vocab:  # If the token is in the vocabulary
            indexed_text.append(vocab[token])  # Append the corresponding index from vocab
        else:  # If the token is not in the vocabulary
            indexed_text.append(vocab['<UNK>'])  # Append the index for the <UNK> token

    return indexed_text  # Return the list of indices

In [214]:
# Apply the function to the 'review' column
df['encoded_review'] = df['review'].apply(lambda x: text_to_indices(x, vocab))

In [215]:
df

Unnamed: 0,review,sentiment,tokens,encoded_review
0,One of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, 1, oz, episod, h...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13..."
1,A wonderful little production. <br /><br />The...,positive,"[wonder, littl, product, br, br, film, techniq...","[136, 137, 138, 12, 12, 139, 140, 141, 142, 14..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonder, way, spend, time, hot, summe...","[200, 136, 201, 202, 143, 203, 204, 205, 206, ..."
3,Basically there's a family where a little boy ...,negative,"[basic, famili, littl, boy, jake, think, zombi...","[264, 265, 137, 266, 267, 268, 269, 270, 271, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visual, st...","[304, 305, 238, 143, 306, 307, 308, 139, 4, 30..."
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,"[thought, movi, right, good, job, creativ, ori...","[200, 273, 9, 362, 1140, 4446, 494, 13, 292, 6..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, bad, dialogu, bad, act, idiot, dir...","[483, 211, 483, 213, 483, 361, 3663, 363, 722,..."
49997,I am a Catholic taught in parochial elementary...,negative,"[cathol, taught, parochi, elementari, school, ...","[7976, 10930, 7761, 9310, 764, 12202, 10930, 4..."
49998,I'm going to have to disagree with the previou...,negative,"[go, disagre, previou, comment, side, maltin, ...","[22, 7281, 339, 513, 135, 5430, 1, 1472, 1143,..."


In [216]:
mapping_dict = {'positive': 1, 'negative': 0}
df['Sentiment'] = df['sentiment'].map(mapping_dict)

In [217]:
df

Unnamed: 0,review,sentiment,tokens,encoded_review,Sentiment
0,One of the other reviewers has mentioned that ...,positive,"[one, review, mention, watch, 1, oz, episod, h...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 13...",1
1,A wonderful little production. <br /><br />The...,positive,"[wonder, littl, product, br, br, film, techniq...","[136, 137, 138, 12, 12, 139, 140, 141, 142, 14...",1
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonder, way, spend, time, hot, summe...","[200, 136, 201, 202, 143, 203, 204, 205, 206, ...",1
3,Basically there's a family where a little boy ...,negative,"[basic, famili, littl, boy, jake, think, zombi...","[264, 265, 137, 266, 267, 268, 269, 270, 271, ...",0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visual, st...","[304, 305, 238, 143, 306, 307, 308, 139, 4, 30...",1
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,"[thought, movi, right, good, job, creativ, ori...","[200, 273, 9, 362, 1140, 4446, 494, 13, 292, 6...",1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, bad, dialogu, bad, act, idiot, dir...","[483, 211, 483, 213, 483, 361, 3663, 363, 722,...",0
49997,I am a Catholic taught in parochial elementary...,negative,"[cathol, taught, parochi, elementari, school, ...","[7976, 10930, 7761, 9310, 764, 12202, 10930, 4...",0
49998,I'm going to have to disagree with the previou...,negative,"[go, disagre, previou, comment, side, maltin, ...","[22, 7281, 339, 513, 135, 5430, 1, 1472, 1143,...",0


In [218]:
from torch.utils.data import Dataset, DataLoader

In [219]:
class MovieDataset(Dataset):
    def __init__(self, df, vocab):
        self.df = df
        self.vocab = vocab

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        numerical_review = text_to_indices(self.df.iloc[index]['review'], self.vocab)
        sentiment_label  = self.df.iloc[index]['sentiment']

        sentiment = 1 if sentiment_label == "positive" else 0  # Keep sentiment as int (not tensor)

        return numerical_review, sentiment  # Return list (not tensor) to be handled by collate_fn

In [220]:
# dataset = MovieDataset(df, vocab)

In [221]:
# dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [222]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    reviews, sentiments = zip(*batch)  # Unzip batch into reviews and sentiments

    # Convert each review (list) into a tensor
    reviews = [torch.tensor(r, dtype=torch.long) for r in reviews]

    # Pad sequences to the same length
    padded_reviews = pad_sequence(reviews, batch_first=True, padding_value=0)

    # Convert sentiments list into a tensor
    sentiments = torch.tensor(sentiments, dtype=torch.long)

    return padded_reviews, sentiments

In [223]:
BATCH_SIZE = 32

dataset = MovieDataset(df, vocab)
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)

In [224]:
# Example: Fetch one batch
for batch in train_loader:
    reviews, sentiments = batch
    print(reviews.shape)  # (batch_size, max_seq_len)
    print(sentiments.shape)  # (batch_size,)
    break

torch.Size([32, 364])
torch.Size([32])


In [225]:
import torch.nn as nn

In [226]:
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.embeddings = nn.Embedding(vocab_size, embedding_dim=50)
        self.rnn = nn.RNN(50, 128, batch_first=True)
        self.fc = nn.Linear(128, 2)

    def forward(self, review):
        embedding_review = self.embeddings(review)
        hidden, final = self.rnn(embedding_review)
        output = self.fc(final[-1])  # final_hidden[-1] has shape (batch_size, hidden_dim)

        return output

In [227]:
model = SimpleRNN(len(vocab))

In [228]:
learning_rate = 0.001
epochs = 5

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [229]:
for epoch in range(epochs):
    total_loss = 0

    for review, sentiment in train_loader:
        sentiment = sentiment.long()  # Fix Issue 1 (Remove squeeze)

        optimizer.zero_grad()

        # Forward pass
        output = model(review)  # Shape: (batch_size, 2)

        # Compute loss
        loss = criterion(output, sentiment)  # Fix Issue 2 (Ensure sentiment is correct shape)

        # Backpropagation
        loss.backward()
        optimizer.step()

        # Accumulate loss
        total_loss += loss.item()  # Fix Issue 3 (Use +=)

    print(f"Epoch: {epoch+1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 1091.9156
Epoch: 2, Loss: 1091.1613
Epoch: 3, Loss: 1092.4059
Epoch: 4, Loss: 1091.8880
Epoch: 5, Loss: 1090.3075


In [230]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for feature, labels in train_loader:
        # feature, labels = feature.to(device), labels.to(device)
        outputs = model(feature)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Validation Accuracy: {accuracy:.2f}%")

Validation Accuracy: 50.12%


In [246]:
def predict(model, review, vocab, threshold=0.4):
    # Convert text to numerical indices
    numerical_review = text_to_indices(review, vocab)

    # Convert to tensor and reshape for batch dimension
    review_tensor = torch.tensor(numerical_review, dtype=torch.long).unsqueeze(0)

    # Get model predictions
    output = model(review_tensor)

    # Convert logits to probabilities
    probs = torch.nn.functional.softmax(output, dim=1)

    # Get the max probability and corresponding sentiment index
    confidence, index = torch.max(probs, dim=1)

    # Convert tensor to float value
    confidence = float(confidence.item())

    print("confidence: ", confidence)

    # If confidence is too low, return "I don't know"
    if confidence < threshold:
        return "I don't know"

    # Convert index to sentiment label
    sentiment_label = "positive" if index.item() == 1 else "negative"

    return sentiment_label

In [247]:
predict(model, "moview was good", vocab)

confidence:  0.5656965374946594


'positive'

In [248]:
# def predict(model, review, vocab, threshold=0.5):
#     # Convert text to numerical indices
#     numerical_review = text_to_indices(review, vocab)

#     # Convert to tensor and reshape for batch dimension
#     review_tensor = torch.tensor(numerical_review, dtype=torch.long).unsqueeze(0)

#     # Get model predictions (logits)
#     output = model(review_tensor)

#     # Convert logits to probabilities
#     probs = torch.nn.functional.softmax(output, dim=1)

#     # Get the max probability and corresponding sentiment index
#     confidence, index = torch.max(probs, dim=1)

#     # Convert tensor to float
#     confidence = float(confidence.item())

#     # 🔄 Flip the prediction
#     flipped_index = 1 - index.item()  # If 0 → 1, if 1 → 0

#     # Print debugging info
#     print("Raw Output (logits):", output)
#     print("Softmax Probabilities:", probs)
#     print("Predicted Index:", index.item())
#     print("Confidence:", confidence)

#     # If confidence is too low, return "I don't know"
#     if confidence < threshold:
#         return "I don't know"

#     # Convert index to sentiment label
#     sentiment_label = "positive" if flipped_index == 1 else "negative"

#     return sentiment_label

In [249]:
predict(model, "moview was good", vocab)

confidence:  0.5656965374946594


'positive'

In [250]:
review = """While films like Parmanu, Padmavati, Padman, Sonu Ke Titu Ki Sweety , 102 Not Out, Hichki or even Blackmail were treat to eyes in the first half of 2018 giving hope that Bollywood movies have matured but then there comes a crap like Race 3 that will spoil your mood and you will wonder that if the makers are taking the audience for granted.

Race 3 tells the story of a dispute between a family when Shamsher Singh (Anil Kapoor) decides to give half of his wealth to the adopted son, Sikander Singh (Salman Khan) and the rest of it between his own kids, Suraj (Saqeeb Saleem) and Sanjana (Daisy Shah)."""

In [251]:
predict(model, review, vocab)

confidence:  0.5106473565101624


'positive'

In [252]:
review = """
I don't know what kind of mental conditions these people are suffering from, who are rating this movie 10/10. Why couldn't they just make it simple why this whole addition of crap. Just another crappy amalgamation of the movies which had a better script. 
I just don't think Salman will make any sensible movies in which he just acts good and doesn't just say mindless dialogues.
"""

In [253]:
predict(model, review, vocab)

confidence:  0.5202184319496155


'negative'