In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("./IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
df["review"][:5].values

array(["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to

# Data Cleaning

In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup

nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    # remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()

    # tokenize text
    tokens = word_tokenize(text)

    # remove stopwords and punctuation
    # stemming
    stemmer = PorterStemmer()
    tokens = [
        stemmer.stem(word) for word in tokens if word.isalnum() and word not in stop_words
    ]

    return " ".join(tokens)


df["review"] = df["review"].apply(preprocess_text)
df["review"][:5].values

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  soup = BeautifulSoup(text, "html.parser")


array(['one review mention watch 1 oz episod hook they right exactli happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid thi show pull punch regard drug sex violenc it hardcor classic use call oz nicknam given oswald maximum secur state penitentari it focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far would say main appeal show due fact goe show would dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around the first episod i ever saw struck nasti surreal i could say i readi i watch i develop tast oz got accustom high level graphic violenc not violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomf

In [6]:
from collections import Counter

token_counts = Counter()
for tokens in df["review"]:
    token_counts.update(tokens.split())

vocab = [t for t, c in token_counts.items() if c > 10]
vocab = ["<UNK>"] + ["<PAD>"] + vocab

token_to_idx = {token: idx for idx, token in enumerate(vocab)}

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

def as_matrix(sequences):
    sequences = [torch.tensor([token_to_idx.get(word, 0) for word in seq.split()]) for seq in sequences]
    return pad_sequence(sequences, batch_first=True, padding_value=1)  # 1 is the index for <PAD>

class ReviewsDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.y = torch.tensor(data['sentiment'].map({'positive': 1, 'negative': 0}).values)
        self.X = as_matrix(data['review'])

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create datasets and dataloaders
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = ReviewsDataset(train_data)
test_dataset = ReviewsDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [8]:
class SentimentCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=1)

        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(fs, embed_dim))
            for fs in filter_sizes
        ])

        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text).unsqueeze(1)

        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]

        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [9]:
# hyperparameters
VOCAB_SIZE = len(vocab)
EMBED_DIM = 100
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(device)

model = SentimentCNN(
    VOCAB_SIZE, EMBED_DIM, NUM_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT
).to(device)

cuda


In [10]:
from tqdm.auto import tqdm

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())


# Training function
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0

    for X, y in tqdm(iterator):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()

        predictions = model(X).squeeze(1)
        loss = criterion(predictions, y.float())

        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == y).float()
        acc = correct.sum() / len(correct)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


# Evaluation function
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0

    with torch.no_grad():
        for X, y in iterator:
            X, y = X.to(device), y.to(device)
            predictions = model(X).squeeze(1)
            loss = criterion(predictions, y.float())

            rounded_preds = torch.round(torch.sigmoid(predictions))
            correct = (rounded_preds == y).float()
            acc = correct.sum() / len(correct)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [11]:
# Training loop
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 01
	Train Loss: 0.525 | Train Acc: 73.00%
	 Test Loss: 0.358 |  Test Acc: 84.15%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 02
	Train Loss: 0.380 | Train Acc: 83.13%
	 Test Loss: 0.302 |  Test Acc: 86.79%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 03
	Train Loss: 0.314 | Train Acc: 86.23%
	 Test Loss: 0.284 |  Test Acc: 87.37%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 04
	Train Loss: 0.266 | Train Acc: 88.85%
	 Test Loss: 0.280 |  Test Acc: 88.08%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 05
	Train Loss: 0.228 | Train Acc: 90.63%
	 Test Loss: 0.303 |  Test Acc: 87.36%
