In [8]:
import pandas as pd
import numpy as np

In [11]:
url = "https://raw.githubusercontent.com/aspisov/paperweights/main/CNNClassification/IMDB_Dataset.csv"
df = pd.read_csv(url)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Data Cleaning

In [62]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup

nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


def preprocess_text(text):
    # remove HTML tags
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()

    # tokenize text
    tokens = word_tokenize(text)

    # removal of stopwords and punctuation and stemming
    tokens = [
        stemmer.stem(word)
        for word in tokens
        if word.isalnum() and word not in stop_words
    ]

    return " ".join(tokens).lower()


df["review"] = df["review"].apply(preprocess_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
from collections import Counter

token_counts = Counter()
for tokens in df["review"]:
    token_counts.update(tokens.split())

vocab = [t for t, c in token_counts.items() if c > 10]
vocab = ["<UNK>"] + ["<PAD>"] + vocab

token_to_idx = {token: idx for idx, token in enumerate(vocab)}
len(vocab)

18389

In [64]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

def as_matrix(sequences):
    sequences = [torch.tensor([token_to_idx.get(word, 0) for word in seq.split()]) for seq in sequences]
    return pad_sequence(sequences, batch_first=True, padding_value=1)  # 1 is the index for <PAD>

class ReviewsDataset(Dataset):
    def __init__(self, data):
        super().__init__()
        self.y = torch.tensor(data['sentiment'].map({'positive': 1, 'negative': 0}).values)
        self.X = as_matrix(data['review'])

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# split into train/test and create datasets and dataloaders
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = ReviewsDataset(train_data)
test_dataset = ReviewsDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [65]:
class SentimentCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_filters, filter_sizes, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=1)

        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=fs)
            for fs in filter_sizes
        ])

        self.fc = nn.Linear(len(filter_sizes) * num_filters, 1)
        self.dropout = nn.Dropout(dropout)
        print(f"{self._count_parameters()/1e6:.2f}M parameters")

    def _count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def forward(self, text):
        # text shape: (batch_size, seq_len)
        embedded = self.embedding(text)  # (batch_size, seq_len, embed_dim)
        embedded = embedded.permute(0, 2, 1)  # (batch_size, embed_dim, seq_len)

        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]

        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [66]:
# hyperparameters
VOCAB_SIZE = len(vocab)
EMBED_DIM = 100
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
DROPOUT = 0.5

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("using: ", device)

model = SentimentCNN(
    VOCAB_SIZE, EMBED_DIM, NUM_FILTERS, FILTER_SIZES, DROPOUT
).to(device)

cuda
1.96M parameters


In [67]:
from tqdm.auto import tqdm

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())


# Training function
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0

    for X, y in tqdm(iterator):
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()

        predictions = model(X).squeeze(1)
        loss = criterion(predictions, y.float())

        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == y).float()
        acc = correct.sum() / len(correct)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


# Evaluation function
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0

    with torch.no_grad():
        for X, y in iterator:
            X, y = X.to(device), y.to(device)
            predictions = model(X).squeeze(1)
            loss = criterion(predictions, y.float())

            rounded_preds = torch.round(torch.sigmoid(predictions))
            correct = (rounded_preds == y).float()
            acc = correct.sum() / len(correct)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [68]:
# Training loop
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 01
	Train Loss: 0.532 | Train Acc: 72.63%
	 Test Loss: 0.352 |  Test Acc: 84.95%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 02
	Train Loss: 0.382 | Train Acc: 82.87%
	 Test Loss: 0.306 |  Test Acc: 87.12%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 03
	Train Loss: 0.316 | Train Acc: 86.29%
	 Test Loss: 0.288 |  Test Acc: 87.86%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 04
	Train Loss: 0.274 | Train Acc: 88.54%
	 Test Loss: 0.277 |  Test Acc: 88.37%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 05
	Train Loss: 0.231 | Train Acc: 90.56%
	 Test Loss: 0.282 |  Test Acc: 88.38%
