In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from collections import Counter
import gensim.downloader as api
import nltk
import re

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
stop_words = set(nltk.corpus.stopwords.words("english"))
lemmatizer = nltk.stem.WordNetLemmatizer()


def preprocess(text):
    text = text.lower()

    # remove HTML tags
    text = re.sub("<.*?>", "", text)

    # remove special characters and digits
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    text = nltk.tokenize.word_tokenize(text)

    # remove stopwords and lemmatize
    text = [
        lemmatizer.lemmatize(word) for word in text if word not in stop_words
    ]

    return " ".join(text)


url = "https://raw.githubusercontent.com/aspisov/paperweights/main/CNNClassification/IMDB_Dataset.csv"
df = pd.read_csv(url)

df["review"] = df["review"].apply(preprocess)

In [3]:
word_vectors = api.load("glove-wiki-gigaword-100")



In [4]:
def create_vocab(df, min_freq=10):
    word_freq = Counter()

    for sentence in df["review"]:
        word_freq.update(sentence.split())

    vocab = ["<UNK>", "<PAD>"] + [
        w
        for w, c in word_freq.items()
        if c >= min_freq and w in word_vectors.key_to_index
    ]
    word_to_idx = {w: i for i, w in enumerate(vocab)}

    return vocab, word_to_idx

vocab, word_to_idx = create_vocab(df)

In [5]:
EMBED_DIM = word_vectors.vector_size
embedding_matrix = np.zeros((len(vocab), EMBED_DIM))
for i, word in enumerate(vocab):
    if word in word_vectors.key_to_index:
        embedding_matrix[i] = word_vectors[word]
    else:
        embedding_matrix[i] = np.random.normal(size=EMBED_DIM)

In [13]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader


def as_matrix(sequence):
    sequence = [
        torch.tensor([word_to_idx.get(word, 0) for word in text.split()])
        for text in sequence
    ]
    return pad_sequence(sequence, batch_first=True, padding_value=1)


class ReviewsDataset(Dataset):
    def __init__(self, df):
        super().__init__()
        self.y = torch.tensor(
            df["sentiment"].map({"positive": 1, "negative": 0}).values
        )
        self.X = as_matrix(df["review"])

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset, test_dataset = ReviewsDataset(train_df), ReviewsDataset(test_df)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)

In [24]:
class SentimentCNN(nn.Module):
    def __init__(
        self, vocab_size, embed_dim, num_filters, filter_sizes, embedding_matrix
    ):
        super().__init__()

        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix), freeze=False, padding_idx=1
        )

        self.convs = nn.ModuleList(
            [
                nn.Conv1d(
                    in_channels=embed_dim,
                    out_channels=num_filters,
                    kernel_size=fs,
                )
                for fs in filter_sizes
            ]
        )

        self.fc = nn.Linear(len(filter_sizes) * num_filters, 1)
        self.dropout = nn.Dropout(0.1)
        print(f"{self._count_parameters()/1e6:.2f}M parameters")

    def _count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def forward(self, input):
        # input shape: (batch_size, seq_len)
        embedded = self.embedding(input)  # (batch_size, seq_len, embed_dim)
        embedded = embedded.permute(0, 2, 1)  # (batch_size, embed_dim, seq_len)

        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [
            F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved
        ]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [25]:
# hyperparameters
VOCAB_SIZE = len(vocab)
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("using: ", device)

model = SentimentCNN(
    VOCAB_SIZE, EMBED_DIM, NUM_FILTERS, FILTER_SIZES, embedding_matrix
).to(device)

using:  cuda
2.54M parameters


In [26]:
from tqdm.auto import tqdm

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters())


# Training function
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0

    for batch in tqdm(iterator):
        X, y = batch[0].to(device), batch[1].to(device)
        optimizer.zero_grad()

        predictions = model(X).squeeze(1)
        loss = criterion(predictions, y.float())

        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == y).float()
        acc = correct.sum() / len(correct)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Evaluation function
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0

    with torch.no_grad():
        for batch in iterator:
            X, y = batch[0].to(device), batch[1].to(device)
            predictions = model(X).squeeze(1)
            loss = criterion(predictions, y.float())

            rounded_preds = torch.round(torch.sigmoid(predictions))
            correct = (rounded_preds == y).float()
            acc = correct.sum() / len(correct)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 01
	Train Loss: 0.360 | Train Acc: 83.58%
	 Test Loss: 0.264 |  Test Acc: 88.71%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 02
	Train Loss: 0.210 | Train Acc: 91.76%
	 Test Loss: 0.253 |  Test Acc: 89.52%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 03
	Train Loss: 0.112 | Train Acc: 95.98%
	 Test Loss: 0.303 |  Test Acc: 89.10%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 04
	Train Loss: 0.048 | Train Acc: 98.52%
	 Test Loss: 0.365 |  Test Acc: 88.90%


  0%|          | 0/1250 [00:00<?, ?it/s]

Epoch: 05
	Train Loss: 0.017 | Train Acc: 99.62%
	 Test Loss: 0.427 |  Test Acc: 88.91%
