# Convolutional Neural Networks for Sentence Classification

# Reviews Dataset

In [1]:
import pathlib
import random
import itertools
import torch
from typing import Any


MOVIE_REVIEW_DIR = pathlib.Path('data/movie_reviews/txt_sentoken/')


def pick_device() -> torch.device:
    if torch.backends.mps.is_available():
        print("Using mps backend")
        return torch.device("mps")
    if torch.cuda.is_available():
        print("Using cuda backend")
        print(f'There are {torch.cuda.device_count()} GPU(s) available.')
        return torch.device("cuda")
    print('No GPU available, using the CPU.')
    return torch.device("cpu")

device = pick_device()  # torch.device("cpu")

def load_reviews(path: pathlib.Path, glob: str) -> list[str]:
    """Build movie review training set from the dataset directory"""
    x = []
    for filename in path.glob(glob):
        with filename.open() as f:
            x.extend(f.readlines())
    return x

def tokenize(sentence: str) -> list[str]:
    """Tokenize the sentence into words."""
    return sentence.lower().split(' ')


MAX_SENTENCE_LEN = 60  # Assumes only first N elements are allowed
PAD = '<pad>'
PAD_IDX = 0
UNK = '<unk>'
UNK_IDX = 1
START_IDX = 2

def build_vocab(contents: list[str]) -> dict[str, int]:
    """Build an index of words to token number."""
    vocab: dict[str, int] = {}
    vocab[PAD] = PAD_IDX
    vocab[UNK] = UNK_IDX
    index = START_IDX
    max_words = 0
    for line in contents:
        tokens = tokenize(line)
        max_words = max(len(tokens), max_words)
        for word in itertools.islice(tokens, MAX_SENTENCE_LEN):
            if word not in vocab:
                vocab[word] = index
                index += 1
    return vocab, max_words

POS = load_reviews(MOVIE_REVIEW_DIR, "rt-polarity.pos")
NEG = load_reviews(MOVIE_REVIEW_DIR, "rt-polarity.neg")
#POS = load_reviews(MOVIE_REVIEW_DIR / "pos", glob="*.txt")
#NEG = load_reviews(MOVIE_REVIEW_DIR / "neg", glob="*.txt")

vocab, max_words = build_vocab(POS + NEG)
vocab_inv = { v: k for k, v in vocab.items()}

# From the paper: The input is a concatenated sentence (length n) where each word is
# represented as a k-dimensional word vector. X[i, i+j] = concat(Xi, Xi+1, ..., Xi+j)
# The sentence is padded where necessary.

def encode_sentence(sentence: str) -> list[int]:
    """Encode a sentence as a series of token indexes."""
    encoded = [ vocab.get(word.lower(), UNK_IDX) for word in itertools.islice(tokenize(sentence), MAX_SENTENCE_LEN) ]
    if len(encoded) < MAX_SENTENCE_LEN:
        encoded.extend([PAD_IDX] * (MAX_SENTENCE_LEN - len(encoded)))
    return encoded


encoded_reviews: list[list[int]] = []
labels: list[tuple[float]] = []
for review in POS:
    encoded_reviews.append(encode_sentence(review))
    labels.append((0.0, 1.0))
for review in NEG:
    encoded_reviews.append(encode_sentence(review))
    labels.append((1.0, 0.0))
review_inputs = torch.tensor(encoded_reviews)
review_labels = torch.tensor(labels)

print(f"Vocab size: {len(vocab)}, {max_words}")
print(f"Num reviews: {len(review_inputs)} / {len(review_labels)}")
print(f"Vocab sample: {list(vocab.items())[0:15]}")
print(f"Vocab inv sample: {list(vocab_inv.items())[0:15]}")
print(review_inputs[0])
print(review_labels[0])
print(review_inputs[-1])
print(review_labels[-1])

Using cuda backend
There are 1 GPU(s) available.
Vocab size: 21619, 61
Num reviews: 10662 / 10662
Vocab sample: [('<pad>', 0), ('<unk>', 1), ('the', 2), ('rock', 3), ('is', 4), ('destined', 5), ('to', 6), ('be', 7), ('21st', 8), ("century's", 9), ('new', 10), ('"', 11), ('conan', 12), ('and', 13), ('that', 14)]
Vocab inv sample: [(0, '<pad>'), (1, '<unk>'), (2, 'the'), (3, 'rock'), (4, 'is'), (5, 'destined'), (6, 'to'), (7, 'be'), (8, '21st'), (9, "century's"), (10, 'new'), (11, '"'), (12, 'conan'), (13, 'and'), (14, 'that')]
tensor([ 2,  3,  4,  5,  6,  7,  2,  8,  9, 10, 11, 12, 11, 13, 14, 15, 16,  6,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0])
tensor([0., 1.])
tensor([12681,     4,  1473,    25,    58,   132,   653,   700,  2838,    13,
          700,  9113,    32,    33,     0,     0,     0,     0,     0,     0,
            0,     0, 

# word2vec

In [5]:
import numpy
import torch
from tqdm import tqdm
from gensim.models import KeyedVectors
from gensim import models

WORD2VEC = 'data/GoogleNews-vectors-negative300.bin'
MAX_WORDLEN = 50
WEIGHT_SIZE = 4
EMBED_DIM = 300  # Embedding size


def load_word2vec(vocab: dict[str, int]) -> torch.tensor:
    # Could be improved matching distribution of word2vec
    vocab_vec = numpy.random.uniform(-0.25, 0.25, (len(vocab), EMBED_DIM))
    vocab_vec[PAD_IDX] = numpy.zeros(EMBED_DIM)
    found: set[int] = set()
    with open(WORD2VEC, 'rb') as fd:
        line = fd.readline()
        parts = line.decode('utf-8').split(' ')
        words = int(parts[0])
        word_size = int(parts[1])
        if word_size != EMBED_DIM:
            raise ValueError(f"Unexpected word size {word_size} != {EMBED_DIM}")
        for i in tqdm(range(0, words)):
            # Read the next word
            s = b''
            while True:
                ch = fd.read(1)
                if ch == b' ':
                    break
                if ch == b'':
                    raise ValueError("Unexpected eof")
                if ch != b'\n':
                    s += ch
                if len(s) > word_size:
                    raise ValueError(f"Word was too long {s}")
            weights = fd.read(word_size * WEIGHT_SIZE)
            wd = numpy.frombuffer(weights, dtype=numpy.float32)
            word = s.decode('utf-8').strip().lower()
            # Only load words in the vocabulary
            if (idx := vocab.get(word)) is not None:
                if idx not in found:
                    vocab_vec[idx] = wd
                    found |= set({idx})

    return torch.tensor(vocab_vec)


w2v_model = models.KeyedVectors.load_word2vec_format(WORD2VEC, binary=True)


def encode_word2vec(vocab: dict[str, int]) -> torch.tensor:
    # Could be improved matching distribution of word2vec
    vocab_vec = numpy.random.uniform(-0.25, 0.25, (len(vocab), EMBED_DIM))
    vocab_vec[PAD_IDX] = numpy.zeros(EMBED_DIM)
    found = 0
    for word, idx in tqdm(vocab.items()):
        if word in w2v_model:
            vocab_vec[idx] = w2v_model[word]
            found += 1
    return torch.tensor(vocab_vec, dtype=torch.float64), found

vocab2vec, found = encode_word2vec(vocab)
print(len(vocab), len(vocab2vec), found)
print(vocab2vec.shape)

100%|██████████| 21619/21619 [00:00<00:00, 415947.97it/s]

21619 21619 15877
torch.Size([21619, 300])





In [10]:
import torch.nn as nn

print(vocab2vec[0][:10])
print(vocab2vec[1][:10])
print(vocab2vec[2][:10])
print()
print("vocab['car']")
print(vocab['car'])
print("vocab2vec['car']")
print(vocab['car'])
print(vocab2vec[6514][:10])
print("w2v_model['car']")
print(w2v_model['car'][:10])


tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64)
tensor([-0.1839,  0.1440,  0.0476, -0.0091,  0.1883, -0.1222,  0.0843, -0.0199,
        -0.1804,  0.2110], dtype=torch.float64)
tensor([ 0.0801,  0.1050,  0.0498,  0.0535, -0.0674, -0.1206,  0.0352, -0.1187,
         0.0439,  0.0302], dtype=torch.float64)

vocab['car']
6514
vocab2vec['car']
6514
tensor([ 0.1309,  0.0084,  0.0334, -0.0588,  0.0400, -0.1426,  0.0493, -0.1689,
         0.2090,  0.1196], dtype=torch.float64)
w2v_model['car']
[ 0.13085938  0.00842285  0.03344727 -0.05883789  0.04003906 -0.14257812
  0.04931641 -0.16894531  0.20898438  0.11962891]


In [11]:
import torch
from sklearn.model_selection import train_test_split

Xtr, Xdev, Ytr, Ydev = train_test_split(review_inputs, review_labels, test_size=0.1, random_state=31337)

print(Xtr.shape, Ytr.shape)
print(Xdev.shape, Ydev.shape)

torch.Size([9595, 60]) torch.Size([9595, 2])
torch.Size([1067, 60]) torch.Size([1067, 2])


# Model

In [17]:
import torch.nn as nn
import torch.nn.functional as F

FILTER_WINDOWS = (3, 4, 5)
FEATURE_MAPS = 200


class Model(nn.Module):
    """A model for training a CNN text classifier."""

    def __init__(self, embeddings: torch.tensor, rand_embed: bool, freeze_embedding: bool, num_classes: int, dropout: float = 0.5):
        """Initialize Model."""
        super().__init__()
        if not rand_embed:
            self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=freeze_embedding).float()
        else:
            self.embedding = nn.Embedding(
                num_embeddings=vocab2vec.shape[0],
                embedding_dim=vocab2vec.shape[1],
                padding_idx=0,
                max_norm=5.0
            )
        self.conv_list = nn.ModuleList([
            nn.Conv1d(
                in_channels=EMBED_DIM,
                out_channels=FEATURE_MAPS,
                kernel_size=filter_size,
            )
            for filter_size in FILTER_WINDOWS
        ])
        self.fc = nn.Linear(FEATURE_MAPS * len(FILTER_WINDOWS), num_classes)
        self.dropout = nn.Dropout(p=dropout)


    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass"""
        # Out: (BATCH_SZ, MAX_SENTENCE_LEN, WORD_SIZE)
        self.x_embed = self.embedding(x)
        # Reshape to match conv1d input
        # Out: (BATCH_SZ, WORD_SIZE, MAX_SENTENCE_LEN)
        self.x_reshape = self.x_embed.permute(0, 2, 1) 
        # (BATCH_SZ, FEATURE_MAPS, MAX_SENTENCE_LEN-2)
        # (BATCH_SZ, FEATURE_MAPS, MAX_SENTENCE_LEN-3)
        # (BATCH_SZ, FEATURE_MAPS, MAX_SENTENCE_LEN-4)
        self.x_conv_list = [F.relu(conv(self.x_reshape)) for conv in self.conv_list]
        # (BATCH_SZ, FEATURE_MAPS, 1)
        self.x_pool_list = [
            F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in self.x_conv_list
        ]
        # (BATCH_SZ, FEATURE_MAPS * 3)
        self.x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in self.x_pool_list], dim=1)
        # (BATCH_SZ, FEATURE_MAPS * 3)
        self.logits = self.fc(self.dropout(self.x_fc))
        self.logits_class = F.softmax(self.logits, dim=1)
        return self.logits_class

    def __str__(self) -> str:
        return self.__class__.__name__

# Training

In [19]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler)
from tqdm import tqdm
import math
import time

MINI_BATCH_SZ = 128

g = torch.Generator(device=device).manual_seed(31337)
g_cpu = torch.Generator().manual_seed(31337)


tr_data = TensorDataset(Xtr, Ytr)
tr_loader = DataLoader(tr_data, sampler=RandomSampler(tr_data, generator=g_cpu), batch_size=MINI_BATCH_SZ, pin_memory=True, pin_memory_device=device.type)

val_data = TensorDataset(Xdev, Ydev)
val_loader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=MINI_BATCH_SZ, pin_memory=True, pin_memory_device=device.type)

model = Model(embeddings=vocab2vec, rand_embed=False, freeze_embedding=False, num_classes=2, dropout=0.5)

lr = 0.25
opt = torch.optim.Adadelta(model.parameters(), lr=lr, rho=0.95)

model.to(device)
for epoch in range(20):
    model.train()
    lossi = []
    accuracyi = []
    t0_epoch = time.time()

    with torch.enable_grad():
        for step, batch in enumerate(tr_loader):
            Xb, Yb = tuple(t.to(device) for t in batch)
            model.zero_grad()
            logits = model.forward(Xb)
            loss = F.cross_entropy(logits, Yb)
            loss.backward()

            lossi.append(loss.item())
            accuracy = (torch.round(logits) == Yb).cpu().numpy().mean() * 100
            accuracyi.append(accuracy)

            opt.step()

    avg_train_loss = numpy.mean(lossi)
    iters = len(lossi)
    time_elapsed = time.time() - t0_epoch

    t0_epoch = time.time()

    model.eval()
    with torch.no_grad():
        lossi = []
        accuracyi = []
        for step, batch in enumerate(val_loader):
            Xb, Yb = tuple(t.to(device) for t in batch)
            logits = model.forward(Xb)
            loss = F.cross_entropy(logits, Yb)

            lossi.append(loss.item())
            accuracy = (torch.round(logits) == Yb).cpu().numpy().mean() * 100
            accuracyi.append(accuracy)

        avg_eval_loss = numpy.mean(lossi)
        avg_eval_accuracy = numpy.mean(accuracyi)

    eval_time_elapsed = time.time() - t0_epoch

    print(f"{epoch:^3} | {iters/time_elapsed:3.2f}i/s | {avg_train_loss:0.3f} | {avg_eval_loss:0.3f} | {avg_eval_accuracy:0.2f} | {time_elapsed:^7.2f} / {eval_time_elapsed:^7.2f}")

       

 0  | 35.49i/s | 0.687 | 0.680 | 64.88 |  2.11   /  0.05  
 1  | 39.67i/s | 0.664 | 0.655 | 65.92 |  1.89   /  0.05  
 2  | 40.21i/s | 0.626 | 0.604 | 73.29 |  1.87   /  0.05  
 3  | 40.31i/s | 0.587 | 0.563 | 76.23 |  1.86   /  0.05  
 4  | 40.29i/s | 0.556 | 0.541 | 77.71 |  1.86   /  0.05  
 5  | 40.44i/s | 0.534 | 0.528 | 78.40 |  1.85   /  0.05  
 6  | 39.31i/s | 0.519 | 0.522 | 78.92 |  1.91   /  0.05  
 7  | 40.24i/s | 0.505 | 0.513 | 79.27 |  1.86   /  0.05  
 8  | 40.18i/s | 0.493 | 0.509 | 79.79 |  1.87   /  0.05  
 9  | 40.34i/s | 0.480 | 0.506 | 80.31 |  1.86   /  0.05  
10  | 40.18i/s | 0.471 | 0.503 | 79.71 |  1.87   /  0.05  
11  | 40.33i/s | 0.461 | 0.502 | 79.88 |  1.86   /  0.05  
12  | 40.16i/s | 0.452 | 0.501 | 80.57 |  1.87   /  0.05  
13  | 40.39i/s | 0.444 | 0.501 | 80.05 |  1.86   /  0.05  
14  | 40.31i/s | 0.433 | 0.498 | 79.97 |  1.86   /  0.05  
15  | 40.16i/s | 0.427 | 0.498 | 80.31 |  1.87   /  0.05  
16  | 39.08i/s | 0.418 | 0.497 | 80.40 |  1.92   /  0.05

# Prediction

In [25]:

model.eval()

def predict(text: str) -> None:
    """Predict probability that a review is positive."""
    text = text.replace(",", " ,")
    text = text.replace(".", " .").lower()
    #print(text)
    encoded = torch.tensor(encode_sentence(text), device=device)
    input_data = encoded.unsqueeze(dim=0)

    # Compute logits
    with torch.no_grad():
        logits = model.forward(input_data)

    # Compute probability
    prob = logits.squeeze(dim=0)
    print(f"This review is {prob[1] * 100:.2f}% positive.")


predict("All of friends slept while watching this movie, but I really enjoyed it.")
predict("I have waited so long for this movie and I am now so satisfied and happy.")
predict("This is a great movie.")
predict("I was laughing the whole time.")
predict("Fantastic movie that I would watch again.")
print("---")
predict("This movie is long and boring.")
predict("I don't like the ending.")
predict("Do not bother watching this movie.")
predict("I hated this movie more than any other movie.")



This review is 69.52% positive.
This review is 73.19% positive.
This review is 98.76% positive.
This review is 9.61% positive.
This review is 54.91% positive.
---
This review is 0.01% positive.
This review is 9.06% positive.
This review is 0.42% positive.
This review is 4.02% positive.
