# Convolutional Neural Networks for Sentence Classification

# Reviews Dataset

In [5]:
import pathlib
import random
import itertools
import torch


MOVIE_REVIEW_DIR = pathlib.Path('data/movie_reviews/txt_sentoken/')


def load_reviews(path: pathlib.Path) -> list[str]:
    """Build movie review training set from the dataset directory"""
    x = []
    for filename in path.glob("*.txt"):
        with filename.open() as f:
            x.extend(f.readlines())
    return x


def tokenize(sentence: str) -> list[str]:
    """Tokenize the sentence into words."""
    return sentence.lower().split(' ')


MAX_SENTENCE_LEN = 30  # Assumes only first N elements are allowed


def build_vocab(contents: list[str]) -> dict[str, int]:
    """Build an index of words to token number."""
    vocab: dict[str, int] = {}
    vocab['</s>'] = 0
    index = 1
    for line in load_reviews(MOVIE_REVIEW_DIR / "pos") + load_reviews(MOVIE_REVIEW_DIR / "neg"):
        tokens = tokenize(line)
        for word in itertools.islice(tokens, MAX_SENTENCE_LEN):
            if word not in vocab:
                vocab[word] = index
                index += 1
    return vocab


vocab = build_vocab(load_reviews(MOVIE_REVIEW_DIR / "pos") + load_reviews(MOVIE_REVIEW_DIR / "neg"))
vocab_inv = { v: k for k, v in vocab.items()}


def encode_sentence(sentence: str) -> torch.tensor:
    """Encode a sentence as a series of token indexes."""
    encoded = [ vocab[word.lower()] for word in itertools.islice(tokenize(sentence), MAX_SENTENCE_LEN) ]
    if len(encoded) < MAX_SENTENCE_LEN:
        encoded.extend([0] * (MAX_SENTENCE_LEN - len(encoded)))
    return torch.tensor(encoded)


reviews: list[tuple[str, float]] = []
for review in load_reviews(MOVIE_REVIEW_DIR / "pos"):
    reviews.append((encode_sentence(review), 1.0))
for review in load_reviews(MOVIE_REVIEW_DIR / "neg"):
    reviews.append((encode_sentence(review), 0.0))

print(f"Vocab size: {len(vocab)}")
print(f"Num reviews: {len(reviews)}")

Vocab size: 47431
Num reviews: 64720


# word2vec

In [26]:
import numpy
import torch
from tqdm import tqdm


WORD2VEC = 'data/GoogleNews-vectors-negative300.bin'
MAX_WORDLEN = 50
WEIGHT_SIZE = 4
WORD_SIZE = 300


def load_word2vec(vocab: dict[str, int]) -> torch.tensor:
    vocab_vec = numpy.zeros((len(vocab), WORD_SIZE))
    #vocab_vec[0] = torch.rand(WORD_SIZE) - 0.5
    with open(WORD2VEC, 'rb') as fd:
        line = fd.readline()
        parts = line.decode('utf-8').split(' ')
        words = int(parts[0])
        word_size = int(parts[1])
        if word_size != WORD_SIZE:
            raise ValueError(f"Unexpected word size {word_size} != {WORD_SIZE}")
        for i in tqdm(range(0, words)):
            # Read the next word
            s = b''
            while True:
                ch = fd.read(1)
                if ch == b' ':
                    break
                if ch == b'':
                    raise ValueError("Unexpected eof")
                if ch != b'\n':
                    s += ch
                if len(s) > word_size:
                    raise ValueError(f"Word was too long {s}")
            weights = fd.read(word_size * WEIGHT_SIZE)
            wd = numpy.frombuffer(weights, dtype=numpy.float32)
            word = s.decode('utf-8').strip().lower()
            # Only load words in the vocabulary
            if (idx := vocab.get(word)) is not None:
                vocab_vec[idx] = wd
    return torch.tensor(vocab_vec, dtype=float)


vocab2vec = load_word2vec(vocab)
len(vocab), len(vocab2vec)

100%|██████████| 3000000/3000000 [00:07<00:00, 379677.05it/s]


(47431, 47431)

In [27]:
def word2vec(word: str) -> torch.Tensor:
    """Lookup the word using the word2vec encoding or init to random."""
    if word not in vocab:
        word = '</s>'
    idx = vocab[word]
    return vocab2vec[idx]


# Sample from word2vec
print("Example:")
print(min(word2vec('car')))
print(max(word2vec('car')))

print("Sentence example:")
print(tokenize("My car is a bus"))
print(encode_sentence("My car is a bus"))

Example:
tensor(-0.2305, dtype=torch.float64)
tensor(0.2539, dtype=torch.float64)
Sentence example:
['my', 'car', 'is', 'a', 'bus']
tensor([ 297,  561,    7,   40, 6821,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0])


In [28]:
import torch

# From the paper: The input is a concatenated sentence (length n) where each word is
# represented as a k-dimensional word vector. X[i, i+j] = concat(Xi, Xi+1, ..., Xi+j)
# The sentence is padded where necessary.


def build_dataset(dataset: list[tuple[str, float]]) -> tuple[torch.Tensor, torch.Tensor]:
    x, y = [], []
    for (data, category) in dataset:
        x.append(data)
        y.append(category)
    X = torch.stack(x)
    Y = torch.tensor(y)
    return X, Y


random.seed(31337)
random.shuffle(reviews)
n1 = int(0.8*len(reviews))
n2 = int(0.9*len(reviews))

Xtr, Ytr = build_dataset(reviews[:n1])
Xdev, Ydev = build_dataset(reviews[n1:n2])
Xte, Yte = build_dataset(reviews[n2:])

Xtr.shape, Ytr.shape

(torch.Size([51776, 30]), torch.Size([51776]))

In [29]:
r1 = reviews[5][0]
r2 = reviews[5][0]
print((r1.shape, r2.shape))
torch.stack([r1, r2]).shape

(torch.Size([30]), torch.Size([30]))


torch.Size([2, 30])

# Model

In [30]:
import torch.nn as nn
import torch.nn.functional as F

FILTER_WINDOWS = (3, 4, 5)
FEATURE_MAPS = 100


class Model(nn.Module):
    """A model for training a CNN text classifier."""

    def __init__(self, embeddings: torch.tensor):
        """Initialize Model."""
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=True)
        self.conv = nn.ModuleList([
            nn.Conv1D(
                in_channels=embeddings.shape,
                out_channels=FEATURE_MAPS,
                kernel_size=filter_size,
            )
            for filter_size in FILTER_WINDOWS
        ])
        self.pool = nn.MaxPool2d(2, 2)
        

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass"""
        x = self.embedding(x)
        x = self.conv(x)
        x = self.pool(x)
        x = F.relu(x)
        return x

    def parameters(self):
        return self.layers.parameters()

    def __str__(self) -> str:
        return self.__class__.__name__

# Training

In [41]:
MINI_BATCH_SZ = 50

g = torch.Generator().manual_seed(31337)



ix = torch.randint(0, Xtr.shape[0], (MINI_BATCH_SZ,), generator=g) # (MINI_BATCH_SZ)
print(ix)
Xb = Xtr[ix]
Yb = Ytr[ix]



tensor([36087, 50847,  1494, 38461, 23165,  9767, 34169, 41167, 38412, 36431,
        13991, 11532, 10202, 22085,  8680, 49684, 16303,  5096, 50920, 48551,
        17435, 37149, 28246, 34080, 34941,  5518, 18130, 43799, 41903,  4658,
        37791, 13978, 11822, 39560, 41644, 24770, 35510, 12431, 12972, 21601,
        35592, 24727,  5633, 30474, 23279, 26757, 30857, 49843, 45212, 12928])


torch.Size([30, 300])

In [None]:
Xtr.shape, Ytr.shape