In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the IMDB reviews CSV (the path is relative to the notebook's working
# directory) and preview the first rows to inspect the review / sentiment columns.
movie_reviews = pd.read_csv('data/IMDB Dataset.csv')
movie_reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Report the dataset size and the distinct sentiment labels it contains.
classes = movie_reviews['sentiment'].unique()
print('Number of movie reviews: {}'.format(len(movie_reviews)))
print('Number of classes: {} with values: {}'.format(len(classes), classes))

Number of movie reviews: 50000
Number of classes: 2 with values: ['positive' 'negative']


In [4]:
# Work on the first 500 reviews only (presumably to keep the dense one-hot
# pipeline below tractable -- TODO confirm the intended sample size).
movie_reviews = movie_reviews.head(500)

In [5]:
# Tokenize every review into a list of word tokens using gensim's tokenizer.
# Iterating over the column directly avoids label-based `[i]` lookups, which
# only work while the frame still carries its default 0..n-1 index.
review_corpus = [list(gensim.utils.tokenize(review)) for review in movie_reviews["review"]]

In [6]:
# Vocabulary: the set of distinct tokens appearing anywhere in the corpus.
vocab = {token for review_tokens in review_corpus for token in review_tokens}

In [7]:
# Map each word to an integer id by enumerating `vocab` itself rather than a
# copy of it: a copied set is not guaranteed to iterate in the same order as the
# original, and the one-hot rows are later built from `list(vocab)`, so the ids
# must follow that same ordering.
word2idx = {word: idx for idx, word in enumerate(vocab)}

In [8]:
# Number of distinct tokens that received an id.
len(word2idx)

14153

In [9]:
# Vocabulary size V: used below as the width of the one-hot context windows and
# as the input/output dimension of the model.
VOCAB_SIZE = len(vocab)

In [10]:
# Convert the vocabulary to one-hot encodings.

# Reshape the words into a single column (one row per word), the 2-D layout
# handed to the encoder below. Note that `vocab` is rebound here from a set to
# an array.
vocab = np.array(list(vocab)).reshape(-1, 1)

# Dense float32 output: row i of `one_hot_vocab` is the one-hot vector of the
# i-th word in the column above.
encoder = OneHotEncoder(sparse_output=False, dtype=np.float32)
one_hot_vocab = encoder.fit_transform(vocab)

# Word2Vec CBOW

Predict the target word from its surrounding context, i.e. model the conditional probability $p(x_t \mid x_{t-2}, x_{t-1}, x_{t+1}, x_{t+2})$.

In [None]:
def convert_to_one_hot_encoded_windows(sentences, word2idx, one_hot_vocab, windows_size=2):
    """Build CBOW training pairs (one-hot context window, one-hot target word).

    For every position in a sentence that has ``windows_size`` words on both
    sides, the surrounding words form the context and the centre word is the
    target. Sentences shorter than ``2 * windows_size + 1`` yield no pairs.

    Note: every context is a dense array, so memory grows with
    ``number_of_windows * 2 * windows_size * vocabulary_width``.

    Args:
        sentences: iterable of tokenized sentences (sequences of words).
        word2idx: mapping from a word to its row in ``one_hot_vocab``.
        one_hot_vocab: 2-D array whose rows are the one-hot vectors.
        windows_size: number of context words taken on each side of the target.

    Returns:
        ``(window_context, window_target)``: two parallel lists. Each context is
        a float32 array of shape ``(2 * windows_size, one_hot_vocab.shape[1])``
        holding the preceding words first, then the following words; each
        target is the matching row of ``one_hot_vocab``.

    Raises:
        ValueError: if ``windows_size`` is negative.
        KeyError: if a word inside a window is missing from ``word2idx``.
    """
    if windows_size < 0:
        raise ValueError("windows_size must be non-negative")

    one_hot_vocab = np.asarray(one_hot_vocab)
    window_context = []
    window_target = []

    for sentence in sentences:
        # No position has a full window on both sides: nothing to extract.
        if len(sentence) < 2 * windows_size + 1:
            continue

        # Look every word up once instead of once per window it appears in.
        rows = [word2idx[word] for word in sentence]

        for i in range(windows_size, len(rows) - windows_size):
            neighbours = rows[i - windows_size:i] + rows[i + 1:i + windows_size + 1]

            # Integer-array indexing copies the selected rows, so each context
            # owns its data, and its width comes from the matrix that was passed
            # in rather than from a notebook-level global.
            context = one_hot_vocab[np.asarray(neighbours, dtype=np.intp)]

            window_context.append(context.astype(np.float32, copy=False))
            window_target.append(one_hot_vocab[rows[i]])

    return window_context, window_target

In [12]:
# Build the CBOW training pairs with two context words on each side of the target.
review_context, review_target = convert_to_one_hot_encoded_windows(
    review_corpus,
    word2idx,
    one_hot_vocab,
    windows_size=2,
)

In [13]:
class ReviewDataset(Dataset):
    """Map-style dataset pairing each context window with its target word."""

    def __init__(self, review_context, review_target):
        """Keep references to the parallel sequences of contexts and targets."""
        super().__init__()
        self.context, self.targets = review_context, review_target

    def __len__(self):
        """Return the number of (context, target) samples."""
        return len(self.context)

    def __getitem__(self, index):
        """Return ``(context, target)`` for ``index``; the context is a fresh NumPy array."""
        window = np.array(self.context[index])
        target = self.targets[index]
        return window, target

In [14]:
# Wrap the windows in a dataset and split it 80% / 20% into train and test parts.
# The seeded generator makes the split identical on every run; without it the
# train/test membership changes each time this cell executes.
review_dataset = ReviewDataset(review_context, review_target)
train_dataset, test_dataset = random_split(
    review_dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(42),
)

In [15]:
# Mini-batches of 32 samples: shuffling is enabled for the training loader and
# disabled for the test loader.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Each embedding is a vector of size $d \times 1$, and the projection matrix $U$ has size $V \times d$, where $V$ is the vocabulary size.

In [None]:
class CBOW(torch.nn.Module):
    """Continuous bag-of-words model: summed context embeddings -> vocabulary scores."""

    def __init__(self, vocab_size, d_size):
        """Create the input projection ``W`` (vocab -> d) and the output projection ``U`` (d -> vocab)."""
        super().__init__()

        # A Linear layer applied to one-hot rows plays the role of an embedding
        # table; torch.nn.Embedding could serve a similar purpose by looking
        # embeddings up from integer indices instead.
        self.W = torch.nn.Linear(in_features=vocab_size, out_features=d_size)
        self.U = torch.nn.Linear(in_features=d_size, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised scores over the vocabulary for each context window.

        In this notebook ``x`` holds batches of one-hot context windows; the
        projected vectors are summed over dimension 1 before the output layer.
        """
        embedded = self.W(x)           # project the one-hot vectors to their embeddings
        pooled = embedded.sum(dim=1)   # sum the embeddings of the context words

        # No softmax is applied here: the raw scores in vocabulary space are returned.
        return self.U(pooled)


In [17]:
d_size = 100  # embedding dimension

# Prefer Apple's Metal backend when it is available, otherwise fall back to CUDA
# and finally the CPU, so the notebook also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [18]:
# Instantiate the CBOW network and place its parameters on the selected device.
model = CBOW(VOCAB_SIZE, d_size).to(device)

In [None]:
# Training hyper-parameters.
# NOTE(review): the saved training output lists more than five epoch losses, so
# it appears to come from a run with a different `epochs` value -- re-run to refresh.
epochs = 5  # number of full passes over the training loader
lr = 1e-3  # learning rate handed to the optimizer

In [20]:
# Cross-entropy objective, optimised with Adam over all model parameters.
criterion_loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
# Train for `epochs` passes, printing the mean mini-batch loss after each pass.
for epoch in range(epochs):
    total_loss = 0
    for idx, batch in enumerate(train_dataloader):

        # Progress marker every 1000 mini-batches.
        if not idx % 1000:
            print(f"{idx}/{len(train_dataloader)}")

        # Move the context windows and their targets to the training device.
        x, y = (tensor.to(device) for tensor in batch)

        # Forward pass and loss for this mini-batch.
        pred = model(x)
        loss = criterion_loss(pred, y)

        # Clear stale gradients, back-propagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(total_loss / len(train_dataloader))

0/3048
1000/3048
2000/3048
3000/3048
5.917309471784927
0/3048
1000/3048
2000/3048
3000/3048
4.20934189711343
0/3048
1000/3048
2000/3048
3000/3048
3.296223952075628
0/3048
1000/3048
2000/3048
3000/3048
2.590538275719002
0/3048
1000/3048
2000/3048
3000/3048
2.042055295286529
0/3048
1000/3048
2000/3048
3000/3048
1.6208985336809334
0/3048
1000/3048
2000/3048
3000/3048
1.3041989051411784
0/3048
1000/3048
2000/3048
3000/3048
1.0773685624202092
0/3048
1000/3048
2000/3048
3000/3048
0.9143747106644269
0/3048
1000/3048
2000/3048
3000/3048
0.798372329200503
0/3048
1000/3048
2000/3048
3000/3048
0.7127592436323954
0/3048
1000/3048
2000/3048
3000/3048
0.6437339831700944
0/3048
1000/3048
2000/3048
3000/3048
0.589933202100864
0/3048
1000/3048
