In [36]:
import torch
import torch.nn as nn
import torch.utils.data as data
from tqdm.notebook import tqdm

In [1]:
CORPUS_FILENAME = "text8"

In [15]:
with open(CORPUS_FILENAME, "r") as f:
    text = f.read()
# Split the text into words
words = text.split()
# Create a set of unique words
unique_words = set(words)
# Create a dictionary to map each unique word to an index
word_to_index = {word: index for index, word in enumerate(unique_words)}
# Create a list of words in the same order as they appear in the text
indexed_words = [word_to_index[word] for word in words]

In [4]:
len(words), len(unique_words)

(17005207, 253854)

In [None]:
def tokenize(text):
    words = text.split()
    return [word_to_index[word] for word in words if word in word_to_index]

## CBOW

In [10]:
EMBEDDING_DIM = 100

In [None]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # TODO Do we have to do max norm here?
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, max_norm=1.0)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        x = self.embeddings(inputs)
        x = x.mean(dim=1)
        x = self.linear(x)
        return x

In [11]:
cbow = CBOW(len(unique_words), EMBEDDING_DIM)
print(cbow)

CBOW(
  (embeddings): Embedding(253854, 100)
  (linear): Linear(in_features=100, out_features=253854, bias=True)
)


In [23]:
class SlidingTextDataset(data.Dataset):
    def __init__(self, indexed_words, window_size=5):
        super().__init__()
        self.indexed_words = indexed_words
        assert window_size % 2 == 1, "Window size must be odd"
        self.window_size = window_size
        self.size = len(indexed_words) - window_size + 1

    def __len__(self):
        return self.size
    
    def __getitem__(self, idx):
        return self.indexed_words[idx, :idx + self.window_size // 2] + self.indexed_words[idx, self.window_size // 2 + 1:], self.indexed_words[idx, self.window_size // 2]

In [24]:
train_dataset = SlidingTextDataset(indexed_words, window_size=5)
train_data_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=True)

In [25]:
# TODO Which loss to use?
loss_module = nn.BCEWithLogitsLoss()

In [26]:
# TODO Clarify which optimizer to use
optimizer = torch.optim.SGD(cbow.parameters(), lr=0.1)

In [27]:
device = torch.device("cpu")

In [None]:
def train_model(model, optimizer, data_loader, loss_module, num_epochs=10):
    model.train()
    for epoch in tqdm(range(num_epochs)):
        for data_inputs, data_labels in data_loader:
            # Step 1: Move input data to device
            data_inputs = data_inputs.to(device)
            data_labels = data_labels.to(device)

            # Step 2: Run the model on the input data
            preds = model(data_inputs)
            preds = preds.squeeze(dim=1)  # output is [batch size, 1], but we need [batch size]

            # Step 3: Calculate the loss
            loss = loss_module(preds, data_labels.float())

            # Step 4: Perform backpropagation
            optimizer.zero_grad()
            loss.backward()

            # Step 5: Update the parameters
            optimizer.step()

In [37]:
train_model(cbow, optimizer, train_data_loader, loss_module)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10


TypeError: list indices must be integers or slices, not tuple