In [60]:
!pip install --quiet huggingface_hub

In [63]:
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm.notebook import tqdm
from collections import Counter
import huggingface_hub
import os

In [64]:
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
corpus_file_path = huggingface_hub.hf_hub_download(
    repo_id="ardMLX/text8",
    filename="text8",
    repo_type="dataset",
    local_dir="."
)
print(f"File downloaded to: {corpus_file_path}")

text8:   0%|          | 0.00/100M [00:00<?, ?B/s]

In [43]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Device:", device)

Device: mps


In [1]:
CORPUS_FILENAME = "text8"

In [15]:
with open(CORPUS_FILENAME, "r") as f:
    text = f.read()
# Split the text into words
words = text.split()
# Create a set of unique words
unique_words = set(words)
# Create a dictionary to map each unique word to an index
word_to_index = {word: index for index, word in enumerate(unique_words)}
# Create a list of words in the same order as they appear in the text
indexed_words = [word_to_index[word] for word in words]

In [4]:
len(words), len(unique_words)

(17005207, 253854)

In [None]:
def tokenize(text):
    words = text.split()
    return [word_to_index[word] for word in words if word in word_to_index]

In [39]:
def tokenize(text):
    return text.lower().split()

def build_vocab(tokens, min_freq=1):
    counter = Counter(tokens)
    vocab = {word: idx for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    word2idx = {word: i for i, word in enumerate(vocab)}
    idx2word = {i: word for word, i in word2idx.items()}
    return word2idx, idx2word

## CBOW

In [10]:
EMBEDDING_DIM = 100

In [None]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # TODO Do we have to do max norm here?
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, max_norm=1.0)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        x = self.embeddings(inputs)
        x = x.mean(dim=1)
        x = self.linear(x)
        return x

In [11]:
cbow = CBOW(len(unique_words), EMBEDDING_DIM)
print(cbow)

CBOW(
  (embeddings): Embedding(253854, 100)
  (linear): Linear(in_features=100, out_features=253854, bias=True)
)


In [23]:
class SlidingTextDataset(data.Dataset):
    def __init__(self, indexed_words, window_size=5):
        super().__init__()
        self.indexed_words = indexed_words
        assert window_size % 2 == 1, "Window size must be odd"
        self.window_size = window_size
        self.size = len(indexed_words) - window_size + 1

    def __len__(self):
        return self.size
    
    def __getitem__(self, idx):
        return self.indexed_words[idx, :idx + self.window_size // 2] + self.indexed_words[idx, self.window_size // 2 + 1:], self.indexed_words[idx, self.window_size // 2]

In [24]:
train_dataset = SlidingTextDataset(indexed_words, window_size=5)
train_data_loader = data.DataLoader(train_dataset, batch_size=128, shuffle=True)

In [25]:
# TODO Which loss to use?
loss_module = nn.BCEWithLogitsLoss()

In [26]:
# TODO Clarify which optimizer to use
optimizer = torch.optim.SGD(cbow.parameters(), lr=0.1)

In [27]:
device = torch.device("cpu")

In [None]:
def train_model(model, optimizer, data_loader, loss_module, num_epochs=10):
    model.train()
    for epoch in tqdm(range(num_epochs)):
        for data_inputs, data_labels in data_loader:
            # Step 1: Move input data to device
            data_inputs = data_inputs.to(device)
            data_labels = data_labels.to(device)

            # Step 2: Run the model on the input data
            preds = model(data_inputs)
            preds = preds.squeeze(dim=1)  # output is [batch size, 1], but we need [batch size]

            # Step 3: Calculate the loss
            loss = loss_module(preds, data_labels.float())

            # Step 4: Perform backpropagation
            optimizer.zero_grad()
            loss.backward()

            # Step 5: Update the parameters
            optimizer.step()

In [37]:
train_model(cbow, optimizer, train_data_loader, loss_module)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10


TypeError: list indices must be integers or slices, not tuple

In [40]:
class CBOWDataset(Dataset):
    def __init__(self, tokens, word2idx, context_size=2):
        self.data = []
        self.word2idx = word2idx
        self.context_size = context_size
        self.vocab_size = len(word2idx)

        indices = [word2idx[word] for word in tokens if word in word2idx]
        for i in range(context_size, len(indices) - context_size):
            context = (
                indices[i - context_size:i] + indices[i + 1:i + context_size + 1]
            )
            target = indices[i]
            self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        return torch.tensor(context, dtype=torch.long), torch.tensor(target, dtype=torch.long)

In [57]:
def train_model(model, dataloader, num_epochs, lr, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        total_loss = 0
        for i, (context, target) in enumerate(dataloader):
            context, target = context.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(context)
            loss = criterion(output, target)
            total_loss += loss.item()
            if i % 1000 == 0:
                print(f"Epoch {epoch+1}, Step {i}, Loss: {loss.item():.4f}")
            loss.backward()
            optimizer.step()

In [45]:
# Load and process text
with open(CORPUS_FILENAME, "r") as f:
    tokens = tokenize(f.read())

# Build vocab
word2idx, idx2word = build_vocab(tokens, min_freq=1)

# Create dataset and dataloader
context_size = 2
dataset = CBOWDataset(tokens, word2idx, context_size=context_size)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [50]:
len(dataset) / 32

531412.59375

In [51]:
len(dataloader)

531413

In [58]:
# Create model and train
vocab_size = len(word2idx)
embedding_dim = 100
model = CBOW(vocab_size, embedding_dim)

train_model(model, dataloader, num_epochs=1, lr=0.001, device=device)

Epoch 1, Step 0, Loss: 12.4114
Epoch 1, Step 1000, Loss: 8.9346
Epoch 1, Step 2000, Loss: 8.9799
Epoch 1, Step 3000, Loss: 7.9113
Epoch 1, Step 4000, Loss: 8.0418
Epoch 1, Step 5000, Loss: 7.0659


KeyboardInterrupt: 