In [1]:
!pip install --quiet huggingface_hub

In [2]:
import torch
import torch.nn as nn
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm.notebook import tqdm
from collections import Counter
import huggingface_hub
import os

In [None]:
# Login is optional for public models
# huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
corpus_file_path = huggingface_hub.hf_hub_download(
    repo_id="ardMLX/text8",
    filename="text8",
    repo_type="dataset",
    local_dir="."
)
print(f"File downloaded to: {corpus_file_path}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


text8:   0%|          | 0.00/100M [00:00<?, ?B/s]

File downloaded to: text8


In [4]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Device:", device)

Device: cuda


In [5]:
def tokenize(text):
    return text.lower().split()

# TODO Use torchtext.vocab.build_vocab_from_iterator
def build_vocab(tokens, min_freq=1):
    counter = Counter(tokens)
    vocab = {word: idx for idx, (word, freq) in enumerate(counter.items()) if freq >= min_freq}
    word2idx = {word: i for i, word in enumerate(vocab)}
    idx2word = {i: word for word, i in word2idx.items()}
    return word2idx, idx2word

## CBOW

In [6]:
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # TODO Do we have to do max norm here?
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, max_norm=1.0)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        x = self.embeddings(inputs)
        x = x.mean(dim=1)
        x = self.linear(x)
        return x

In [7]:
class CBOWDataset(Dataset):
    def __init__(self, tokens, word2idx, context_size=2):
        self.data = []
        self.word2idx = word2idx
        self.context_size = context_size
        self.vocab_size = len(word2idx)

        indices = [word2idx[word] for word in tokens if word in word2idx]
        # TODO add tqdm progress
        for i in range(context_size, len(indices) - context_size):
            context = (
                indices[i - context_size:i] + indices[i + 1:i + context_size + 1]
            )
            target = indices[i]
            self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context, target = self.data[idx]
        return torch.tensor(context, dtype=torch.long), torch.tensor(target, dtype=torch.long)

In [8]:
def train_model(model, dataloader, num_epochs, lr, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        total_loss = 0
        for i, (context, target) in enumerate(dataloader):
            context, target = context.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(context)
            loss = criterion(output, target)
            total_loss += loss.item()
            if i % 1000 == 0:
                print(f"Epoch {epoch+1}, Step {i}, Loss: {loss.item():.4f}")
            loss.backward()
            optimizer.step()

In [10]:
# Load and process text
with open(corpus_file_path, "r") as f:
    tokens = tokenize(f.read())

# Build vocab
word2idx, idx2word = build_vocab(tokens, min_freq=1)

# Create dataset and dataloader
context_size = 2
dataset = CBOWDataset(tokens, word2idx, context_size=context_size)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [11]:
len(dataset) / 32

531412.59375

In [12]:
len(dataloader)

531413

In [None]:
# Create model and train
vocab_size = len(word2idx)
embedding_dim = 100
model = CBOW(vocab_size, embedding_dim)
# TODO Print model

train_model(model, dataloader, num_epochs=1, lr=0.001, device=device)

Epoch 1, Step 0, Loss: 12.4242
Epoch 1, Step 1000, Loss: 9.1692
Epoch 1, Step 2000, Loss: 7.7933
Epoch 1, Step 3000, Loss: 8.1644
Epoch 1, Step 4000, Loss: 7.3388
Epoch 1, Step 5000, Loss: 7.1061
Epoch 1, Step 6000, Loss: 8.2516
Epoch 1, Step 7000, Loss: 7.8015
Epoch 1, Step 8000, Loss: 6.8521
Epoch 1, Step 9000, Loss: 8.4640
Epoch 1, Step 10000, Loss: 6.0802
Epoch 1, Step 11000, Loss: 7.8232
Epoch 1, Step 12000, Loss: 8.2532
Epoch 1, Step 13000, Loss: 7.3536
Epoch 1, Step 14000, Loss: 7.0593
Epoch 1, Step 15000, Loss: 6.9970
Epoch 1, Step 16000, Loss: 7.0552
Epoch 1, Step 17000, Loss: 6.8838
Epoch 1, Step 18000, Loss: 7.7295
Epoch 1, Step 19000, Loss: 8.1813
Epoch 1, Step 20000, Loss: 6.9780
Epoch 1, Step 21000, Loss: 8.9604
Epoch 1, Step 22000, Loss: 8.8395
Epoch 1, Step 23000, Loss: 6.5744
Epoch 1, Step 24000, Loss: 7.2382
Epoch 1, Step 25000, Loss: 6.9960
Epoch 1, Step 26000, Loss: 7.3981
Epoch 1, Step 27000, Loss: 6.7376
Epoch 1, Step 28000, Loss: 7.7332
Epoch 1, Step 29000, Loss: