In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch.utils.data import TensorDataset, random_split, DataLoader
import torch.nn as nn
from torch import optim

from datasets import load_dataset
from tokenizers import Tokenizer

## 🟣 From Scratch

### 🟡 Step 1: Load and Preprocess the Dataset

In [None]:
# Fetch TinyStories from the Hugging Face Hub; the returned DatasetDict has a
# "train" and a "validation" split, each with a single "text" column.
DATASET_NAME = "roneneldan/TinyStories"
dataset = load_dataset(DATASET_NAME)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

(…)-00000-of-00004-2d5a1467fff1081b.parquet:   0%|          | 0.00/249M [00:00<?, ?B/s]

(…)-00001-of-00004-5852b56a2bd28fd9.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00002-of-00004-a26307300439e943.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

(…)-00003-of-00004-d243063613e5a057.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00000-of-00001-869c898b519ad725.parquet:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [None]:
# Only the (much smaller) validation split is used as the training corpus here.
validation_split = dataset["validation"]
text_samples = validation_split["text"]
text_samples[:10]

['Spot. Spot saw the shiny car and said, "Wow, Kitty, your car is so bright and clean!" Kitty smiled and replied, "Thank you, Spot. I polish it every day."\n\nAfter playing with the car, Kitty and Spot felt thirsty. They found a small pond with clear water. They drank the water and felt very happy. They played together all day and became best friends.',
 'Once upon a time, in a big forest, there lived a rhinoceros named Roxy. Roxy loved to climb. She climbed trees, rocks, and hills. One day, Roxy found an icy hill. She had never seen anything like it before. It was shiny and cold, and she wanted to climb it.\n\nRoxy tried to climb the icy hill, but it was very slippery. She tried again and again, but she kept falling down. Roxy was sad. She wanted to climb the icy hill so much. Then, she saw a little bird named Billy. Billy saw that Roxy was sad and asked, "Why are you sad, Roxy?"\n\nRoxy told Billy about the icy hill and how she couldn\'t climb it. Billy said, "I have an idea! Let\'s 

In [None]:
# Container type and number of stories, one per line.
print(type(text_samples), len(text_samples), sep="\n")

<class 'list'>
21990


### 🟡 Step 2: Tokenization

In [None]:
# Load the pretrained tokenizer published with EleutherAI/gpt-neo-125M from the
# Hugging Face Hub. It is reused below to turn stories into token ids and to
# size the model's vocabulary.
tokenizer = Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [None]:
# Encode all stories with a single batched call instead of one Python-level
# `encode` call per story; the resulting token ids are the same, one list of
# ids per story, in the original order.
tokenized_data = [encoding.ids for encoding in tokenizer.encode_batch(list(text_samples))]
print(len(tokenized_data))

21990


In [None]:
# Peek at the token ids of the first story.
tokenized_data[:1]

[[32565,
  13,
  15899,
  2497,
  262,
  22441,
  1097,
  290,
  531,
  11,
  366,
  22017,
  11,
  21168,
  11,
  534,
  1097,
  318,
  523,
  6016,
  290,
  3424,
  2474,
  21168,
  13541,
  290,
  8712,
  11,
  366,
  10449,
  345,
  11,
  15899,
  13,
  314,
  25245,
  340,
  790,
  1110,
  526,
  198,
  198,
  3260,
  2712,
  351,
  262,
  1097,
  11,
  21168,
  290,
  15899,
  2936,
  47124,
  13,
  1119,
  1043,
  257,
  1402,
  16723,
  351,
  1598,
  1660,
  13,
  1119,
  24070,
  262,
  1660,
  290,
  2936,
  845,
  3772,
  13,
  1119,
  2826,
  1978,
  477,
  1110,
  290,
  2627,
  1266,
  2460,
  13]]

### 🟡 Step 3: Prepare Training Data (Skip-Gram)

In [None]:
# Parameters
window_size = 2  # Number of tokens to the left/right to consider as context


def build_skip_gram_pairs(token_ids_per_text, window_size):
    """Build (target, context) skip-gram pairs for every tokenized text.

    For each position ``i`` in a text, every position ``j`` with
    ``0 < |i - j| <= window_size`` inside the same text contributes the pair
    ``(tokens[i], tokens[j])``. Pairs are emitted text by text, target by
    target, with contexts ordered left to right.

    The pairs are assembled with tensor operations per text rather than by
    appending Python tuples one at a time, which avoids holding millions of
    small Python objects in memory before the tensor conversion.

    Args:
        token_ids_per_text: iterable of sequences of integer token ids.
        window_size: number of neighbours on each side used as context.

    Returns:
        A ``torch.long`` tensor of shape ``(num_pairs, 2)``; column 0 holds the
        target ids and column 1 the context ids. The shape is ``(0, 2)`` when
        no pair can be formed.
    """
    # Relative context positions, e.g. [-2, -1, 1, 2] for window_size == 2.
    offsets = torch.tensor(
        [delta for delta in range(-window_size, window_size + 1) if delta != 0],
        dtype=torch.long,
    )

    chunks = []
    for token_ids in token_ids_per_text:
        tokens = torch.as_tensor(token_ids, dtype=torch.long)
        n = tokens.numel()
        if n < 2:
            continue  # a text with fewer than two tokens has no context

        # Row i lists the candidate context positions of target i.
        context_pos = torch.arange(n).unsqueeze(1) + offsets
        in_bounds = (context_pos >= 0) & (context_pos < n)

        # Boolean masking flattens row by row, so pairs stay grouped by target.
        targets = tokens.unsqueeze(1).expand_as(context_pos)[in_bounds]
        # Clamp only to keep the gather in range; clamped entries are masked out.
        contexts = tokens[context_pos.clamp(0, n - 1)][in_bounds]
        chunks.append(torch.stack((targets, contexts), dim=1))

    if not chunks:
        return torch.empty((0, 2), dtype=torch.long)
    return torch.cat(chunks)


# Generate Skip-Gram pairs as a single (num_pairs, 2) tensor
skip_gram_pairs = build_skip_gram_pairs(tokenized_data, window_size)
print(skip_gram_pairs.shape)

torch.Size([18843772, 2])


In [None]:
# Display the pair tensor: column 0 is the target id, column 1 the context id.
skip_gram_pairs

tensor([[32565,    13],
        [32565, 15899],
        [   13, 32565],
        ...,
        [35548,     0],
        [    0,  1266],
        [    0, 35548]])

In [None]:
# tokens = ['salam', 'halet', 'khube', '?']

In [None]:
# First two (target, context) pairs.
skip_gram_pairs[:2]

tensor([[32565,    13],
        [32565, 15899]])

In [None]:
# Map a few token ids back to their vocabulary strings.
tuple(tokenizer.id_to_token(token_id) for token_id in (3198, 1110, 11))

('One', 'Ġday', ',')

In [None]:
# Fixed seed so the 80/20 split is identical on every Restart & Run All.
SPLIT_SEED = 42

# Column 0 of each pair is the model input (target), column 1 the label (context).
pair_dataset = TensorDataset(skip_gram_pairs[:, 0], skip_gram_pairs[:, 1])
train_set, valid_set = random_split(
    pair_dataset,
    [0.8, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

len(train_set), len(valid_set)

(15075018, 3768754)

In [None]:
# Same batch size for both loaders; only the training loader is shuffled.
BATCH_SIZE = 128

train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_set, batch_size=BATCH_SIZE, shuffle=False)

# Number of batches per epoch for each loader.
len(train_loader), len(valid_loader)

(117774, 29444)

In [None]:
# Pull a single batch to inspect what the training loader yields.
next(iter(train_loader))

### 🟡 Step 4: Define Word2Vec Model

In [None]:
class SkipGram(nn.Module):
    """Skip-gram network: embedding lookup followed by a linear layer.

    The linear layer maps each embedding to one score per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the projection back to the vocabulary.

        Args:
            vocab_size: number of rows in the embedding table and number of
                output scores.
            embedding_dim: width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, target_word):
        """Return vocabulary scores for the given target token ids."""
        return self.linear(self.embeddings(target_word))

### 🟡 Step 5: Config

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Device:", device)


Device: cuda


In [None]:
# Hyperparameters
vocab_size = tokenizer.get_vocab_size()  # one embedding row / output score per tokenizer entry
embedding_dim = 100  # Size of the word embeddings
learning_rate = 0.01
num_epochs = 10

# NOTE(review): in the run recorded in this notebook the loss flattens out near
# 6.87 from epoch 3 onward; the learning rate / weight decay below may deserve
# a second look. Confirm before changing.

# Model, loss, and optimizer
model = SkipGram(vocab_size, embedding_dim)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate, weight_decay=0.001)

# Total number of model parameters, in millions.
sum(param.numel() for param in model.parameters()) / 1e6

10.101657

In [None]:
# Show the module structure (layer types and sizes).
model

SkipGram(
  (embeddings): Embedding(50257, 100)
  (linear): Linear(in_features=100, out_features=50257, bias=True)
)

### 🟡 Step 6: Train and Evaluate Model

In [None]:
def run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one full pass over ``loader`` and return the mean per-sample loss.

    With an ``optimizer`` the model is put in training mode and its weights
    are updated after every batch. Without one the model is put in evaluation
    mode and gradient tracking is disabled.

    Args:
        model: module called as ``model(target_words)``.
        loader: iterable yielding ``(target_words, context_words)`` batches.
        criterion: loss called as ``criterion(outputs, context_words)``; it is
            expected to return the batch mean (the ``nn.CrossEntropyLoss``
            default) so it can be re-weighted by batch size.
        device: device the batches are moved to before the forward pass.
        optimizer: optimizer to step, or ``None`` for evaluation only.

    Returns:
        The loss averaged over all samples seen, or ``nan`` if ``loader``
        yielded no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    sample_count = 0
    with torch.set_grad_enabled(is_training):
        for target_words, context_words in loader:
            target_words = target_words.to(device)
            context_words = context_words.to(device)

            outputs = model(target_words)
            loss = criterion(outputs, context_words)

            if is_training:
                optimizer.zero_grad()  # clear gradients left over from the previous batch
                loss.backward()        # backpropagate to compute gradients
                optimizer.step()       # update the model weights

            # Weight each batch mean by its size so a smaller final batch does
            # not count as much as a full one in the epoch average.
            batch_size = target_words.size(0)
            loss_sum += loss.item() * batch_size
            sample_count += batch_size

    if sample_count == 0:
        return float("nan")
    return loss_sum / sample_count


for epoch in range(num_epochs):
    # Training pass (updates weights), then validation pass (no gradient tracking).
    avg_train_loss = run_epoch(model, train_loader, criterion, device, optimizer)
    avg_valid_loss = run_epoch(model, valid_loader, criterion, device)

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_valid_loss:.4f}")


Epoch 1/10 - Train Loss: 7.5558, Validation Loss: 6.9838
Epoch 2/10 - Train Loss: 6.9122, Validation Loss: 6.8789
Epoch 3/10 - Train Loss: 6.8708, Validation Loss: 6.8688
Epoch 4/10 - Train Loss: 6.8679, Validation Loss: 6.8701
Epoch 5/10 - Train Loss: 6.8697, Validation Loss: 6.8717


### 🟡 Step 7: Extract Word Embeddings

In [None]:
# Shape of a single embedding row (here: the row for token id 11).
model.embeddings.weight.data[11].shape

torch.Size([100])

In [None]:
# Get the embeddings for all tokens (a gradient-free view of the trained weights)
embeddings = model.embeddings.weight.detach()

# Token string -> token id mapping. This cell previously read `vocab` without
# ever defining it, so it failed with a NameError on a fresh kernel.
vocab = tokenizer.get_vocab()

# Example: Get the embedding for a specific token.
# The lookup is by exact vocabulary string; as the 'Ġday' entry shown earlier
# illustrates, some entries carry a 'Ġ' prefix, so "king" and "Ġking" would be
# distinct keys.
token = "king"
if token in vocab:
    token_id = vocab[token]
    token_embedding = embeddings[token_id]
    print(f"Embedding for '{token}': {token_embedding}")
else:
    print(f"'{token}' not in vocabulary")

### 🟡 Step 8: Save the Model and Tokenizer

In [None]:
# Save the model weights
torch.save(model.state_dict(), "skipgram_word2vec.pt")

# Save the embeddings
torch.save(embeddings, "word_embeddings.pt")

# Save the tokenizer as well: the embedding rows are indexed by its token ids,
# so the saved weights cannot be interpreted without it.
tokenizer.save("tokenizer.json")