# Word2Vec CBOW Implementation using PyTorch

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np

from collections import Counter
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import strip_accents_ascii

import re
from nltk.corpus import stopwords
from tqdm import tqdm
import pickle

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


# Hyperparameters

In [2]:
# vector size of each token
EMBED_DIMENSION = 128
# number of tokens from left and right
CONTEXT_SIZE = 2
BATCH_SIZE = 2048
LEARNING_RATE = 0.01
NUM_EPOCHS = 50

## Turkish sentences for word2vec trainingp

In [31]:
# Download from: https://www.kaggle.com/datasets/ahmetax/hury-dataset

### Read datasets

In [3]:
with open('data/turkish_sentences_for_word2vec_training/dunya-nz.txt', encoding='utf-8') as f:
    wiki_train_raw = f.read()

### Preprocessing

In [4]:
stop_words = set(stopwords.words('english'))

In [5]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove accents
    text = strip_accents_ascii(text)

    # Remove non-alphanumeric characters
    text = re.sub(r'[^\w\s]', '', text)

    # Remove stopwords
    tokens = text.split()
    tokens = [token for token in tokens if token not in stop_words]

    # Join the tokens back into a string
    text = ' '.join(tokens)

    return text

In [6]:
wikitext_preprocessed_train = preprocess_text(wiki_train_raw)
wikitext_tokens_train = word_tokenize(wikitext_preprocessed_train)

### Build Vocabulary
* **NOTE:** It is best to build vocab on the entire dataset (train/test combined)

In [7]:
word_counts = Counter(wikitext_tokens_train)

vocab = {word: idx for idx, (word, _) in enumerate(word_counts.most_common(n=None))}

print(f'Vocab size: {len(vocab):,}')

Vocab size: 226,473


### Convert word tokens to indices

In [8]:
indexed_tokens_train = [vocab[word] for word in wikitext_tokens_train]

# Custom Dataset

In [9]:
class CBOWDataset(Dataset):
    def __init__(self, tokens, context_size):
        self.tokens = tokens
        self.context_size = context_size

    def __len__(self):
        # (leave space for starting and ending context windows)
        return len(self.tokens) - 2*self.context_size

    def __getitem__(self, idx):
        context = self.tokens[idx:idx + self.context_size] + self.tokens[idx+self.context_size+1 : idx+2*self.context_size+1]
        target = self.tokens[idx + self.context_size]
        return torch.tensor(context), torch.tensor(target)

In [10]:
train_dataset = CBOWDataset(indexed_tokens_train, CONTEXT_SIZE)

# Dataloader

In [11]:
train_dataloader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    num_workers=8, # number of cpu cores
    pin_memory=True, # faster GPU memory allocation
)

# Print Sample Context and Target

In [34]:
for context, target in train_dataloader:
    # Extract the first sample from the batch
    # idx (integer)
    context_sample = context[0].numpy()
    target_sample = target[0].item()

    # Convert vocab IDs to string tokens
    context_tokens_str = [word for word, idx in vocab.items() if idx in context_sample]
    target_token_str = [word for word, idx in vocab.items() if idx == target_sample][0]

    print('Vocab IDs - Context:', context_sample)
    print('Vocab IDs - Target:', target_sample)
    print('String Tokens - Context:', context_tokens_str)
    print('String Token - Target:', target_token_str)
    break

Vocab IDs - Context: [3546  142  545  529]
Vocab IDs - Target: 3
String Tokens - Context: ['fransa', 'sonunda', 'olayn', 'umarm']
String Token - Target: bu


# Embedding Table for PyTorch
* maps discrete token ids to continuous-valued vectors
* It is like a matrix where rows are token ids, columns are vectors
* It has "weight" as the learnble parameter (no bias)

In [13]:
# total 10 tokens
_vocab_size = 10
# dimension of vector for each token
_emb_dim = 4

emb = nn.Embedding(
    num_embeddings=_vocab_size,
    embedding_dim=_emb_dim,
)

print(emb.weight.shape)

torch.Size([10, 4])


In [14]:
emb.weight

Parameter containing:
tensor([[-0.9288, -1.0496,  1.0969,  0.3053],
        [-0.3824,  0.9868,  0.1592, -2.7793],
        [-0.0398, -0.0793,  0.6736,  0.0517],
        [-0.1260, -0.9008, -0.0438,  0.4446],
        [-1.2505,  0.3022,  0.6373,  1.2476],
        [ 0.0953,  1.3099, -1.1293, -0.7064],
        [-1.7267, -0.8775,  0.5312,  0.0625],
        [-0.5806, -1.4279, -0.4218, -0.2073],
        [ 1.3390,  0.6697,  0.6564, -1.1044],
        [ 1.1431, -0.5799,  1.4895,  0.2775]], requires_grad=True)

# CBOW Model
*  https://arxiv.org/abs/1301.3781

In [15]:
class CBOW_Model(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=EMBED_DIMENSION,
        )
        
        self.linear = nn.Linear(
            in_features=EMBED_DIMENSION,
            out_features=vocab_size,
        )

    def forward(self, inputs_):
        x = self.embeddings(inputs_).mean(dim=1)
        x = self.linear(x)
        return x

### Initialize the model

In [16]:
model = CBOW_Model(vocab_size=len(vocab))

# Optimizer & Loss

In [17]:
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

2024-01-09 12:42:11.537366: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-09 12:42:11.539088: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-09 12:42:11.562153: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-09 12:42:11.562176: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-09 12:42:11.562190: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

# Training the model

### Start training

### Training iteration function that loops over all the batches

In [18]:
def train_iter(model, dataloader, criterion, optimizer, device):
    model.train()

    loss_history = []

    for context_tokens_batch, target_tokens_batch in tqdm(dataloader, total=len(dataloader)):
        
        context_tokens_batch = context_tokens_batch.to(device)
        target_tokens_batch = target_tokens_batch.to(device)
            
        # Forward pass
        preds_tokens = model(context_tokens_batch)
        # Compute error
        loss = criterion(preds_tokens, target_tokens_batch)
    
        # Clear previously computed gradients
        optimizer.zero_grad()
        # Compute gradients
        loss.backward()
        # Update parameters (weights and biases)
        optimizer.step()

        loss_history.append(loss.item())

    avg_loss = sum(loss_history)/len(loss_history)
    return avg_loss

In [19]:
def start_training(model, train_dataloader, optimizer, criterion, num_epochs, device):
    model = model.to(device)
    
    # Loop over all epochs
    for epoch in range(1, num_epochs+1):
        avg_train_loss = train_iter(model, train_dataloader, criterion, optimizer, device)

        print(f'Epoch: [{epoch+1}/{num_epochs}], Avg loss: {avg_train_loss:.4f}')

# Load Embeddings (Optional)

In [20]:
load_filename = f'turkish_emb_{EMBED_DIMENSION}_vocab_{len(vocab)}'

with open(f'{load_filename}.npy', 'rb') as f:
    trained_embeddings = np.load(f)

with open(f'vocab_emb_{EMBED_DIMENSION}_vocab_{len(vocab)}.pkl', 'rb') as f:
    vocab = pickle.load(f)

### Start Training
* This cell is commented
* Change cell type to "code" to run this

### Get Trained Embeddings
* This cell is commented
* Change cell type to "code" to run this

# Save Embeddings (Optional)
* This cell is commented
* Change cell type to "code" to run this

# Similarity Search

### Cosine Similarity
* Cosine similarity is a measure of similarity between two vectors in an inner product space. 
* In the context of natural language processing and text analysis, cosine similarity is commonly used to quantify the similarity between two documents or text passages
* Cosine similarity is between -1.0 and 1.0.

$ \text{Cosine Similarity}(\mathbf{A}, \mathbf{B}) = \frac{\mathbf{A} \cdot \mathbf{B}}{\|\mathbf{A}\| \cdot \|\mathbf{B}\|} $
* $\mathbf{A} \cdot \mathbf{B}$ represents dot product between vectors
* $\|\mathbf{A}\|$ and $\|\mathbf{B}\|$ Euclidean norms


In [21]:
def custom_cosine_similarity(embedding1, embedding2):
    dot_product = np.dot(embedding1, embedding2)
    norm_embedding1 = np.linalg.norm(embedding1)
    norm_embedding2 = np.linalg.norm(embedding2)
    similarity = dot_product / (norm_embedding1 * norm_embedding2)
    return similarity

### Search similar words

In [22]:
def word_similarity_search(query_word, embeddings, vocab, top_n=5):
    # Check if the query word is in the vocabulary
    if query_word not in vocab:
        return f"{query_word} is not in the vocabulary."

    # Get the index of the query word in the vocabulary
    query_idx = vocab[query_word]

    # Extract the embedding of the query word
    query_embedding = embeddings[query_idx]

    # Calculate cosine similarity between the query embedding and all other embeddings
    similarities = [custom_cosine_similarity(query_embedding, embedding) for embedding in embeddings]

    # Get the indices of the top N similar words
    # [::-1] -> reverse the order
    # [:top_n] -> fetch top n
    top_indices = np.argsort(similarities)[::-1][:top_n]

    # Extract the top N similar words and their cosine similarities
    similar_words = [word for word, idx in vocab.items() if idx in top_indices and word != query_word]
    cosine_similarities = np.array([similarities[idx] for idx in top_indices if idx != query_idx])

    # Display the results
    print(f"Similar words to '{query_word}':")
    for word, similarity in zip(similar_words, cosine_similarities):
        print(f"{word}: {similarity:.4f}")

### Example Searches

In [23]:
word_similarity_search("araba", trained_embeddings, vocab)

Similar words to 'araba':
arac: 0.5006
otomobil: 0.4463
silahn: 0.4312
polimer: 0.3980


In [24]:
word_similarity_search("gencler", trained_embeddings, vocab)

Similar words to 'gencler':
ulkeler: 0.5031
genclerin: 0.4317
suriyeliler: 0.4253
yps: 0.4098


In [25]:
word_similarity_search("televizyon", trained_embeddings, vocab)

Similar words to 'televizyon':
haber: 0.4532
tv: 0.4438
televizuyon: 0.4326
chew: 0.4287


In [26]:
word_similarity_search("yol", trained_embeddings, vocab)

Similar words to 'yol':
yolunu: 0.5377
onunu: 0.5261
kaplarn: 0.4731
kucak: 0.4593


In [27]:
word_similarity_search("ucak", trained_embeddings, vocab)

Similar words to 'ucak':
ucagn: 0.5492
gemi: 0.4722
tren: 0.4618
vinc: 0.4379


In [28]:
word_similarity_search("meteoroloji", trained_embeddings, vocab)

Similar words to 'meteoroloji':
uzay: 0.4441
itfaiye: 0.4354
televizyonlarnda: 0.4044
astrofizik: 0.4018


In [37]:
word_similarity_search("futbol", trained_embeddings, vocab)

Similar words to 'futbol':
basketbol: 0.5539
golf: 0.5095
satranc: 0.4773
voleybol: 0.4566
