In [1]:
import os
import re
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
text_dir = 'Sans_dataset'

In [2]:
all_text = ""
for file_name in os.listdir(text_dir):
    if file_name.endswith(".txt"):
        with open(os.path.join(text_dir, file_name), 'r', encoding='utf-8') as f:
            all_text += f.read() + "\n"

In [3]:
all_text = re.sub(r'\s+', ' ', all_text).strip()

In [4]:
len(all_text)

8507929

In [5]:
tokens = all_text.split()
len(tokens)

1028452

In [6]:
word_counts = Counter(tokens)

In [7]:
min_freq = 7  
vocab = [word for word, freq in word_counts.items() if freq >= min_freq]

In [8]:
len(vocab)

11272

In [9]:
word2index = {word: i for i, word in enumerate(vocab)}
index2word = {i: word for word, i in word2index.items()}

In [10]:
filtered_tokens = [w for w in tokens if w in word2index]
data = []
window_size = 2

In [11]:
for i,central_words in enumerate(filtered_tokens):
  central_word_index = word2index[central_words]
  for j in range(-window_size, window_size + 1):
    if j!=0 and i+j >= 0 and i+j < len(filtered_tokens):
      context_word = filtered_tokens[i+j]
      context_word_index = word2index[context_word]
      data.append((central_word_index, context_word_index))

In [12]:
class Word2VecScratch(nn.Module):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2VecScratch, self).__init__()
    self.hidden = nn.Linear(vocab_size, embedding_dim, bias = False)
    self.output = nn.Linear(embedding_dim, vocab_size, bias = False)
  def forward(self, x):
    x = self.hidden(x)
    x = self.output(x)
    return x

In [13]:
vocab_size= len(vocab)

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

In [15]:
central_word_tensor = torch.tensor([pair[0] for pair in data])
context_word_tensor = torch.tensor([pair[1] for pair in data])
dataset = TensorDataset(central_word_tensor, context_word_tensor)
train_loader = loader = DataLoader(dataset, batch_size = 2048, shuffle = True, num_workers=100, pin_memory=True  )

In [16]:
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import Adam

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Word2VecScratch(vocab_size, 500).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)  

def get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps, num_cycles=0.5, last_epoch=-1):
    def lr_lambda(current_step):
        if current_step < warmup_steps:
            return float(current_step) / float(max(1, warmup_steps))
        # Cosine decay
        progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps))
        return max(0.0, 0.5 * (1.0 + torch.cos(torch.tensor(progress * 3.1415926535 * num_cycles))))
    return LambdaLR(optimizer, lr_lambda, last_epoch)

batch_size = 2048
total_steps = 300 * len(dataset) // batch_size
warmup_steps = int(total_steps * 0.1)  

lr_scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

In [None]:
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
import torch
import os

# Setup TensorBoard
writer = SummaryWriter(log_dir='runs/word2vec')

# For saving the best model
best_loss = float('inf')
best_model_path = 'best_model.pth'

epochs = 300

for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    print(f"\nEpoch {epoch+1}/{epochs}")

    for batch_idx, (central_word, context_word) in enumerate(train_loader):
        central_word_onehot = F.one_hot(central_word, num_classes=vocab_size).float().to(device)
        context_word_onehot = F.one_hot(context_word, num_classes=vocab_size).float().to(device)

        outputs = model(central_word_onehot)
        batch_loss = loss_fn(outputs, context_word_onehot)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()
        lr_scheduler.step()

        batch_loss_value = batch_loss.item()
        epoch_loss += batch_loss_value

        # Verbose per batch (similar to TensorFlow)
        print(f"  Batch {batch_idx+1}/{len(train_loader)} - Loss: {batch_loss_value:.4f}", end='\r')

    avg_loss = epoch_loss / len(train_loader)
    print(f"\n→ Epoch {epoch+1} finished - Avg Loss: {avg_loss:.4f}")

    # Log to TensorBoard
    writer.add_scalar('Loss/train', avg_loss, epoch + 1)

    # Save best model based on loss
    if avg_loss < best_loss:
        best_loss = avg_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"✅ Best model saved at epoch {epoch+1} with loss {best_loss:.4f}")

writer.close()



Epoch 1/300
  Batch 1142/1142 - Loss: 9.1320
→ Epoch 1 finished - Avg Loss: 9.3020
✅ Best model saved at epoch 1 with loss 9.3020

Epoch 2/300
  Batch 1142/1142 - Loss: 7.0535
→ Epoch 2 finished - Avg Loss: 8.2640
✅ Best model saved at epoch 2 with loss 8.2640

Epoch 3/300
  Batch 1142/1142 - Loss: 6.7854
→ Epoch 3 finished - Avg Loss: 6.9233
✅ Best model saved at epoch 3 with loss 6.9233

Epoch 4/300
  Batch 1142/1142 - Loss: 6.4642
→ Epoch 4 finished - Avg Loss: 6.5011
✅ Best model saved at epoch 4 with loss 6.5011

Epoch 5/300
  Batch 549/1142 - Loss: 6.3433

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  Batch 1142/1142 - Loss: 5.9520
→ Epoch 24 finished - Avg Loss: 5.3180
✅ Best model saved at epoch 24 with loss 5.3180

Epoch 25/300
  Batch 1142/1142 - Loss: 5.0424
→ Epoch 25 finished - Avg Loss: 5.2697
✅ Best model saved at epoch 25 with loss 5.2697

Epoch 26/300
  Batch 895/1142 - Loss: 5.2131

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  Batch 1142/1142 - Loss: 4.5975
→ Epoch 44 finished - Avg Loss: 5.0562
✅ Best model saved at epoch 44 with loss 5.0562

Epoch 45/300
  Batch 1142/1142 - Loss: 5.1877
→ Epoch 45 finished - Avg Loss: 5.0557
✅ Best model saved at epoch 45 with loss 5.0557

Epoch 46/300
  Batch 441/1142 - Loss: 4.9036

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  Batch 1142/1142 - Loss: 4.9761
→ Epoch 47 finished - Avg Loss: 5.0538
✅ Best model saved at epoch 47 with loss 5.0538

Epoch 48/300
  Batch 159/1142 - Loss: 4.8132

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  Batch 1142/1142 - Loss: 4.4459
→ Epoch 121 finished - Avg Loss: 5.0332
✅ Best model saved at epoch 121 with loss 5.0332

Epoch 122/300
  Batch 1142/1142 - Loss: 5.0327
→ Epoch 122 finished - Avg Loss: 5.0333

Epoch 123/300
  Batch 1142/1142 - Loss: 5.2166
→ Epoch 123 finished - Avg Loss: 5.0334

Epoch 124/300
  Batch 1142/1142 - Loss: 5.5636
→ Epoch 124 finished - Avg Loss: 5.0334

Epoch 125/300
  Batch 613/1142 - Loss: 5.0264

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  Batch 1142/1142 - Loss: 5.7771
→ Epoch 125 finished - Avg Loss: 5.0334

Epoch 126/300
  Batch 1142/1142 - Loss: 4.9969
→ Epoch 126 finished - Avg Loss: 5.0325
✅ Best model saved at epoch 126 with loss 5.0325

Epoch 127/300
  Batch 1142/1142 - Loss: 5.3210
→ Epoch 127 finished - Avg Loss: 5.0324
✅ Best model saved at epoch 127 with loss 5.0324

Epoch 128/300
  Batch 1142/1142 - Loss: 5.2161
→ Epoch 128 finished - Avg Loss: 5.0322
✅ Best model saved at epoch 128 with loss 5.0322

Epoch 129/300
  Batch 1142/1142 - Loss: 4.3955
→ Epoch 129 finished - Avg Loss: 5.0312
✅ Best model saved at epoch 129 with loss 5.0312

Epoch 130/300
  Batch 1142/1142 - Loss: 5.3261
→ Epoch 130 finished - Avg Loss: 5.0318

Epoch 131/300
  Batch 1142/1142 - Loss: 4.9858
→ Epoch 131 finished - Avg Loss: 5.0314

Epoch 132/300
  Batch 1142/1142 - Loss: 5.0769
→ Epoch 132 finished - Avg Loss: 5.0311
✅ Best model saved at epoch 132 with loss 5.0311

Epoch 133/300
  Batch 1142/1142 - Loss: 4.9538
→ Epoch 133 finish

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



  Batch 1142/1142 - Loss: 5.7716
→ Epoch 230 finished - Avg Loss: 5.0020

Epoch 231/300
  Batch 663/1142 - Loss: 4.9750

In [17]:
central_word_tensor.shape

torch.Size([2336858])

In [18]:
state_dict = torch.load('best_model.pth')
model.load_state_dict(state_dict)


<All keys matched successfully>

In [19]:
weight_matrix = model.hidden.weight.data
weight_matrix.shape

torch.Size([500, 11272])

In [20]:
for name,_ in model.named_parameters():
    print(name)

hidden.weight
output.weight


In [21]:
weight_matrix2 = model.output.weight.data
weight_matrix2.shape

torch.Size([11272, 500])

In [22]:
idx = word2index['पापी']

In [23]:
weight_matrix2[idx].shape

torch.Size([500])

In [24]:
def get_embedding(word):
    idx = word2index[word]
    emb = weight_matrix2[idx] + weight_matrix.T[idx]
    return emb
    

In [25]:
def cosine_similarity_tensors(tensor1, tensor2):
    tensor1 = tensor1.float()
    tensor2 = tensor2.float()

    if tensor1.ndim == 1:
        tensor1_normalized = F.normalize(tensor1, p=2, dim=0)
        tensor2_normalized = F.normalize(tensor2, p=2, dim=0)
    elif tensor1.ndim == 2:
        tensor1_normalized = F.normalize(tensor1, p=2, dim=1)
        tensor2_normalized = F.normalize(tensor2, p=2, dim=1)
    else:
        raise ValueError("Input tensors should be 1D or 2D.")
    similarity = torch.sum(tensor1_normalized * tensor2_normalized, dim=-1)

    return similarity

In [26]:
def sim_pairs(wrd1,wrd2):
    return cosine_similarity_tensors(get_embedding(wrd1), get_embedding(wrd2))

In [27]:
sim_pairs("पापी","पापं")

tensor(0.2897, device='cuda:0')

In [28]:
sim_pairs("पद्मनाभ", "दामोदर")

tensor(0.3712, device='cuda:0')

In [29]:
sim_pairs("वासुदेवाय", "नारायणाय")

tensor(0.4786, device='cuda:0')

In [31]:
sim_pairs("राम","पापं")

tensor(-0.0122, device='cuda:0')