### Load Dataset

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset file stored on Drive can be read by the loading cell below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import joblib
# NOTE(security): joblib.load is pickle-based and can execute arbitrary code
# while loading, so only open archives that come from a trusted source.
# NOTE(review): the path is hard-coded to this particular Drive layout; adjust
# it when running the notebook elsewhere.
data_batches = joblib.load("/content/drive/MyDrive/data_batches.joblib")
# Preview the first two entries (token lists) of the first batch.
data_batches[0][:2]

[['বোরকা',
  'নিষিদ্ধ',
  'পথ',
  'ধাপ',
  'এগি',
  'শ্রীলঙ্কা',
  'দেশ',
  'মন্ত্রিসভা',
  'সংক্রান্ত',
  'প্রস্তাব',
  'অনুমোদন',
  'দেশ',
  'জননিরাপত্তাবিষয়ক',
  'মন্ত্রী',
  'শরৎ',
  'বীরসেকেরা',
  'তথ্য',
  'জানিয়েছেন',
  'কাতারভিত্তিক',
  'সংবাদমাধ্যম',
  'জাজি',
  'বুধবার',
  'প্রতিবেদন',
  'জানা',
  'শ্রীলঙ্কা',
  'মন্ত্রিসভা',
  'সাপ্তাহিক',
  'বৈঠক',
  'অনুষ্ঠিত',
  'হ',
  'গতকাল',
  'বৈঠ',
  'বোরকা',
  'নিষিদ্ধ',
  'প্রস্তাব',
  'সম্মত',
  'হ',
  'জননিরাপত্তাবিষয়ক',
  'মন্ত্রী',
  'শরৎ',
  'বীরসেকেরা',
  'তথ্য',
  'জানি',
  'ফেসবুক',
  'প',
  'পোস্ট',
  'প্রস্তাব',
  'অ্যাটর্নি',
  'জেনারেল',
  'পাঠানো',
  'এরপর',
  'পার্লামেন্ট',
  'পাস',
  'আইন',
  'পরিণত',
  'শ্রীলঙ্কা',
  'পার্লামেন্ট',
  'সরকারি',
  'দল',
  'সংখ্যাগরিষ্ঠতা',
  'র',
  'সহজ',
  'আইন',
  'হিস',
  'পাস'],
 ['মার্চ',
  'সংবাদ',
  'সম্মেলন',
  'শ্রীলঙ্কা',
  'জননিরাপত্তাবিষয়ক',
  'মন্ত্রী',
  'বোরকা',
  'নিষিদ্ধ',
  'উদ্যোগ',
  'নেওয়ার',
  'কথা',
  'জানিয়েছিলেন',
  'সম',
  'জানান',
  'ধর্মী',
  'উগ্রবাদ',
  '

In [3]:
# Number of batches in the loaded dataset; the training loop shows it as the
# total in its "Epoch i/N" progress-bar label.
num_batches = len(data_batches)

### Batch Processing

In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec
import numpy as np
from tqdm import tqdm

# Define the Word2Vec model using PyTorch
class Word2VecPytorch(nn.Module):
    """Embedding lookup followed by a linear projection onto the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and width of the
            output layer.
        embedding_dim: size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Attribute names are part of the state_dict keys, so they stay fixed.
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, input_words):
        """Return one score per vocabulary entry for every index in ``input_words``."""
        return self.linear(self.embeddings(input_words))

def save_checkpoint(model, epoch, optimizer, loss, model_dir="checkpoints"):
    """Write a training checkpoint to ``<model_dir>/word2vec_model_epoch<epoch>.pt``.

    Args:
        model: module whose ``state_dict()`` is stored.
        epoch: value recorded in the checkpoint and embedded in the file name.
        optimizer: optimizer whose ``state_dict()`` is stored.
        loss: loss value recorded alongside the weights.
        model_dir: directory that receives the checkpoint file; it is created
            when missing. Defaults to ``"checkpoints"``, the location that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The path of the checkpoint file that was written.
    """
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, f"word2vec_model_epoch{epoch}.pt")
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }
    torch.save(checkpoint, model_path)
    print(f"Checkpoint saved: {model_path}")
    return model_path

In [5]:
# Merge every batch into one flat list of sentences, preserving their order.
processed_sentences = []
for batch in tqdm(data_batches, desc="Flattening Batches", unit="batch"):
    processed_sentences.extend(batch)

Flattening Batches: 100%|██████████| 4/4 [00:00<00:00, 105.29batch/s]


In [6]:
from collections import Counter
# Gather every token in corpus order, then number the distinct tokens by the
# order in which they first appear.
words = []
for sentence in tqdm(processed_sentences, desc="Counting ", unit="sentence"):
    words.extend(sentence)
word_counter = Counter(words)
vocab = {word: idx for idx, word in enumerate(word_counter)}
vocab_size = len(vocab)
print(vocab_size)

Counting : 100%|██████████| 400000/400000 [00:02<00:00, 184219.05sentence/s]


316005


In [7]:
# Spot-check: display the integer index the vocabulary assigned to this token.
vocab["টাকা"]

224

In [8]:
# Width of each embedding vector.
embedding_dim = 400

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# raising, as an unconditional .cuda() call does on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize Word2Vec model with PyTorch and place it on the selected device.
# The optimizer is created afterwards so it tracks the parameters that live there.
model = Word2VecPytorch(vocab_size, embedding_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [11]:
# Spot-check: display the integer index the vocabulary assigned to this token.
vocab["জরুরিভাবে"]

13016

In [None]:
# Train with one SGD step per (context, target) pair, where the context of a
# word is every other word of the same sentence that differs from it.
# NOTE(review): each element of data_batches is visited exactly once, so the
# "epoch" counter below is really a batch index — confirm this is intended.
# NOTE(review): every step scores the full vocabulary and the number of pairs
# grows quadratically with sentence length; consider a bounded context window
# and mini-batching if this is too slow.

# Follow whatever device the model lives on instead of hard-coding .cuda().
train_device = next(model.parameters()).device

for epoch, data_batch in enumerate(data_batches):
    total_loss = 0.0
    for sentence in tqdm(data_batch, desc=f"Epoch {epoch + 1}/{num_batches}", unit="sentence"):
        # Look up all indices once and move the whole sentence to the device in
        # a single transfer, rather than building two tensors per pair.
        sentence_ids = torch.tensor(
            [vocab[word] for word in sentence], dtype=torch.long, device=train_device
        )

        # leave=False clears each finished inner bar so the cell output is not
        # flooded with one progress line per sentence.
        word_bar = tqdm(sentence, desc="Processing Words", unit="word", leave=False)
        for target_pos, target_word in enumerate(word_bar):
            # Loop-invariant for the inner loop: a length-1 view of the target index.
            target_idx = sentence_ids[target_pos:target_pos + 1]

            # Iterate over context words
            for context_pos, context_word in enumerate(sentence):
                if context_word == target_word:
                    continue
                context_idx = sentence_ids[context_pos:context_pos + 1]

                # Zero the gradients
                optimizer.zero_grad()

                # Forward pass: predict the target word from the context word
                output = model(context_idx)

                # Calculate loss
                loss = criterion(output, target_idx)

                # Backward pass
                loss.backward()

                # Update weights
                optimizer.step()

                # Accumulate loss
                total_loss += loss.item()

    # Print epoch loss (sum of the per-pair losses for this batch)
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

    # Save checkpoint
    # NOTE(review): the checkpoint is labelled with the 0-based index while the
    # messages above are 1-based — confirm which numbering is wanted.
    save_checkpoint(model, epoch, optimizer, total_loss)

Epoch 1/4:   0%|          | 0/100000 [00:00<?, ?sentence/s]
Processing Words:   0%|          | 0/65 [00:00<?, ?word/s][A
Processing Words:   2%|▏         | 1/65 [00:00<00:18,  3.39word/s][A
Processing Words:   3%|▎         | 2/65 [00:00<00:18,  3.39word/s][A
Processing Words:   5%|▍         | 3/65 [00:00<00:18,  3.37word/s][A
Processing Words:   6%|▌         | 4/65 [00:01<00:17,  3.39word/s][A
Processing Words:   8%|▊         | 5/65 [00:01<00:17,  3.41word/s][A
Processing Words:   9%|▉         | 6/65 [00:01<00:17,  3.45word/s][A
Processing Words:  11%|█         | 7/65 [00:02<00:16,  3.45word/s][A
Processing Words:  12%|█▏        | 8/65 [00:02<00:16,  3.46word/s][A
Processing Words:  14%|█▍        | 9/65 [00:02<00:16,  3.45word/s][A
Processing Words:  15%|█▌        | 10/65 [00:02<00:15,  3.48word/s][A
Processing Words:  17%|█▋        | 11/65 [00:03<00:15,  3.47word/s][A
Processing Words:  18%|█▊        | 12/65 [00:03<00:15,  3.47word/s][A
Processing Words:  20%|██        | 

In [None]:

# Save the final trained model
# Only the model weights (state_dict) are written; optimizer state is not included.
# NOTE(review): the relative path resolves against the current working
# directory — presumably the Colab VM's local disk rather than the mounted
# Drive; confirm the file ends up somewhere persistent.
final_model_path = "final_word2vec_model.pt"
torch.save(model.state_dict(), final_model_path)
print(f"Final model saved: {final_model_path}")
