# SVD

In [None]:
import numpy as np
import nltk
import torch
import re
import collections
from nltk.corpus import brown
from collections import Counter, defaultdict
from scipy.sparse import coo_matrix
from scipy.sparse.linalg import svds

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Make sure the NLTK resources requested by this notebook are available.
for resource in ("brown", "punkt"):
    nltk.download(resource)



def tokenize_corpus():
    """Return the Brown corpus sentences as lists of cleaned tokens.

    Each word is lowercased and every character outside ``a-z`` is removed.
    Words that end up empty are discarded, and a sentence is kept only if at
    least one word survives.
    """
    non_letters = re.compile(r'[^a-z]')
    cleaned_sentences = []
    for raw_sentence in brown.sents():
        tokens = []
        for raw_word in raw_sentence:
            token = non_letters.sub('', raw_word.lower())
            if token:
                tokens.append(token)
        if tokens:
            cleaned_sentences.append(tokens)
    return cleaned_sentences

def build_vocabulary(corpus, min_freq=2):
    """Map every sufficiently frequent word of *corpus* to an integer id.

    Args:
        corpus: iterable of sentences, each an iterable of word strings.
        min_freq: minimum number of occurrences a word needs to be kept.

    Returns:
        A dict with ``"<UNK>"`` at id 0 followed by the kept words, numbered
        from 1 in order of first appearance in the corpus.
    """
    frequencies = collections.Counter()
    for sentence in corpus:
        frequencies.update(sentence)

    vocab = {"<UNK>": 0}
    frequent_words = (word for word, freq in frequencies.items() if freq >= min_freq)
    for index, word in enumerate(frequent_words, start=1):
        vocab[word] = index
    return vocab

# Corpus and vocabulary for the count-based (SVD) embeddings.
tokenized_corpus = tokenize_corpus()
vocab = build_vocabulary(tokenized_corpus)
print(f"Vocabulary size: {len(vocab)}")


window_size = 2
embedding_dim = 200

# Matrix row/column ids follow the sorted order of the vocabulary keys.
word_to_id = {word: i for i, word in enumerate(sorted(vocab))}
id_to_word = {i: word for word, i in word_to_id.items()}
vocab_size = len(vocab)

# Count, for every word, how often each other word appears within
# `window_size` positions of it inside the same sentence.
co_occurrence = defaultdict(Counter)
for sentence in tokenized_corpus:
    # Replace out-of-vocabulary words with <UNK> instead of dropping them:
    # dropping would let a window span words that were not actually within
    # `window_size` of each other, and would leave the <UNK> row empty.
    sentence = [word if word in vocab else "<UNK>" for word in sentence]
    for i, target in enumerate(sentence):
        window_start = max(0, i - window_size)
        window_end = min(len(sentence), i + window_size + 1)
        for j in range(window_start, window_end):
            if i != j:
                co_occurrence[target][sentence[j]] += 1

# Flatten the nested counters into COO triplets (row id, column id, count).
rows, cols, data = [], [], []
for word, context_words in co_occurrence.items():
    for context_word, count in context_words.items():
        rows.append(word_to_id[word])
        cols.append(word_to_id[context_word])
        data.append(count)

co_matrix = coo_matrix((data, (rows, cols)), shape=(vocab_size, vocab_size), dtype=np.float64)
print("Sparse Co-occurrence Matrix is built.")

print("Truncated SVD ...")
U, S, Vt = svds(co_matrix, k=embedding_dim)
# Order the components by decreasing singular value explicitly rather than
# assuming a particular ordering of the values returned by svds.
order = np.argsort(S)[::-1]
U = U[:, order]
S = S[order]
Vt = Vt[order, :]

# Scale each left singular vector by the square root of its singular value.
embeddings = U * np.sqrt(S)

embedding_dict = {
    "word_to_id": word_to_id,
    "word_vectors": embeddings
}
torch.save(embedding_dict, "svd.pt")

print("Embeddings saved to svd.pt")

Using device: cuda


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Vocabulary size: 26753
Sparse Co-occurrence Matrix is built.
Truncated SVD ...
Embeddings saved to svd.pt


# CBOW

In [None]:
import torch
import numpy as np
import re
import collections
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from nltk.corpus import brown
from collections import Counter
from tqdm import tqdm

# nltk.download is called below but this cell's import list does not include
# nltk; import it here so the cell does not depend on an earlier cell.
import nltk

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Make sure the NLTK resources requested by this notebook are available.
nltk.download("brown")
nltk.download("punkt")


def tokenize_corpus():
    """Load the Brown corpus and normalise it to lowercase alphabetic tokens.

    Characters other than ``a-z`` are stripped from every lowercased word;
    empty results are skipped, and sentences with no remaining words are
    left out of the returned list.
    """
    strip_non_alpha = re.compile(r'[^a-z]').sub
    processed = []
    for sentence in brown.sents():
        stripped = (strip_non_alpha('', word.lower()) for word in sentence)
        kept = [word for word in stripped if word]
        if kept:
            processed.append(kept)
    return processed

def build_vocabulary(corpus, min_freq=2):
    """Build a word -> id mapping from a tokenized corpus.

    Args:
        corpus: iterable of sentences, each an iterable of word strings.
        min_freq: words seen fewer than this many times are left out.

    Returns:
        A dict whose first entry is ``"<UNK>": 0``; the remaining words get
        consecutive ids starting at 1, in order of first occurrence.
    """
    counts = collections.Counter()
    for sentence in corpus:
        for word in sentence:
            counts[word] += 1

    vocab = {"<UNK>": 0}
    next_id = 1
    for word in counts:
        if counts[word] < min_freq:
            continue
        vocab[word] = next_id
        next_id += 1
    return vocab


# Corpus and vocabulary for CBOW training.
tokenized_corpus = tokenize_corpus()
vocab = build_vocabulary(tokenized_corpus)
print(f"Vocabulary size: {len(vocab)}")


# Number the vocabulary keys 0..N-1 in their iteration order, and build the
# reverse lookup from id back to word.
word_to_id = dict(zip(vocab, range(len(vocab))))
id_to_word = dict(enumerate(word_to_id))
vocab_size = len(vocab)

print(f"Loaded vocabulary size: {vocab_size}")


# Model hyperparameters.
embedding_dim = 100
window_size = 2
num_negative = 10

# Optimisation hyperparameters.
num_epochs = 5
learning_rate = 0.005
batch_size = 128

class CBOWDataset(Dataset):
    """CBOW training examples: (list of context word ids, target word id).

    For every position in every sentence, the context is made of the ids of
    the words at most ``window_size`` positions away (the target itself is
    excluded). Words missing from ``word_to_id`` use the id of ``"<UNK>"``.
    Positions with an empty context produce no example.
    """

    def __init__(self, sentences, word_to_id, window_size):
        unk_id = word_to_id.get("<UNK>")
        self.examples = []
        for sentence in sentences:
            indices = [word_to_id.get(word, unk_id) for word in sentence]
            sentence_len = len(indices)
            for position, target in enumerate(indices):
                lo = max(0, position - window_size)
                hi = min(sentence_len, position + window_size + 1)
                context = [indices[j] for j in range(lo, hi) if j != position]
                if context:
                    self.examples.append((context, target))

    def __len__(self):
        """Number of (context, target) examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the ``(context_ids, target_id)`` pair stored at *idx*."""
        return self.examples[idx]

def collate_fn(batch):
    """Turn a list of ``(context_ids, target_id)`` pairs into batch tensors.

    Returns three tensors moved to the module-level ``device``:
    the contexts padded with -1 to a common length (batch first), the target
    ids, and the unpadded context lengths as floats.
    """
    context_tensors = []
    target_ids = []
    for context_ids, target_id in batch:
        context_tensors.append(torch.tensor(context_ids, dtype=torch.long))
        target_ids.append(target_id)

    targets = torch.tensor(target_ids, dtype=torch.long)
    lengths = torch.tensor([len(ctx) for ctx in context_tensors], dtype=torch.float)
    # -1 marks padding slots so they can be told apart from real word ids.
    contexts_padded = pad_sequence(context_tensors, batch_first=True, padding_value=-1)
    return contexts_padded.to(device), targets.to(device), lengths.to(device)

# Training data: shuffled mini-batches assembled by collate_fn.
dataset = CBOWDataset(tokenized_corpus, word_to_id, window_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)


# Two (vocab_size x embedding_dim) parameter matrices, both Xavier-uniform
# initialised: W is indexed with context word ids and C with target and
# negative-sample ids in the training loop.
W = torch.nn.init.xavier_uniform_(torch.empty(vocab_size, embedding_dim, device=device))
W.requires_grad_(True)

C = torch.nn.init.xavier_uniform_(torch.empty(vocab_size, embedding_dim, device=device))
C.requires_grad_(True)

# Adam over both matrices; StepLR multiplies the learning rate by 0.9 on
# every scheduler.step() call.
optimizer = torch.optim.Adam([W, C], lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

def sample_negative(batch_targets, num_negative, vocab_size):
    """Draw uniform negative word ids that never equal the row's target id.

    Args:
        batch_targets: 1-D tensor of B word ids to exclude, one per row.
        num_negative: number of negatives to draw for each row.
        vocab_size: size of the vocabulary; ids are drawn from
            ``[0, vocab_size - 1]``.

    Returns:
        A ``(B, num_negative)`` long tensor on the same device as
        ``batch_targets``.
    """
    B = batch_targets.size(0)
    # Sample on the targets' own device instead of relying on a global
    # `device`, so the comparison below never mixes devices.
    neg_samples = torch.randint(
        0, vocab_size - 1, (B, num_negative), device=batch_targets.device
    )
    # The draw covers [0, vocab_size - 2]; shifting every value >= target up
    # by one skips the target while keeping the remaining ids equally likely.
    mask = neg_samples >= batch_targets.unsqueeze(1)
    neg_samples += mask.long()
    return neg_samples

# CBOW with negative sampling: the averaged context embedding is pushed
# towards the target's row of C and away from the rows of sampled negatives.
for epoch in range(num_epochs):
    total_loss = 0.0
    sample_count = 0  # counts batches, used to average the per-batch loss
    epoch_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}", unit="batch")
    for contexts, targets, lengths in epoch_bar:
        # Padding slots hold -1. Clamp them to a valid row for the lookup;
        # the mask then zeroes their contribution to the sum.
        mask = (contexts != -1).unsqueeze(-1).float()
        context_embeds = W[contexts.clamp(min=0)]
        masked_context_embeds = context_embeds * mask
        sum_context = masked_context_embeds.sum(dim=1)
        avg_context = sum_context / lengths.unsqueeze(1)

        # logsigmoid replaces log(sigmoid(x) + 1e-10): the epsilon form
        # saturates and loses its gradient for strongly wrong scores.
        pos_scores = (avg_context * C[targets]).sum(dim=1)
        pos_loss = -torch.nn.functional.logsigmoid(pos_scores)

        neg_samples = sample_negative(targets, num_negative, vocab_size)
        neg_embeds = C[neg_samples]
        neg_scores = torch.bmm(neg_embeds, avg_context.unsqueeze(2)).squeeze(2)
        neg_loss = -torch.nn.functional.logsigmoid(-neg_scores).sum(dim=1)

        loss = (pos_loss + neg_loss).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        sample_count += 1
        epoch_bar.set_postfix(loss=f"{loss.item():.4f}")
    # Decay the learning rate once per epoch.
    scheduler.step()
    avg_epoch_loss = total_loss / sample_count if sample_count > 0 else 0
    print(f"Epoch {epoch+1}, Average Loss: {avg_epoch_loss:.4f}")

# The exported vector of each word is the sum of its rows in W and C.
word_vectors = (W + C).detach().cpu().numpy()

embedding_dict = {
    "word_to_id": word_to_id,
    "word_vectors": word_vectors
}
# Build the message from the path that is actually written, so the two cannot
# disagree (the message previously named cbow.pt).
output_path = "cbow_manual.pt"
torch.save(embedding_dict, output_path)
print(f"CBOW embeddings saved in the required format as {output_path}")

Using device: cuda


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Vocabulary size: 26753
Loaded vocabulary size: 26753


Epoch 1: 100%|██████████| 15693/15693 [01:42<00:00, 152.40batch/s, loss=2.5152]


Epoch 1, Average Loss: 2.0690


Epoch 2: 100%|██████████| 15693/15693 [01:41<00:00, 155.14batch/s, loss=1.8429]


Epoch 2, Average Loss: 1.5950


Epoch 3: 100%|██████████| 15693/15693 [01:42<00:00, 153.75batch/s, loss=1.7058]


Epoch 3, Average Loss: 1.2931


Epoch 4: 100%|██████████| 15693/15693 [01:41<00:00, 154.48batch/s, loss=0.9544]


Epoch 4, Average Loss: 1.0798


Epoch 5: 100%|██████████| 15693/15693 [01:40<00:00, 155.78batch/s, loss=1.2062]


Epoch 5, Average Loss: 0.9307
CBOW embeddings saved in the required format as cbow_manual.pt


# SkipGram

In [None]:
import torch
import numpy as np
import re
import collections
from nltk.corpus import brown
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from tqdm import tqdm

# nltk.download is called below but this cell's import list does not include
# nltk; import it here so the cell does not depend on an earlier cell.
import nltk

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Make sure the NLTK resources requested by this notebook are available.
nltk.download("brown")
nltk.download("punkt")

def tokenize_corpus():
    """Return Brown corpus sentences reduced to lowercase ``a-z`` tokens.

    Words are lowercased, all other characters are removed, words that become
    empty are skipped, and sentences without any remaining word are omitted.
    """
    letters_only = re.compile(r'[^a-z]')

    def clean(sentence):
        # Lowercase, strip non-letters, then drop tokens that became empty.
        stripped = (letters_only.sub('', word.lower()) for word in sentence)
        return [token for token in stripped if token]

    cleaned = (clean(sentence) for sentence in brown.sents())
    return [tokens for tokens in cleaned if tokens]

def build_vocabulary(corpus, min_freq=2):
    """Assign integer ids to the words occurring at least *min_freq* times.

    Args:
        corpus: iterable of sentences, each an iterable of word strings.
        min_freq: occurrence threshold below which a word is excluded.

    Returns:
        A dict starting with ``"<UNK>": 0``; retained words follow with ids
        1, 2, ... in the order they first appear in the corpus.
    """
    tally = collections.Counter()
    for sentence in corpus:
        tally.update(sentence)

    retained = [word for word, occurrences in tally.items() if occurrences >= min_freq]
    vocab = {"<UNK>": 0}
    vocab.update(zip(retained, range(1, len(retained) + 1)))
    return vocab

# Corpus and vocabulary for skip-gram training.
tokenized_corpus = tokenize_corpus()
vocab = build_vocabulary(tokenized_corpus)

# Number the vocabulary keys 0..N-1 in their iteration order, and build the
# reverse lookup from id back to word.
word_to_id = dict(zip(vocab, range(len(vocab))))
id_to_word = dict(enumerate(word_to_id))
vocab_size = len(vocab)
print(f"Vocabulary size: {len(vocab)}")


# Model hyperparameters.
embedding_dim = 100
window_size = 2
num_negative = 20

# Optimisation hyperparameters.
num_epochs = 10
learning_rate = 0.005
batch_size = 64


class SkipGramDataset(Dataset):
    """Skip-gram training examples: (target word id, context word id).

    Every word of every sentence is paired, one example per pair, with each
    other word at most ``window_size`` positions away. Words missing from
    ``word_to_id`` use the id stored under ``"<UNK>"``.
    """

    def __init__(self, sentences, word_to_id, window_size):
        self.examples = []
        for sentence in sentences:
            indices = [word_to_id.get(word, word_to_id["<UNK>"]) for word in sentence]
            sentence_len = len(indices)
            for center, target in enumerate(indices):
                lo = max(0, center - window_size)
                hi = min(sentence_len, center + window_size + 1)
                self.examples.extend(
                    (target, indices[j]) for j in range(lo, hi) if j != center
                )

    def __len__(self):
        """Number of (target, context) examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the ``(target_id, context_id)`` pair stored at *idx*."""
        return self.examples[idx]

def collate_fn(batch):
    """Stack ``(target_id, context_id)`` pairs into two long tensors.

    Returns ``(targets, contexts)``, both moved to the module-level
    ``device``.
    """
    target_ids = []
    context_ids = []
    for target_id, context_id in batch:
        target_ids.append(target_id)
        context_ids.append(context_id)

    targets = torch.tensor(target_ids, dtype=torch.long)
    contexts = torch.tensor(context_ids, dtype=torch.long)
    return targets.to(device), contexts.to(device)

# Training data: shuffled mini-batches assembled by collate_fn.
dataset = SkipGramDataset(tokenized_corpus, word_to_id, window_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Two (vocab_size x embedding_dim) parameter matrices, both Xavier-uniform
# initialised: W is indexed with target word ids and C with context and
# negative-sample ids in the training loop.
W = torch.nn.init.xavier_uniform_(torch.empty(vocab_size, embedding_dim, device=device))
W.requires_grad_(True)

C = torch.nn.init.xavier_uniform_(torch.empty(vocab_size, embedding_dim, device=device))
C.requires_grad_(True)

# Adam over both matrices; StepLR multiplies the learning rate by 0.9 on
# every scheduler.step() call.
optimizer = torch.optim.Adam([W, C], lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

def sample_negative(batch_targets, num_negative, vocab_size):
    """Draw uniform negative word ids that never equal the row's target id.

    Args:
        batch_targets: 1-D tensor of B word ids to exclude, one per row.
        num_negative: number of negatives to draw for each row.
        vocab_size: size of the vocabulary; ids are drawn from
            ``[0, vocab_size - 1]``.

    Returns:
        A ``(B, num_negative)`` long tensor on the same device as
        ``batch_targets``.
    """
    B = batch_targets.size(0)
    # Sample on the targets' own device instead of relying on a global
    # `device`, so the comparison below never mixes devices.
    neg_samples = torch.randint(
        0, vocab_size - 1, (B, num_negative), device=batch_targets.device
    )
    # The draw covers [0, vocab_size - 2]; shifting every value >= target up
    # by one skips the target while keeping the remaining ids equally likely.
    mask = neg_samples >= batch_targets.unsqueeze(1)
    neg_samples += mask.long()
    return neg_samples

# Skip-gram with negative sampling: each target's row of W is pushed towards
# its observed context's row of C and away from the rows of sampled negatives.
for epoch in range(num_epochs):
    total_loss = 0.0
    sample_count = 0  # counts batches, used to average the per-batch loss
    epoch_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}", unit="batch")
    for targets, contexts in epoch_bar:
        pos_embeds = W[targets]
        pos_scores = (pos_embeds * C[contexts]).sum(dim=1)
        # logsigmoid replaces log(sigmoid(x) + 1e-10): the epsilon form
        # saturates and loses its gradient for strongly wrong scores.
        pos_loss = -torch.nn.functional.logsigmoid(pos_scores).mean()

        # Negatives are drawn so that they never equal the row's target id.
        neg_samples = sample_negative(targets, num_negative, vocab_size)
        neg_embeds = C[neg_samples]
        neg_scores = (pos_embeds.unsqueeze(1) * neg_embeds).sum(dim=2)
        # Sum over the negatives of each example, then average over the batch.
        neg_loss = -torch.nn.functional.logsigmoid(-neg_scores).sum(dim=1).mean()

        loss = pos_loss + neg_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        sample_count += 1
        epoch_bar.set_postfix(loss=f"{loss.item():.4f}")
    # Decay the learning rate once per epoch.
    scheduler.step()
    avg_epoch_loss = total_loss / sample_count if sample_count > 0 else 0
    print(f"Epoch {epoch+1}, Average Loss: {avg_epoch_loss:.4f}")

# The exported vector of each word is the sum of its rows in W and C.
with torch.no_grad():
    word_vectors = (W + C).cpu().numpy()

# Bundle the id mapping with the vectors and write them to disk.
embedding_dict = dict(word_to_id=word_to_id, word_vectors=word_vectors)
torch.save(embedding_dict, "skipgram_manual.pt")

print("Skip-gram embeddings saved as skipgram_manual.pt")

Using device: cuda


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Vocabulary size: 26753
Loaded vocabulary size: 26753


Epoch 1: 100%|██████████| 57517/57517 [05:31<00:00, 173.74batch/s, loss=2.8274]


Epoch 1, Average Loss: 2.5265


Epoch 2: 100%|██████████| 57517/57517 [05:19<00:00, 180.26batch/s, loss=2.9136]


Epoch 2, Average Loss: 2.3932


Epoch 3: 100%|██████████| 57517/57517 [05:25<00:00, 176.62batch/s, loss=2.1991]


Epoch 3, Average Loss: 2.2918


Epoch 4: 100%|██████████| 57517/57517 [05:17<00:00, 181.12batch/s, loss=2.5104]


Epoch 4, Average Loss: 2.2151


Epoch 5: 100%|██████████| 57517/57517 [05:11<00:00, 184.38batch/s, loss=2.6108]


Epoch 5, Average Loss: 2.1546


Epoch 6: 100%|██████████| 57517/57517 [05:18<00:00, 180.74batch/s, loss=1.5990]


Epoch 6, Average Loss: 2.0924


Epoch 7: 100%|██████████| 57517/57517 [05:18<00:00, 180.82batch/s, loss=2.0380]


Epoch 7, Average Loss: 2.0277


Epoch 8: 100%|██████████| 57517/57517 [05:19<00:00, 180.25batch/s, loss=1.4837]


Epoch 8, Average Loss: 1.9636


Epoch 9: 100%|██████████| 57517/57517 [05:16<00:00, 181.50batch/s, loss=2.1201]


Epoch 9, Average Loss: 1.8999


Epoch 10: 100%|██████████| 57517/57517 [05:26<00:00, 176.41batch/s, loss=2.3489]

Epoch 10, Average Loss: 1.8459
Skip-gram embeddings saved as skipgram_manual.pt



