In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/brown-corpus/brown.csv
/kaggle/input/brown-corpus/cats.csv
/kaggle/input/brown-corpus/brown-meta.json
/kaggle/input/brown-corpus/brown/brown/cb09
/kaggle/input/brown-corpus/brown/brown/cb10
/kaggle/input/brown-corpus/brown/brown/cj06
/kaggle/input/brown-corpus/brown/brown/cd17
/kaggle/input/brown-corpus/brown/brown/ca35
/kaggle/input/brown-corpus/brown/brown/cg20
/kaggle/input/brown-corpus/brown/brown/cf02
/kaggle/input/brown-corpus/brown/brown/cc17
/kaggle/input/brown-corpus/brown/brown/cl07
/kaggle/input/brown-corpus/brown/brown/cj74
/kaggle/input/brown-corpus/brown/brown/cg40
/kaggle/input/brown-corpus/brown/brown/cl15
/kaggle/input/brown-corpus/brown/brown/cj72
/kaggle/input/brown-corpus/brown/brown/cm03
/kaggle/input/brown-corpus/brown/brown/cc03
/kaggle/input/brown-corpus/brown/brown/cg72
/kaggle/input/brown-corpus/brown/brown/cj16
/kaggle/input/brown-corpus/brown/brown/ch02
/kaggle/input/brown-corpus/brown/brown/cg34
/kaggle/input/brown-corpus/brown/brown/cj42
/kag

In [4]:
import pandas as pd
brown_df = pd.read_csv("/kaggle/input/brown-corpus/brown.csv")

In [5]:
brown_df.head()

Unnamed: 0,filename,para_id,sent_id,raw_text,tokenized_text,tokenized_pos,label
0,cd05,0,0,"Furthermore/rb ,/, as/cs an/at encouragement/n...","Furthermore , as an encouragement to revisioni...","rb , cs at nn in nn nn , pps rb bez jj to vb c...",religion
1,cd05,0,1,The/at Unitarian/jj clergy/nns were/bed an/at ...,The Unitarian clergy were an exclusive club of...,at jj nns bed at jj nn in vbn nns -- cs at nn ...,religion
2,cd05,0,2,"Ezra/np Stiles/np Gannett/np ,/, an/at honorab...","Ezra Stiles Gannett , an honorable representat...","np np np , at jj nn in at nn , vbd ppl rb in a...",religion
3,cd05,0,3,"Even/rb so/rb ,/, Gannett/np judiciously/rb ar...","Even so , Gannett judiciously argued , the Ass...","rb rb , np rb vbd , at nn-tl md rb vb cs np ``...",religion
4,cd05,0,4,We/ppss today/nr are/ber not/* entitled/vbn to...,We today are not entitled to excoriate honest ...,ppss nr ber * vbn to vb jj nns wps vbd np to b...,religion


no NAs

In [6]:
brown_df.isna().sum()

filename          0
para_id           0
sent_id           0
raw_text          0
tokenized_text    0
tokenized_pos     0
label             0
dtype: int64

In [7]:
brown_df["tokenized_text"][0]

'Furthermore , as an encouragement to revisionist thinking , it manifestly is fair to admit that any fraternity has a constitutional right to refuse to accept persons it dislikes .'

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
train_df, test_df = train_test_split(brown_df, test_size=0.2)

In [10]:
import re 
import torch
from torch.utils.data import Dataset
from collections import Counter



In [11]:
min_frequency = 5
whole_df = brown_df
all_tokens = []
for row in brown_df["tokenized_text"]:
    clean_text = re.sub(r"[^\w\s]", "", row.lower())
    tokens = clean_text.split()
    all_tokens.extend(tokens)

counts = Counter(all_tokens)

vocab_words = [w for w, c in counts.items() if c >= min_frequency]

vocab_words = sorted(vocab_words, key=lambda w: counts[w], reverse=True)

vocab_words.append("<UNK>")

vocab = vocab_words
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}



In [12]:
len(word2idx)

14175

In [13]:
len(idx2word)

14175

In [14]:
class CBOW_Dataset(Dataset):
    def __init__(self, df, window_size=2, min_frequency=5, word2idx=None, idx2word=None):
        self.data = df
        self.window_size = window_size
        self.word2idx = word2idx
        self.idx2word = idx2word 
        
        

        # Step 7: build (context, target) pairs
        self.pairs = []
        for text in self.data['tokenized_text']:
            tokens = re.sub(r"[^\w\s]", "", text.lower()).split()
            for i in range(self.window_size, len(tokens) - self.window_size):
                context = tokens[i - self.window_size:i] + tokens[i + 1:i + self.window_size + 1]
                target = tokens[i]

                # convert context and target to indices, using <UNK> if missing
                context_ids = [self.word2idx[w] if w in self.word2idx else self.word2idx["<UNK>"] for w in context]
                target_id = self.word2idx[target] if target in self.word2idx else self.word2idx["<UNK>"]

                self.pairs.append((torch.tensor(context_ids, dtype=torch.long),
                                   torch.tensor(target_id, dtype=torch.long)))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        return self.pairs[idx]

    def get_word2idx(self):
        return self.word2idx

    def get_idx2word(self):
        return self.idx2word


In [15]:
from torch.nn import Module
import torch.nn as nn
from torch.nn import functional as F 
class CBOW_MODEL(Module): 
    def __init__(self, vocab_size, embedding_size, context_size):  #context size is how many words around the the center word we're predicting
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_size) #create embedding layer 
        self.fc1 = nn.Linear(embedding_size* context_size, 128) #check why we do embedding size * context size 
        self.fc2 = nn.Linear(128, vocab_size) #project to a word within the vocab size
    
    def forward(self, input):  
        input = self.embedding_layer(input) #after doing this, our input will have shape: (batch, context_size, d_model), we need to flatten it before passing into fc1
        input = input.view(input.size(0), -1) #flattens the embedding and context size. New tensor is (batch_size, embedding_size * context size). Note that this linear layer is taking a
        #two dimensional output, but it's okay. The linear transformation is perforemd on the last dimension of the tensor (in this case, embedding_size * context_size)
        input = self.fc1(input)
        input = F.relu(input)
        logits = self.fc2(input)

        logits = F.log_softmax(logits, dim=1)

        #input = F.softmax(input, dim=-1) already incorporated in loss 
        return logits
    


In [16]:
train_dataset = CBOW_Dataset(train_df, window_size=2, min_frequency=5, word2idx=word2idx, idx2word=idx2word)
test_dataset = CBOW_Dataset(test_df, window_size=2, min_frequency=5, word2idx=word2idx, idx2word=idx2word)


In [17]:
num_epochs = 10
batch_size = 16 

In [18]:
from torch.utils.data import DataLoader

In [19]:
train_loader = DataLoader(train_dataset, batch_size, num_workers=2, shuffle=False)

In [20]:
test_loader = DataLoader(test_dataset, 
                         batch_size, 
                         num_workers = 2, 
                         shuffle=True
)


In [21]:
vocab_size = len(word2idx)

In [22]:
from torch import optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CBOW_MODEL(vocab_size=vocab_size, embedding_size=250, context_size=4).to(device)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.001)

In [23]:
torch.cuda.empty_cache()

In [24]:
n_total_steps = len(train_loader) 
for epoch in range(num_epochs):
    for i, (context,target) in enumerate(train_loader): 
        
        context = context.to(device)
        target = target.to(device)
        outputs = model(context) 
        loss_value = criterion(outputs, target) 
        optimizer.zero_grad()
        loss_value.backward()
        optimizer.step()

        if(i+1) % 2000 ==0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss_value.item():.4f}')


Epoch [1/10], Step [2000/39702], Loss: 9.5270
Epoch [1/10], Step [4000/39702], Loss: 7.4291
Epoch [1/10], Step [6000/39702], Loss: 8.4964
Epoch [1/10], Step [8000/39702], Loss: 9.0812
Epoch [1/10], Step [10000/39702], Loss: 7.4219
Epoch [1/10], Step [12000/39702], Loss: 7.9809
Epoch [1/10], Step [14000/39702], Loss: 6.6303
Epoch [1/10], Step [16000/39702], Loss: 7.1974
Epoch [1/10], Step [18000/39702], Loss: 7.0509
Epoch [1/10], Step [20000/39702], Loss: 6.7003
Epoch [1/10], Step [22000/39702], Loss: 6.2237
Epoch [1/10], Step [24000/39702], Loss: 8.6847
Epoch [1/10], Step [26000/39702], Loss: 8.1463
Epoch [1/10], Step [28000/39702], Loss: 6.8349
Epoch [1/10], Step [30000/39702], Loss: 6.0667
Epoch [1/10], Step [32000/39702], Loss: 6.6440
Epoch [1/10], Step [34000/39702], Loss: 7.1799
Epoch [1/10], Step [36000/39702], Loss: 7.1924
Epoch [1/10], Step [38000/39702], Loss: 6.1190
Epoch [2/10], Step [2000/39702], Loss: 6.8721
Epoch [2/10], Step [4000/39702], Loss: 6.5380
Epoch [2/10], Step 

In [25]:
model.eval()  # set model to evaluation mode

with torch.no_grad():
    n_correct = 0
    n_samples = 0
    vocab_size = len(word2idx)  # size of your vocabulary
    n_class_correct = [0 for _ in range(vocab_size)]
    n_class_samples = [0 for _ in range(vocab_size)]

    for context_idxs, target_idxs in test_loader:
        context_idxs = context_idxs.to(device)  # shape: (batch_size, context_size)
        target_idxs = target_idxs.to(device)    # shape: (batch_size,)

        outputs = model(context_idxs)  # shape: (batch_size, vocab_size)
        _, predicted = torch.max(outputs, dim=1)  # predicted word indices

        n_samples += target_idxs.size(0)
        n_correct += (predicted == target_idxs).sum().item()

        # per-word accuracy
        for i in range(len(target_idxs)):
            label = target_idxs[i].item()
            pred = predicted[i].item()
            if label == pred:
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    overall_acc = 100.0 * n_correct / n_samples
    print(f'Overall accuracy: {overall_acc:.2f}%')

    # per-word accuracy (optional, can be slow for large vocab)
    for i in range(vocab_size):
        if n_class_samples[i] > 0:
            word_acc = 100.0 * n_class_correct[i] / n_class_samples[i]
            print(f'Accuracy of "{idx2word[i]}": {word_acc:.2f}%')


Overall accuracy: 17.93%
Accuracy of "the": 79.83%
Accuracy of "of": 67.67%
Accuracy of "and": 24.84%
Accuracy of "to": 47.70%
Accuracy of "a": 21.45%
Accuracy of "in": 21.83%
Accuracy of "that": 18.15%
Accuracy of "is": 22.21%
Accuracy of "was": 17.38%
Accuracy of "he": 34.21%
Accuracy of "for": 2.09%
Accuracy of "it": 22.67%
Accuracy of "with": 3.11%
Accuracy of "as": 26.96%
Accuracy of "his": 6.51%
Accuracy of "on": 0.66%
Accuracy of "be": 66.76%
Accuracy of "at": 13.09%
Accuracy of "by": 4.09%
Accuracy of "i": 10.31%
Accuracy of "this": 0.34%
Accuracy of "had": 26.91%
Accuracy of "not": 37.86%
Accuracy of "are": 10.64%
Accuracy of "but": 1.06%
Accuracy of "from": 1.77%
Accuracy of "or": 0.14%
Accuracy of "have": 20.20%
Accuracy of "an": 0.00%
Accuracy of "they": 14.29%
Accuracy of "which": 5.23%
Accuracy of "were": 3.28%
Accuracy of "one": 9.05%
Accuracy of "you": 6.25%
Accuracy of "her": 0.00%
Accuracy of "all": 1.11%
Accuracy of "she": 0.00%
Accuracy of "there": 15.23%
Accuracy o

In [26]:
torch.save(model,"cbow_model.pt")

In [27]:
embed_layer = model.embedding_layer.weight

In [39]:
import pickle

my_dictionary= word2idx

# Save the dictionary to a file
with open("word2idx.pkl", "wb") as file:
    pickle.dump(my_dictionary, file)



# Load the dictionary back from the file
with open("word2idx.pkl", "rb") as file:
    loaded_dictionary = pickle.load(file)

print(loaded_dictionary)




In [29]:
import torch
import torch.nn.functional as F

def closest_words(model, word, word_to_idx, idx_to_word, top_k=10):
    # 1. look up index of the word
    if word not in word_to_idx:
        return f"'{word}' not in vocabulary"

    word_idx = word_to_idx[word]

    # 2. get embedding for the query word (1 x d)
    word_vec = model.embedding_layer.weight[word_idx]  # (d,)
    word_vec = word_vec.unsqueeze(0)                   # (1, d)

    # 3. get all embeddings (vocab_size x d)
    all_embeds = model.embedding_layer.weight          # (V, d)

    # 4. compute cosine similarity with every word
    sims = F.cosine_similarity(word_vec, all_embeds)   # (V,)

    # 5. sort by similarity (descending)
    topk = torch.topk(sims, top_k + 1)                 # +1 to skip the word itself

    # 6. convert indices back to words
    results = []
    for idx in topk.indices:
        w = idx_to_word[int(idx)]
        if w != word:   # skip the query word
            results.append(w)

        if len(results) == top_k:
            break

    return results
    


In [34]:
results = closest_words(model, "hello", word2idx, idx2word, top_k=10)

In [35]:
print(len(idx2word))

14175


In [36]:
print(len(word2idx))

14175


In [37]:
print(results)

['defenses', '1949', 'leavitt', 'tenants', 'gave', 'truthfully', 'fingerprint', 'smile', 'disputes', 'warmed']
