> **Note:** This notebook is my personal practice notebook on Feedforward neural language model
>  
> I’m following along with the course materials and using ideas/code inspired by:
> 
> - [CMU ANLP Course Page](https://cmu-l3.github.io/anlp-spring2025/)  
>   *(Lecture 2: Neural Bag-of-Words Classifiers, Spring 2025)*  

## Notebook Outline

1. [Training Tokenizer](#Training-Tokenizer)
2. [Data Loading](#Data-Loading)
3. [Understanding Concepts](#Understanding-Concepts)
4. [Continuous Bag of Words](#Continuous-Bag-of-Words)

In [61]:
# Imports
import json
import os
import pandas as pd
import random
import time
import torch
import sentencepiece as spm

from torch import nn
from torch.nn import functional as F

### Training Tokenizer

In [2]:
# Understanding tweet sentiment dataset (https://huggingface.co/datasets/mteb/tweet_sentiment_extraction)
df = pd.read_json('train.jsonl', lines=True)
df.head()

Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative


In [11]:
# Writing all the sentences into a single text file
with open('bow_tokenizer.txt', 'w', encoding='utf-8') as f:
    for _, text in df['text'].items():
        f.write(text)
        f.write('\n')


In [67]:
# Training a sentence piece trainer 
spm.SentencePieceTrainer.train(
    input='bow_tokenizer.txt',
    input_format='text',
    model_prefix='bow_tok',
    model_type='bpe',
    vocab_size=1024,
    byte_fallback=True,
    num_threads=os.cpu_count()
)

In [69]:
sp = spm.SentencePieceProcessor()
sp.load('bow_tok.model')

vocab = [[sp.id_to_piece(idx), idx]for idx in range(sp.get_piece_size())]
print(vocab[1000:1020])

[["'", 1000], ['4', 1001], ['K', 1002], [')', 1003], ['5', 1004], ['&', 1005], ['(', 1006], ['V', 1007], ['q', 1008], ['6', 1009], ['#', 1010], ['8', 1011], ['7', 1012], ['9', 1013], [';', 1014], ['<', 1015], ['@', 1016], ['=', 1017], ['¿', 1018], ['ï', 1019]]


### Data Loading

Read in the data, tokenize it, and split it into a training and dev set. 

In [70]:
def dataset_creation(file_name:str, sp:spm.SentencePieceProcessor):
    '''Read file and convert each sentence into tokens and also map tokens to a label'''
    df = pd.read_json(file_name, lines=True)
    for _, row in df.iterrows():
        label_to_text[row['label']]=row['label_text']
        tokens = sp.encode(row['text'])
        yield (tokens, row['label'])

In [71]:
label_to_text = {}
df = pd.read_json('train.jsonl', lines=True)
for _, row in df.iterrows():
    label_to_text[row['label']]=row['label_text']

# Read in the data
ds = list(dataset_creation("train.jsonl", sp))
random.shuffle(ds)
train = ds[:-1000]
dev = ds[1000:]

nwords = len(sp)
ntags = len(label_to_text)

### Understanding Concepts

In [72]:
class Embedding(nn.Module):
    '''Building embedddin layer from scratch'''
    def __init__(self, vocab_size, embedding_size):
        super(Embedding, self).__init__()
        self.weight = nn.Parameter(torch.randn(vocab_size, embedding_size))
        self.vocab_size = vocab_size

        nn.init.xavier_uniform_(self.weight)
    def forward(self, x):
        x = F.one_hot(x, num_classes=self.vocab_size).float
        x = torch.matmul(x, self.weight)
        return x

In [73]:
def ce_loss(logits, target):
    '''Computes cross entropy loss'''
    log_probs = F.log_softmax(logits, dim=1)
    loss = -log_probs[torch.arange(len(logits)), target]
    return loss.mean()


In [74]:
ce_loss(torch.tensor([[19.1, 9.5, 0.15], [0.2, 9.1, 0.2], [0.2, 9.1, 1.3]]), torch.tensor([0, 1, 1]))

tensor(0.0003)

### Continuous Bag of Words

In [75]:
class CBoW(nn.Module):
    def __init__(self, vocab_size, embedding_size, num_labels):
        super(CBoW, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.output_layer = nn.Linear(embedding_size, num_labels)

        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_uniform_(self.output_layer.weight)
    
    def forward(self, tokens):
        emb = self.embedding(tokens)
        emb_sum = torch.sum(emb, dim=0)
        h = emb_sum.view(1, -1)
        logits = self.output_layer(h)
        return logits


In [76]:
EMB_SIZE=32
model = CBoW(nwords, EMB_SIZE, ntags)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

for ITER in range(5):
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    model.train()
    for x, y in train:
        x = torch.tensor(x, dtype=torch.long)
        y = torch.tensor([y])
        logits = model(x)
        loss = criterion(logits, y)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (
                ITER, train_loss/len(train), time.time()-start))
    model.eval()
    # Perform testing
    test_correct = 0.0
    for x, y in dev:
        x = torch.tensor(x, dtype=torch.long)
        logits = model(x)[0].detach()
        predict = logits.argmax().item()
        if predict == y:
            test_correct += 1
    print("iter %r: dev acc=%.4f" % (ITER, test_correct/len(dev)))

iter 0: train loss/sent=0.8728, time=27.18s
iter 0: dev acc=0.6682
iter 1: train loss/sent=0.8068, time=28.56s
iter 1: dev acc=0.6751
iter 2: train loss/sent=0.7928, time=27.80s
iter 2: dev acc=0.6783
iter 3: train loss/sent=0.7855, time=28.80s
iter 3: dev acc=0.6798
iter 4: train loss/sent=0.7819, time=31.19s
iter 4: dev acc=0.6777
