> **Note:** This notebook is my personal practice notebook on Feedforward neural language model
>  
> I’m following along with the course materials and using ideas/code inspired by:
> 
> - [CMU ANLP Course Page](https://cmu-l3.github.io/anlp-spring2025/)  
>   *(Lecture 3: Language Modeling Fundamentals, Spring 2025)*  

## Notebook Outline

1. [Build the Dataset](#Build-the-dataset)
2. [Define the Model](#Define-the-model)
3. [Training](#Training)
4. [Conditional Generation](#Conditional-generation)

In [41]:
# Imports
import torch
import random
from torch import nn, optim

from typing import Dict, List, Tuple
random.seed(123)

### Build the Dataset

In [6]:
# Understanding the dataset
data=open('names.txt').read().splitlines()
print(data[:10])

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [7]:
# Tokenizing the letters by mapping it to an index

token_to_index = {token: i for i,token in enumerate('abcdefghijklmnopqrstuvwxyz')}
token_to_index['[S]'] = 26 # Appending [S] token
index_to_token = {i:token for token,i in token_to_index.items()}

In [8]:
# Creating the Dataset class
# Our dataset consists of x, y pairs, where x is a (n-1) token context and y is a token

class Dataset:
    """
    Dataset class for building context-target pairs from a sequence of tokens.
    """
    def __init__(self, context_size:int, token_to_index:Dict[str, int]):
        self.context_size = context_size
        self.token_to_index = token_to_index

    def build_dataset(self, data:List[str]) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Build the dataset of context-target pairs.

        Args:
            data (List[str]): List of string sequences.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                X: Tensor of shape (num_samples, context_size)
                Y: Tensor of shape (num_samples,)
        """ 
        X, Y = [], []
        for item in data:
            context = [self.token_to_index['[S]']] * self.context_size
            tokens = list(item) + ['[S]'] # Appending an [S] token to mark the end of the sequence.
            for token in tokens:
                X.append(context)
                Y.append(self.token_to_index[token])
                context = context[1:] + [self.token_to_index[token]]
        X = torch.tensor(X)
        Y = torch.tensor(Y)
        return X, Y

In [9]:
## Building the datasets

# shuffle the data
random.shuffle(data)

# train/val/test split as 80/10/10
n1 = int(0.8 * len(data))
n2 = int(0.9 * len(data))

context_size = 5 # taking last five characters as a context
dataset = Dataset(context_size=context_size, token_to_index=token_to_index)
x_train, y_train = dataset.build_dataset(data[:n1])
x_val, y_val = dataset.build_dataset(data[n1:n2])
x_test, y_test = dataset.build_dataset(data[n2:])

In [10]:
# Understanding the created dataset
print(x_train[:10])
print(y_train[:10])

tensor([[26, 26, 26, 26, 26],
        [26, 26, 26, 26, 11],
        [26, 26, 26, 11, 20],
        [26, 26, 11, 20,  0],
        [26, 11, 20,  0, 13],
        [11, 20,  0, 13, 13],
        [26, 26, 26, 26, 26],
        [26, 26, 26, 26, 18],
        [26, 26, 26, 18,  7],
        [26, 26, 18,  7,  0]])
tensor([11, 20,  0, 13, 13, 26, 18,  7,  0,  8])


### Define the Model

In [85]:
class MLPLM(nn.Module):
    '''Defining the Multi Layer Perceptro based Language model'''
    def __init__(self, vocab_size:int, embedding_size:int, context_size:int, hidden_size:int):
        super(MLPLM, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.fc1 = nn.Linear(in_features=context_size*embedding_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.shape[0], -1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return x

In [86]:
model = MLPLM(
    vocab_size=len(token_to_index),
    embedding_size=64,
    context_size=5,
    hidden_size=32
)

In [87]:
output=model(x_train[:2])
print(output.shape)

torch.Size([2, 27])


### Training

In [88]:
# Defining the model
model = MLPLM(
    vocab_size=len(token_to_index),
    embedding_size=64,
    context_size=5,
    hidden_size=32
)

# Hyper parameters
learning_rate = 0.001
num_epochs = 10
batch_size = 32

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [89]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i in range(0, len(x_train), batch_size):
        x_batch = x_train[i:i+batch_size]
        y_batch = y_train[i:i+batch_size]

        # forward pass
        outputs = model(x_batch)
        loss = criterion(outputs, y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    average_loss = total_loss / (len(x_train)// batch_size)
    print(f'epoch {epoch}: Average loss is {average_loss}')

epoch 0: Average loss is 2.46578948694363
epoch 1: Average loss is 2.367275342753059
epoch 2: Average loss is 2.3490918047595444
epoch 3: Average loss is 2.3385062630971274
epoch 4: Average loss is 2.3324026145642263
epoch 5: Average loss is 2.327908363802391
epoch 6: Average loss is 2.324646507271549
epoch 7: Average loss is 2.322029471815678
epoch 8: Average loss is 2.319872084542325
epoch 9: Average loss is 2.317828746783106


### Conditional Generation

In [106]:
def generator(model, context, max_length=25):
    model.eval()
    output = []

    with torch.no_grad():
        context = torch.tensor(context).unsqueeze(0)
        for _ in range(max_length):
            logits = model(context)
            probs = torch.softmax(logits, dim=-1)
            token = torch.multinomial(probs, num_samples=1) # intuition: Instead of just multinomial, probably use topk with multinomial for better results
            context = torch.cat([context[:, 1:], token], dim=-1)

            output.append(index_to_token[token.item()])
            if index_to_token[token.item()] == '[S]':
                return ''.join(output)[:-3]
    return ''.join(output)[:-3]

In [107]:
prompts = ['so', 'so', 'so', 'so']

for prompt in prompts:
    output = generator(model, ([token_to_index['[S]']] * (context_size-len(prompt))) + [token_to_index[c] for c in prompt])
    print(prompt+output)

sockenmene
sogbabel
somiacausyn
sofiquiat
