# 1. Import Modules and Prepare sample data

In [207]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np

In [208]:
# Toy training corpus: three short sentences that all start with "hello".
# This raw text is the corpus; the vocabulary (the set of unique tokens)
# is derived from it in the tokenization step.
text = (
    "hello world. "
    "hello machine learning. "
    "hello neural networks."
)

# 2. Tokenization

Converting raw text into a numerical format that neural networks can understand.
By creating a vocabulary and mapping words to integers, we prepare the data for training a next-token prediction model.

1. Tokenize text into words → words
2. Build a vocabulary of unique words → vocab
3. Map words to IDs → word_to_idx
4. Map IDs back to words → idx_to_word
5. Convert the input text to token IDs → token_ids

In [209]:
# Lower-case the corpus and split on whitespace only. Punctuation is NOT
# stripped, so a sentence-final word keeps its period ("world." is one token).
words = text.lower().split()

# Sorting the unique tokens makes the id assignment deterministic.
vocab = sorted(set(words))

# Lookup tables: token -> id, and the inverse id -> token.
word_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_word = dict(enumerate(vocab))

# The corpus re-expressed as a sequence of integer ids.
token_ids = [word_to_idx[token] for token in words]

# 3. Create training data

Preparing input-output pairs for next-token prediction by sliding a window of size 1 over the tokenized text.

  1. Take each token as input (X) and the next token as target (y).
  2. Convert X and y lists into PyTorch tensors for training.
  3. One-hot encode the input tokens so the network receives numerical vectors:
      * Length of each vector = vocabulary size
      * Only the index corresponding to the token is set to 1

3.1 Input (X) and Target (y) Creation Using Sliding Window

In [210]:
# Next-token pairs from a sliding window of size 1:
# every token except the last is an input, and the token right after it is
# the target. Slicing the id list twice yields the two aligned sequences.
X = torch.tensor(token_ids[:-1])  # inputs:  token i
y = torch.tensor(token_ids[1:])   # targets: token i + 1

3.2 One-hot encode inputs

In [211]:
# Each input vector has one slot per vocabulary entry.
input_size = len(vocab)

# One row per training input, with a single 1.0 at the column given by the
# token id. one_hot() returns integers, so cast to float for the linear layers.
X_onehot = nn.functional.one_hot(X, num_classes=input_size).float()


# 4. Define Model

4.1 A simple Feedforward neural network in PyTorch for next-token prediction.

In [212]:
# The network emits one logit per vocabulary entry.
output_size = len(vocab)
# Width of the single hidden layer (a free hyperparameter).
hidden_size = 16

class FeedforwardNN(nn.Module):
    """Two-layer feedforward network: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features in each input vector.
        hidden_size: Number of units in the hidden layer.
        output_size: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Input -> hidden projection.
        self.fc1 = nn.Linear(input_size, hidden_size)
        # Hidden -> output projection.
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return raw logits for ``x``; no softmax is applied here."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Seed PyTorch's RNG before the layers are created so the random weight
# initialisation is identical on every Restart & Run All. This matters here:
# in the training text "hello" is followed by three different words equally
# often, so which one the trained model ranks first depends on the initial
# weights.
torch.manual_seed(42)
model = FeedforwardNN(input_size, hidden_size, output_size)


# 5. Training

Optimize the model to accurately predict the next token.

1. Loss Function: Use CrossEntropyLoss to compare predicted logits with true token IDs.
2. Optimizer: Use Adam to update network weights via backpropagation.
3. Training Loop:
    * Zero gradients for each iteration
    * Forward pass to compute predictions
    * Compute loss and perform backpropagation
    * Update weights
4. Repeat for multiple epochs; the loss should decrease and then level off as the model learns the token patterns.

In [213]:
# CrossEntropyLoss takes the raw logits and the integer target ids directly.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

epochs = 500
log_every = 100  # print the loss once per this many epochs

# Full-batch training: every epoch is a single optimisation step over all pairs.
# The loss cannot reach zero on this corpus: "hello" is the input in 3 of the
# 7 training pairs, each time with a different target, so the mean
# cross-entropy levels off near 3/7 * ln(3) ≈ 0.471.
for epoch in range(epochs):
    optimizer.zero_grad()             # clear gradients from the previous step
    outputs = model(X_onehot)         # forward pass -> logits
    loss = criterion(outputs, y)
    loss.backward()                   # backpropagate
    optimizer.step()                  # update the weights

    reached_log_point = (epoch + 1) % log_every == 0
    if reached_log_point:
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')


Epoch 100/500, Loss: 0.4787
Epoch 200/500, Loss: 0.4731
Epoch 300/500, Loss: 0.4719
Epoch 400/500, Loss: 0.4715
Epoch 500/500, Loss: 0.4713


# 6. Prediction

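Predict the next word for 'hello'. Generate predictions from the trained model. Note that in the training text 'hello' is followed by 'world.', 'machine' and 'neural' exactly once each, so the model gives them nearly equal scores and which one wins the argmax depends on the random weight initialization.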

1. Prepare Input: Convert a word to its one-hot vector representation.
2. Forward Pass: Feed the input into the trained network.
3. Prediction: Take the argmax of the output logits to get the predicted token ID.
4. Map Back to Word: Convert the predicted token ID to its corresponding word using idx_to_word.

In [214]:
# Query word; it must be a key of word_to_idx.
input_word = "hello"
input_idx = torch.tensor([word_to_idx[input_word]])

# Single-row one-hot encoding of the query, cast to float for the network.
input_onehot = nn.functional.one_hot(input_idx, num_classes=input_size).float()

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(input_onehot)
    # The highest-scoring logit gives the predicted next-token id.
    predicted_idx = logits.argmax(dim=1).item()

print(f"Next word after '{input_word}' is '{idx_to_word[predicted_idx]}'")


Next word after 'hello' is 'neural'
