![ffn_lm](./images/ffn_class.png)

**Building a feedforward neural network language model for next-word prediction (forward inference) and text generation**

In [1]:
import re
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

In [2]:
# Read the whole corpus and lowercase it so the vocabulary is case-insensitive.
with open("./data/input.txt", "r", encoding="utf-8") as f:
    text = f.read().lower()

# preprocess text: every character that is not an ASCII letter or digit is
# replaced by a space (this also splits contractions at the apostrophe), then
# the text is split on whitespace into word tokens.
# `re` is imported in the imports cell at the top of the notebook.
text = re.sub(r"[^a-zA-Z0-9]", " ", text)
tokens = text.split()

In [3]:
# sanity check: total number of tokens and the first ten of them
len(tokens), tokens[:10]

(208530,
 ['first',
  'citizen',
  'before',
  'we',
  'proceed',
  'any',
  'further',
  'hear',
  'me',
  'speak'])

In [4]:
# Count every token; the Counter's keys (in first-seen order) form the vocabulary.
word_count = Counter(tokens)

# Two lookup tables over the same vocabulary: id -> word and word -> id.
index_to_word = dict(enumerate(word_count))
word_2_index = {word: index for index, word in index_to_word.items()}

vocab_size = len(word_2_index)

In [5]:
# show the vocabulary size
print(vocab_size)

11456


In [12]:
# creating a dataset with a sequence_length as the context (window size)
class TextDataset(Dataset):
    """Sliding-window dataset for next-word prediction.

    Each example pairs ``sequence_length`` consecutive token ids (the context)
    with the id of the token that immediately follows them (the target).

    Args:
        tokens: iterable of word strings, in corpus order.
        word_2_index: mapping from word string to integer id.
        sequence_length: number of context tokens per example.
    """

    def __init__(self, tokens, word_2_index, sequence_length):
        self.tokens = tokens
        self.word_2_index = word_2_index
        self.sequence_length = sequence_length
        # Words missing from the vocabulary are dropped (there is no <unk> id),
        # so a window may bridge over a removed word.
        self.int_text = [
            self.word_2_index[word] for word in self.tokens if word in self.word_2_index
        ]

    def __len__(self):
        """Number of complete (context, target) examples available."""
        # Clamp at zero: when the encoded text is not longer than the window
        # there is no complete example, and a negative value would make
        # ``len(dataset)`` raise ValueError.
        return max(0, len(self.int_text) - self.sequence_length)

    def __getitem__(self, index):
        """Return ``(context, target)`` for the window starting at ``index``.

        ``context`` is a 1-D ``torch.long`` tensor of ``sequence_length`` ids and
        ``target`` is a 0-D ``torch.long`` tensor holding the next token id.
        Raises IndexError when the target position is past the end of the text.
        """
        target_pos = index + self.sequence_length
        return (
            torch.tensor(self.int_text[index:target_pos], dtype=torch.long),
            torch.tensor(self.int_text[target_pos], dtype=torch.long),
        )

In [13]:
# testing the TextDataset class: build it with a 5-token context window and
# fetch one (context, next-token) pair by index
dataset = TextDataset(tokens, word_2_index, 5)
dataset[26569]

(tensor([ 103, 3502,   32,  490, 3621]), tensor(217))

In [14]:
# A feedforward model for text generation
class FFn_LM(nn.Module):
    """Feedforward language model over averaged word embeddings.

    The context token ids are embedded, the embeddings are averaged over
    dimension 1, and the pooled vector goes through one hidden ReLU layer
    followed by a linear layer that yields ``output_dim`` scores.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word embedding.
        hidden_dim: width of the hidden layer.
        output_dim: number of output scores per example.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for saved checkpoints to load.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return unnormalised scores for a batch of context id tensors.

        ``x`` is expected to be an integer tensor whose dimension 1 indexes the
        context positions (e.g. ``(batch, seq_len)``); averaging over that
        dimension means word order inside the window is ignored.
        """
        pooled = self.embedding(x).mean(dim=1)
        hidden = F.relu(self.fc1(pooled))
        return self.fc2(hidden)

In [15]:
# Split the token stream into a training part and a held-out part.
# shuffle=False is essential here: train_test_split shuffles its input by
# default, which would scramble the individual words so that every context
# window is a bag of random words rather than a run of real text. Without
# shuffling, the first 80% of the corpus is used for training and the final
# 20% for testing, and word order is preserved inside each part.
tokens_train, test_token = train_test_split(tokens, test_size=0.2, shuffle=False)

# building the dataloader
sequence_length = 8  # number of context words fed to the model
batch_size = 64

train_dataset = TextDataset(tokens_train, word_2_index, sequence_length)
test_dataset = TextDataset(test_token, word_2_index, sequence_length)

# Only the order of the training *examples* (whole windows) is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [16]:
# Model hyperparameters; the output layer produces one score per vocabulary word.
embedding_dim, hidden_dim = 256, 512
output_dim = vocab_size

# Network, loss and optimiser used by the training loop.
model = FFn_LM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [18]:
# Train for a fixed number of epochs: one optimiser step per mini-batch,
# printing the current batch loss on every 100th batch.
num_epochs = 20
for epoch in range(num_epochs):
    for i, batch in enumerate(train_loader):
        seq, label = batch

        optimizer.zero_grad()
        loss = loss_function(model(seq), label)
        loss.backward()
        optimizer.step()

        # Skip logging unless this is a multiple-of-100 batch (1-based).
        if (i + 1) % 100:
            continue
        print(f"Epoch: {epoch+1}, Batch: {i+1}, Loss: {loss.item():.4f}")

Epoch: 1, Batch: 100, Loss: 6.8662
Epoch: 1, Batch: 200, Loss: 6.9605
Epoch: 1, Batch: 300, Loss: 6.5465
Epoch: 1, Batch: 400, Loss: 6.0991
Epoch: 1, Batch: 500, Loss: 6.5380
Epoch: 1, Batch: 600, Loss: 6.5286
Epoch: 1, Batch: 700, Loss: 6.4445
Epoch: 1, Batch: 800, Loss: 6.3816
Epoch: 1, Batch: 900, Loss: 6.4270
Epoch: 1, Batch: 1000, Loss: 6.4918
Epoch: 1, Batch: 1100, Loss: 6.8839
Epoch: 1, Batch: 1200, Loss: 7.2240
Epoch: 1, Batch: 1300, Loss: 6.5994
Epoch: 1, Batch: 1400, Loss: 6.7963
Epoch: 1, Batch: 1500, Loss: 6.4626
Epoch: 1, Batch: 1600, Loss: 6.3890
Epoch: 1, Batch: 1700, Loss: 6.7548
Epoch: 1, Batch: 1800, Loss: 6.1394
Epoch: 1, Batch: 1900, Loss: 6.5871
Epoch: 1, Batch: 2000, Loss: 6.5121
Epoch: 1, Batch: 2100, Loss: 6.2538
Epoch: 1, Batch: 2200, Loss: 6.6523
Epoch: 1, Batch: 2300, Loss: 6.5359
Epoch: 1, Batch: 2400, Loss: 6.7929
Epoch: 1, Batch: 2500, Loss: 6.2310
Epoch: 1, Batch: 2600, Loss: 7.1097
Epoch: 2, Batch: 100, Loss: 5.9863
Epoch: 2, Batch: 200, Loss: 6.3892
Epo

In [25]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    At each step the most recent ``max_sequence_len`` known token ids are fed
    to ``model`` and the highest-scoring word is appended.

    Args:
        seed_text: whitespace-separated starting text.
        next_words: number of words to generate.
        model: callable mapping a ``(1, n)`` long tensor of token ids to a
            ``(1, vocab)`` tensor of scores.
        max_sequence_len: maximum number of trailing tokens used as context.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.

    Raises:
        ValueError: if words must be generated but ``seed_text`` contains no
            word present in ``word_2_index``.

    Relies on the notebook-level ``word_2_index`` / ``index_to_word`` tables.
    """
    # Encode the seed once and keep a running id list instead of re-splitting
    # the growing string every step. Unknown words are dropped, the same rule
    # TextDataset applies, instead of raising KeyError.
    context = [word_2_index[word] for word in seed_text.split() if word in word_2_index]
    if next_words > 0 and not context:
        raise ValueError("seed_text contains no words from the vocabulary")

    # Inference only: no autograd graph needs to be recorded.
    with torch.no_grad():
        for _ in range(next_words):
            window = torch.tensor(
                context[-max_sequence_len:], dtype=torch.long
            ).unsqueeze(0)  # add the batch dimension
            scores = model(window)
            _, best = torch.max(scores, dim=1)
            next_index = best.item()
            context.append(next_index)
            seed_text += " " + index_to_word[next_index]
    return seed_text

In [31]:
# generate 10 more words from a seed phrase, using at most the last 8 words as context
print(generate_text("to be or not to be that", 10, model, 8))

to be or not to be that a break tis purple you the i hand i strike


In [32]:
# Persist only the learned parameters (the state_dict), not the whole module.
checkpoint_path = "./models/ffn_lm.pth"
torch.save(model.state_dict(), checkpoint_path)

# Rebuild a fresh model with the same hyperparameters and restore the weights.
model = FFn_LM(vocab_size, embedding_dim, hidden_dim, output_dim)
restored_state = torch.load(checkpoint_path)
model.load_state_dict(restored_state)

# The reloaded model should reproduce the earlier generation.
print(generate_text("to be or not to be that", 10, model, 8))

to be or not to be that a break tis purple you the i hand i strike
