We use character-level language modeling to generate text similar to math phrases scraped from the [Encylopedia of Mathematics](https://encyclopediaofmath.org/wiki/Special:AllPages).

This is heavily based on Chapter 15 of **Machine Learning with PyTorch and Scikit-Learn** by Raschka, Liu, and Mirjalili. The main technical difference is that I did not use an embedding as the first layer of the network, but instead used a one-hot encoding.

In [31]:
import numpy as np
import random

In [32]:
with open('math_phrases_final.txt', 'r', encoding="utf8") as fp:
    text = fp.read()
phrase_list = text.split('\n')
phrase_list = phrase_list[:-1]
random.seed(42)
random.shuffle(phrase_list)
text = '\n'.join(phrase_list)
char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 206528
Unique Characters: 97


In [33]:
chars_sorted = sorted(char_set)
char2int = {ch : i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)
print('Text encoded shape:', text_encoded.shape, '\n')
print(text[:11], '==> Encoded:', text_encoded[:11], '\n')
print(text_encoded[12:28], '==> Decoded:', ''.join(char_array[text_encoded[12:28]]))

Text encoded shape: (206528,) 

Moore space ==> Encoded: [32 61 61 64 51  1 65 62 47 49 51] 

[23 47 64 59 61 55 65  8  1 26 51 61 64 53 51 65] ==> Decoded: Darmois, Georges


In [34]:
import torch
from torch.utils.data import Dataset
seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i : i + chunk_size] for i in range(len(text_encoded) - chunk_size)]
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks
    def __len__(self):
        return len(self.text_chunks)
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]
        return text_chunk[:-1].long(), text_chunk[1:].long()
seq_dataset = TextDataset(torch.tensor(text_chunks))

In [35]:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x): ', repr(''.join(char_array[seq])))
    print('Target (y): ', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break

 Input (x):  'Moore space\nDarmois, Georges\nSymplectic '
Target (y):  'oore space\nDarmois, Georges\nSymplectic s'

 Input (x):  'oore space\nDarmois, Georges\nSymplectic s'
Target (y):  'ore space\nDarmois, Georges\nSymplectic sp'



In [36]:
from torch.utils.data import DataLoader
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

In [37]:
import torch.nn as nn
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(vocab_size, rnn_hidden_size, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)
    def forward(self, x, hidden, cell):
        out = torch.nn.functional.one_hot(x, num_classes=97).float().unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell
    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden, cell

In [38]:
vocab_size = len(char_array)
embed_dim = vocab_size
rnn_hidden_size = 512
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model

RNN(
  (rnn): LSTM(97, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=97, bias=True)
)

In [39]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [40]:
from datetime import datetime
start_time = datetime.now()

num_epochs = 10000
torch.manual_seed(1)
for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
    loss.backward()
    optimizer.step()
    loss = loss.item() / seq_length
    if epoch % 100 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')
        print('Time elapsed: ', datetime.now() - start_time)

Epoch 0 loss: 4.5786
Time elapsed:  0:00:00.605819
Epoch 100 loss: 3.2197
Time elapsed:  0:01:01.830133
Epoch 200 loss: 2.7251
Time elapsed:  0:02:00.716220
Epoch 300 loss: 2.4723
Time elapsed:  0:03:01.245320
Epoch 400 loss: 2.2895
Time elapsed:  0:04:02.286744
Epoch 500 loss: 2.2854
Time elapsed:  0:05:08.924041
Epoch 600 loss: 2.1464
Time elapsed:  0:06:11.047495
Epoch 700 loss: 2.1592
Time elapsed:  0:07:17.325632
Epoch 800 loss: 2.1319
Time elapsed:  0:08:17.632523
Epoch 900 loss: 2.0456
Time elapsed:  0:09:17.320054
Epoch 1000 loss: 1.9324
Time elapsed:  0:10:17.900096
Epoch 1100 loss: 1.8673
Time elapsed:  0:11:17.895343
Epoch 1200 loss: 1.9153
Time elapsed:  0:12:18.378661
Epoch 1300 loss: 1.8174
Time elapsed:  0:13:19.031256
Epoch 1400 loss: 1.7354
Time elapsed:  0:14:19.073976
Epoch 1500 loss: 1.7110
Time elapsed:  0:15:22.502012
Epoch 1600 loss: 1.7134
Time elapsed:  0:16:26.297331
Epoch 1700 loss: 1.6507
Time elapsed:  0:17:24.587615
Epoch 1800 loss: 1.5146
Time elapsed:  0

In [41]:
from torch.distributions.categorical import Categorical
def sample(model, starting_str, len_generated_text=5000, scale_factor=1.0):
    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))
    generated_str = starting_str
    model.eval()
    hidden, cell = model.init_hidden(1)
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell)
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])
    return generated_str

In [44]:
torch.manual_seed(1)
print(sample(model, starting_str='Algebraic', scale_factor=0.9))

Algebraic equality of a complex varistictest
Tirchequiforbiling operator
Ergodic shmorigh
Frotuet-Dönitolly space
Prepistul-Lyoption
Cohm-group
Bunnagow-Lagrange method
Dios a coation
Franti-gemeeribility transformation
Singular semi-group
Vinogradov-Goldbach theorem
Schur method
One-sided and linesd
Sperlew operator
Pro-barr–Doxine laction
Conjugate class of functions, extremal equation
Approximation
Bourled brank
Axtric group
Rebuture condition
De Rham
Hodfordject method
Almost-periodic functional
Newton-babid factorization of
Hilbert'-Skeröshhey zycomp(sia closure
Katon formula
Supersal
Recurring series
Primitive representation of andidion
Boxes-Heaing representation
Protedenoso, method of a complex variation problem
Game on property
Decision ring
Distance-Teurn-Granse semier
K-sacaukal phine
Doudingroup
Giassdary
Axtoncical ideal surface
Optimal numbers
Poisson distribution
Schledeam group
Witt chain
Problème des group
Regular algebra of a group
Cubically class
Locally convex space