In [1]:
import torch
import math
import time

import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchinfo import summary

In [2]:
# Read the whole corpus. The context manager closes the file handle as soon as
# the read finishes instead of leaving it open until garbage collection.
# NOTE(review): no explicit encoding is passed, so decoding follows the platform default.
with open("nietzsche.txt") as corpus_file:
    text = corpus_file.read()

# Vocabulary: every distinct character of the corpus in sorted order, with the
# NUL character placed at index 0 ahead of them.
chars = ['\0'] + sorted(set(text))

In [3]:
# Lookup tables between characters and their integer ids (position in `chars`).
index_to_char = dict(enumerate(chars))
char_to_index = {char: idx for idx, char in index_to_char.items()}

In [4]:
# Encode the full corpus as a list of integer character ids.
total_index = [char_to_index[symbol] for symbol in text]

pred_num = 25  # number of characters fed to the model per sample

# Start offsets of consecutive, non-overlapping windows of `pred_num` characters.
window_starts = range(0, len(total_index) - 1 - pred_num, pred_num)

# xin[k] holds the k-th character of every window (one list per position in the window).
xin = [[total_index[start + offset] for start in window_starts] for offset in range(pred_num)]
# y holds, for every window, the character that immediately follows it.
y = [total_index[start + pred_num] for start in window_starts]

In [5]:
# Drop the last two windows from every position list, then place the positions
# side by side so X has one row per window and `pred_num` columns.
# NOTE(review): why exactly two trailing windows are discarded is not evident from the code.
trimmed_columns = [np.stack(column[:-2]) for column in xin]
X = np.stack(trimmed_columns, axis=1)
# Targets are trimmed the same way so they stay aligned with the rows of X.
Y = np.stack(y[:-2])

In [6]:
# Copy the numpy arrays into int64 tensors (inputs and targets alike).
X_tensor, Y_tensor = (torch.tensor(array, dtype=torch.long) for array in (X, Y))

In [7]:
class LSTMCell(nn.Module):
    """A single LSTM time step written out with explicit per-gate parameters.

    Each of the four gates (input ``i``, forget ``f``, candidate ``g``,
    output ``o``) owns:

    * ``W_i<gate>``: input weight of shape ``(hidden_size, input_size)``
    * ``W_h<gate>``: recurrent weight of shape ``(hidden_size, hidden_size)``
    * ``b_<gate>``: bias of shape ``(hidden_size,)``
    """

    def __init__(self, input_size, hidden_size):
        """Allocate the gate parameters and initialise them via ``init_weights``."""
        super().__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size

        # Registration order (W_i*, W_h*, b_* for i, f, g, o) is deliberate: it fixes
        # the order of ``parameters()`` and therefore the order of the random draws
        # made in ``init_weights``. ``torch.empty`` only allocates; the values are
        # overwritten immediately below.
        for gate in "ifgo":
            setattr(self, f"W_i{gate}", nn.Parameter(torch.empty(hidden_size, input_size)))
            setattr(self, f"W_h{gate}", nn.Parameter(torch.empty(hidden_size, hidden_size)))
            setattr(self, f"b_{gate}", nn.Parameter(torch.empty(hidden_size)))

        self.init_weights()

    def init_weights(self):
        """Fill every parameter in place with samples from U(-0.1, 0.1)."""
        for param in self.parameters():
            nn.init.uniform_(param, -0.1, 0.1)

    def forward(self, x, hidden):
        """Advance the cell by one time step.

        Args:
            x: input for this step; it is multiplied by the transposed
                ``(hidden_size, input_size)`` weights, so its last dimension
                must be ``input_size``.
            hidden: tuple ``(h_prev, c_prev)`` with the previous hidden and
                cell states.

        Returns:
            ``(h_t, (h_t, c_t))``: the new hidden state, followed by the new
            ``(hidden, cell)`` state pair.
        """
        h_prev, c_prev = hidden

        def pre_activation(w_input, w_recurrent, bias):
            # Affine combination of the current input and the previous hidden state.
            return x @ w_input.T + h_prev @ w_recurrent.T + bias

        i_t = torch.sigmoid(pre_activation(self.W_ii, self.W_hi, self.b_i))  # input gate
        f_t = torch.sigmoid(pre_activation(self.W_if, self.W_hf, self.b_f))  # forget gate
        g_t = torch.tanh(pre_activation(self.W_ig, self.W_hg, self.b_g))     # candidate cell values
        o_t = torch.sigmoid(pre_activation(self.W_io, self.W_ho, self.b_o))  # output gate

        c_t = f_t * c_prev + i_t * g_t
        h_t = o_t * torch.tanh(c_t)

        return h_t, (h_t, c_t)


class CustomLSTM(nn.Module):
    """Stack of ``LSTMCell`` layers unrolled over the time axis of a batch-first input.

    Args:
        input_size: feature size of the input fed to the first layer.
        hidden_layers: hidden state size of every layer (despite the name, this
            is a width, not a layer count).
        num_layers: number of stacked cells; must be at least 1.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1.
    """

    def __init__(self, input_size, hidden_layers, num_layers=1):
        super().__init__()
        if num_layers < 1:
            raise ValueError(f"num_layers must be >= 1, got {num_layers}")
        self.hidden_layers = hidden_layers
        self.num_layers = num_layers
        # Only the first layer sees the raw input; deeper layers consume the
        # hidden state of the layer below.
        self.cells = nn.ModuleList([
            LSTMCell(input_size if layer == 0 else hidden_layers, hidden_layers)
            for layer in range(num_layers)
        ])

    def forward(self, x):
        """Run the stacked cells over every time step of ``x``.

        Args:
            x: tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            ``(out, h_n)`` where ``out`` has shape ``(batch, seq_len, hidden_layers)``
            and holds the top layer's hidden state at each time step, and ``h_n``
            has shape ``(batch, hidden_layers)`` and is the top layer's hidden
            state after the last step (all zeros when ``seq_len`` is 0).
        """
        batch_size, seq_len, _ = x.size()
        h = [torch.zeros(batch_size, self.hidden_layers, device=x.device) for _ in range(self.num_layers)]
        c = [torch.zeros(batch_size, self.hidden_layers, device=x.device) for _ in range(self.num_layers)]

        out = torch.zeros(batch_size, seq_len, self.hidden_layers, device=x.device)
        for t in range(seq_len):
            layer_input = x[:, t, :]
            for i, cell in enumerate(self.cells):
                h[i], (_, c[i]) = cell(layer_input, (h[i], c[i]))
                layer_input = h[i]
            # Store the top layer's output at the *time* index. Indexing by the
            # layer index here would leave every later time step at zero.
            out[:, t, :] = layer_input

        # Final hidden state of the top layer; well defined even for seq_len == 0.
        h_n = h[-1]
        return out, h_n

In [16]:
class SimpleLSTM(nn.Module):
    """Next-character classifier: embedding -> LSTM -> linear layer over the vocabulary."""

    def __init__(self, vocab_size, embedding_dim, hidden_layers):
        """Build the three layers; ``hidden_layers`` is used as the LSTM hidden size."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # The author kept CustomLSTM(embedding_dim, hidden_layers) as a
        # commented-out alternative for this layer.
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_layers, batch_first=True)
        self.dense = nn.Linear(hidden_layers, vocab_size)

        # Xavier-normal initialisation for the embedding table and the output projection.
        for layer in (self.embedding, self.dense):
            nn.init.xavier_normal_(layer.weight)

    def forward(self, x):
        """Map a batch of character-id sequences to one score vector per sequence."""
        token_vectors = self.embedding(x)
        sequence_out, _ = self.rnn(token_vectors)
        # Only the output at the last time step feeds the classifier.
        final_step = sequence_out[:, -1, :]
        return self.dense(final_step)

In [17]:
# Hyperparameters (same as Keras)
# Model dimensions.
# NOTE(review): vocab_size is hard-coded; presumably it should cover len(chars) — confirm.
vocab_size = 86       # rows of the embedding table and width of the output layer
embedding_dim = 42    # size of each character embedding
hidden_layers = 128   # passed to the LSTM as its hidden size

# Training settings.
batch_size = 64
epochs = 150

In [18]:
# Instantiate the network from the hyperparameters defined in the previous cell.
model = SimpleLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_layers=hidden_layers,
)

In [19]:
# Display torchinfo's per-layer parameter counts for the model.
summary(model)

Layer (type:depth-idx)                   Param #
SimpleLSTM                               --
├─Embedding: 1-1                         3,612
├─LSTM: 1-2                              88,064
├─Linear: 1-3                            11,094
Total params: 102,770
Trainable params: 102,770
Non-trainable params: 0

In [20]:
# Cross-entropy between the model's raw output scores and the integer class targets.
criterion = nn.CrossEntropyLoss()  # PyTorch uses CrossEntropyLoss for multi-class classification
# Adam over all model parameters, with the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [21]:
# NOTE(review): `batch_size` from the hyperparameter cell is not used here. Every
# iteration pushes the whole of X_tensor through the model in one forward pass,
# so each "epoch" below is exactly one optimizer step.
for epoch in range(epochs):
    current_time = time.time()
    model.train()  # Set the model to training mode
    optimizer.zero_grad() # clear gradients left over from the previous step
    outputs = model(X_tensor) # forward pass over the entire dataset at once
    loss = criterion(outputs, Y_tensor) # calculate the loss
    loss.backward() # calculate the gradients
    optimizer.step() # update the weights

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, Time: {time.time() - current_time:.2f}s')

Epoch [1/150], Loss: 4.4554, Time: 1.92s
Epoch [2/150], Loss: 4.4396, Time: 1.42s
Epoch [3/150], Loss: 4.4233, Time: 1.40s
Epoch [4/150], Loss: 4.4059, Time: 1.48s
Epoch [5/150], Loss: 4.3869, Time: 1.36s
Epoch [6/150], Loss: 4.3657, Time: 1.35s
Epoch [7/150], Loss: 4.3412, Time: 1.29s
Epoch [8/150], Loss: 4.3119, Time: 1.39s
Epoch [9/150], Loss: 4.2755, Time: 1.32s
Epoch [10/150], Loss: 4.2283, Time: 1.29s
Epoch [11/150], Loss: 4.1636, Time: 1.34s
Epoch [12/150], Loss: 4.0696, Time: 1.35s
Epoch [13/150], Loss: 3.9266, Time: 1.38s
Epoch [14/150], Loss: 3.7217, Time: 1.37s
Epoch [15/150], Loss: 3.5346, Time: 1.42s
Epoch [16/150], Loss: 3.5306, Time: 1.33s
Epoch [17/150], Loss: 3.5316, Time: 1.46s
Epoch [18/150], Loss: 3.4801, Time: 1.30s
Epoch [19/150], Loss: 3.4088, Time: 1.30s
Epoch [20/150], Loss: 3.3394, Time: 1.30s
Epoch [21/150], Loss: 3.2828, Time: 1.36s
Epoch [22/150], Loss: 3.2420, Time: 1.35s
Epoch [23/150], Loss: 3.2153, Time: 1.28s
Epoch [24/150], Loss: 3.1989, Time: 1.29s
E

In [22]:
def predict_next_char(inp):
    """Return the model's top-scoring next character for the string ``inp``.

    Relies on the notebook-level ``model``, ``char_to_index`` and
    ``index_to_char``. As a side effect, ``model`` is switched to evaluation mode.

    Args:
        inp: prompt string; every character must be a key of ``char_to_index``.

    Returns:
        A tuple ``(next_char, inp + next_char)``.
    """
    model.eval()

    char_ids = [char_to_index[ch] for ch in inp]
    # Wrap the ids in an outer list to get a batch of one: shape (1, len(inp)).
    batch = torch.tensor([char_ids], dtype=torch.long)
    with torch.no_grad():  # inference only, no gradients needed
        scores = model(batch)
    # argmax over the flattened scores; with a batch of one this is the class index.
    best_index = torch.argmax(scores).item()
    next_char = index_to_char[best_index]
    return next_char, inp + next_char

In [23]:
# Spot-check the trained model on a handful of short prompts.
for prompt in ('those w', ' th', ' an', 'does th', 'woma', 'philosoph'):
    print(predict_next_char(prompt))

('h', 'those wh')
('e', ' the')
('d', ' and')
('e', 'does the')
('n', 'woman')
('e', 'philosophe')
