In [162]:
import torch
import math
import time
import json
import tqdm
import os

import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from collections import OrderedDict

In [163]:
# Load the raw movie records. The encoding is given explicitly so the result
# does not depend on the platform default, and the file is closed as soon as
# the JSON has been parsed.
with open("../../src/server/movies.json", encoding="utf-8") as f:
    all_movies = json.load(f)

movies = [movie["name"] for movie in all_movies]

padding = '\0'      # symbol used to left-pad contexts shorter than `char_length`
char_length = 10    # number of preceding characters used as context

# Build (context, next_char) training pairs from every title.
dataset = []
for item in movies:
    # Keep only ASCII a-z. str.isalpha() alone also accepts non-ASCII letters
    # (e.g. 'é'), which are not part of the a-z vocabulary used to index-encode
    # the dataset and would raise a KeyError there.
    movie = ''.join(char for char in item.lower() if 'a' <= char <= 'z')

    # A title needs at least two letters to yield one (context, target) pair.
    if len(movie) < 2:
        continue

    for i in range(1, len(movie)):
        # Up to `char_length` characters before position i, left-padded to a fixed width.
        prev_chars = movie[max(0, i - char_length):i].rjust(char_length, padding)
        current_char = movie[i]
        dataset.append((prev_chars, current_char))

In [164]:
# Show the first ten (context, next_char) pairs to eyeball the left-padding.
dataset[:10]

[('\x00\x00\x00\x00\x00\x00\x00\x00\x00f', 'a'),
 ('\x00\x00\x00\x00\x00\x00\x00\x00fa', 'n'),
 ('\x00\x00\x00\x00\x00\x00\x00fan', 't'),
 ('\x00\x00\x00\x00\x00\x00fant', 'a'),
 ('\x00\x00\x00\x00\x00fanta', 's'),
 ('\x00\x00\x00\x00fantas', 't'),
 ('\x00\x00\x00fantast', 'i'),
 ('\x00\x00fantasti', 'c'),
 ('\x00fantastic', 'f'),
 ('fantasticf', 'o')]

In [165]:
# Vocabulary: the NUL padding symbol sits at index 0, followed by the
# lowercase letters in sorted order.
text = "abcdefghijklmnopqrstuvwxyz"
chars = ['\0'] + sorted(set(text))

In [166]:
# Two-way lookup between vocabulary symbols and their integer ids
# (an id is the symbol's position in `chars`).
index_to_char = dict(enumerate(chars))
char_to_index = {symbol: idx for idx, symbol in index_to_char.items()}

In [167]:
# Index-encode the dataset.
# xin: one row per context position, one column per sample (note the transpose).
# y:   the target character id of every sample.
xin = np.array([[char_to_index[ch] for ch in context] for context, _ in dataset]).T
y = [char_to_index[target] for _, target in dataset]

In [168]:
# Number of context positions used per sample; same value as the context width.
pred_num = char_length

In [169]:
# Assemble the model inputs/targets as arrays: X is (samples, pred_num), Y is (samples,).
# NOTE(review): the `[:-2]` slices drop the last two samples from both X and Y;
# nothing in this notebook explains why - confirm whether that is intentional.
X = np.stack([xin[pos, :-2] for pos in range(pred_num)], axis=1)
Y = np.stack(y[:-2])

In [170]:
# Sanity check: array shapes plus one decoded (context, target) example.
sample_idx = 9
decoded_context = [index_to_char[code] for code in X[sample_idx]]
decoded_target = [index_to_char[Y[sample_idx]]]
print(X.shape, Y.shape, decoded_context, decoded_target)

(12962, 10) (12962,) ['f', 'a', 'n', 't', 'a', 's', 't', 'i', 'c', 'f'] ['o']


In [171]:
# Copy the numpy arrays into int64 tensors (the dtype used here for both
# the embedding inputs and the class-index targets).
X_tensor, Y_tensor = (torch.tensor(array, dtype=torch.long) for array in (X, Y))

#### Model architecture was experimented with and iterated on in the `Pytorch *` notebooks:
Sticking with an RNN since it's a simple model with high accuracy that is easy to deploy (weights come in at about 200 KB).

In [172]:
class RecurrentLayer(nn.Module):
    """Single-layer Elman RNN unrolled step by step over a batch-first sequence.

    For every time step t it computes

        h_t = act(x_t @ W_ih.T + b_ih + h_{t-1} @ W_hh.T + b_hh)

    Args:
        input_size: number of features in each input step.
        hidden_size: number of features in the hidden state.
        nonlinearity: activation applied at every step, ``'tanh'`` (default)
            or ``'relu'``.

    Raises:
        ValueError: if ``nonlinearity`` is not one of the supported names.
    """

    # Supported activations, keyed by the `nonlinearity` constructor argument.
    _ACTIVATIONS = {'tanh': torch.tanh, 'relu': torch.relu}

    def __init__(self, input_size, hidden_size, nonlinearity='tanh'):
        super().__init__()
        if nonlinearity not in self._ACTIVATIONS:
            raise ValueError(
                f"Unknown nonlinearity {nonlinearity!r}; expected one of {sorted(self._ACTIVATIONS)}"
            )
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.nonlinearity = nonlinearity

        # torch.empty allocates uninitialised storage; values are set in reset_parameters().
        self.W_ih = nn.Parameter(torch.empty(hidden_size, input_size))
        self.W_hh = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.b_ih = nn.Parameter(torch.empty(hidden_size))
        self.b_hh = nn.Parameter(torch.empty(hidden_size))

        self.reset_parameters()

    def reset_parameters(self):
        """Initialise weights with Kaiming-uniform and biases uniformly in +/- 1/sqrt(fan_in)."""
        nn.init.kaiming_uniform_(self.W_ih, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.W_hh, a=math.sqrt(5))
        # Both biases share a bound derived from the fan-in of the input weights.
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.W_ih)
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        nn.init.uniform_(self.b_ih, -bound, bound)
        nn.init.uniform_(self.b_hh, -bound, bound)

    def forward(self, x, h_0=None):
        """Run the recurrence over a whole sequence.

        Args:
            x: tensor of shape (batch, seq_len, input_size).
            h_0: optional initial hidden state of shape (batch, hidden_size);
                zeros are used when omitted.

        Returns:
            (output, h_n): ``output`` has shape (batch, seq_len, hidden_size) and
            holds the hidden state after every step; ``h_n`` is the final hidden
            state (equal to ``h_0``/zeros when ``seq_len`` is 0).
        """
        batch_size, seq_len, _ = x.size()
        activation = self._ACTIVATIONS[self.nonlinearity]

        # new_zeros follows x's dtype and device, so the layer also works after
        # e.g. model.double() instead of assuming float32.
        h_t = x.new_zeros(batch_size, self.hidden_size) if h_0 is None else h_0

        step_states = []
        for t in range(seq_len):
            x_t = x[:, t, :]
            h_t = activation(
                F.linear(x_t, self.W_ih, self.b_ih) + F.linear(h_t, self.W_hh, self.b_hh)
            )
            step_states.append(h_t)

        if step_states:
            output = torch.stack(step_states, dim=1)
        else:
            # torch.stack rejects an empty list; keep the (batch, 0, hidden) result shape.
            output = x.new_zeros(batch_size, 0, self.hidden_size)

        return output, h_t

In [173]:
class SimpleRNN(nn.Module):
    """Character model: embedding -> recurrent layer -> linear projection.

    Args:
        vocab_size: number of distinct symbols (embedding rows and output logits).
        embedding_dim: size of each symbol embedding.
        hidden_layers: width of the recurrent hidden state (passed to
            RecurrentLayer as its hidden size, despite the plural name).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = RecurrentLayer(embedding_dim, hidden_layers, nonlinearity='tanh')
        self.dense = nn.Linear(hidden_layers, vocab_size)
        # Re-initialise the embedding and output weights with Xavier-normal,
        # in this order, after all sub-modules have been constructed.
        for layer in (self.embedding, self.dense):
            nn.init.xavier_normal_(layer.weight)

    def forward(self, x):
        """Return logits over the vocabulary computed from the last time step.

        Args:
            x: integer tensor of symbol ids, indexed as (batch, seq_len).
        """
        hidden_states, _ = self.rnn(self.embedding(x))
        last_state = hidden_states[:, -1, :]
        return self.dense(last_state)

In [182]:
# Hyperparameters

# -- model --
vocab_size = len(chars)
embedding_dim = 84     # ideal = 256, practical = 84
hidden_layers = 168    # ideal = 320, practical = 168

# -- training --
epochs = 100
lr = 1e-4
batch_size = 32

In [176]:
# Seed PyTorch's global RNG right before the model is built so that re-running
# this cell always yields the same initial weights.
torch.manual_seed(0)
model = SimpleRNN(vocab_size, embedding_dim, hidden_layers)
# The state dict references the live parameter tensors of `model`.
state_dict = model.state_dict()

In [177]:
# Parameter counts (trainable vs. all).
trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_count = sum(p.numel() for p in model.parameters())
print("Trainable Parameters:", trainable_count, "Total:", total_count)

# In-memory footprint in bytes of parameters and buffers.
param_size = sum(p.nelement() * p.element_size() for p in model.parameters())
buffer_size = sum(b.nelement() * b.element_size() for b in model.buffers())

size_all_mb = (param_size + buffer_size) / 1024**2
# NOTE(review): the reported figure is 3x the in-memory size; the reason for
# the factor is not stated in this notebook - confirm what it approximates.
print('Approx. Model size: {:.2f}MB'.format(3*size_all_mb))

Trainable Parameters: 49503 Total: 49503
Approx. Model size: 0.57MB


In [187]:
# Loss, optimiser and the progress bar that the training loop iterates over.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr)
pbar = tqdm.trange(epochs)

  0%|          | 0/100 [00:00<?, ?it/s]

In [188]:
# Each epoch is one optimiser step on the whole dataset.
# NOTE(review): `batch_size` is defined with the hyperparameters but is not used here.
for epoch in pbar:
    epoch_start = time.time()
    model.train()

    optimizer.zero_grad()
    loss = criterion(model(X_tensor), Y_tensor)
    loss.backward()
    optimizer.step()

    elapsed = time.time() - epoch_start
    pbar.set_postfix({"loss":f"{loss.item():.4f}","time/epoch":f"{elapsed:.2f}s"})

100%|██████████| 100/100 [00:26<00:00,  3.81it/s, loss=0.8550, time/epoch=0.24s]


In [189]:
def predict_next_char(inp):
    """Predict the character that follows ``inp``.

    The input is prepared the same way as the training contexts: lowercased,
    reduced to vocabulary letters, cut to the last ``char_length`` characters
    and left-padded with ``padding``. This keeps inference consistent with
    training and makes uppercase, spaces, long and empty inputs valid.

    Args:
        inp: the text typed so far.

    Returns:
        (next_char, inp + next_char): the predicted character and the original
        input with that character appended.
    """
    model.eval()  # evaluation mode for inference

    # Same normalisation as the training data; the padding symbol itself is
    # never accepted as user input.
    cleaned = ''.join(ch for ch in inp.lower() if ch != padding and ch in char_to_index)
    # Fixed-width context: last `char_length` characters, left-padded.
    context = cleaned[-char_length:].rjust(char_length, padding)

    indices = [char_to_index[ch] for ch in context]
    input_tensor = torch.tensor([indices], dtype=torch.long)  # batch of one
    with torch.no_grad():
        logits = model(input_tensor)
    predicted_index = torch.argmax(logits, dim=-1).item()  # id with the highest score
    predicted_char = index_to_char[predicted_index]
    return predicted_char, inp + predicted_char

In [190]:
# Spot-check the model on a handful of title prefixes.
for prefix in ('sta', 'thematr', 'asgoo', 'thegodfathe', 'u', 'aquama'):
    print(predict_next_char(prefix))

('r', 'star')
('i', 'thematri')
('d', 'asgood')
('a', 'thegodfathea')
('n', 'un')
('n', 'aquaman')


In [194]:
# When True (and weights.pth exists) the saved checkpoint is loaded into the
# model; otherwise the current weights are written to weights.pth.
load_pretrained = False

In [195]:
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
if load_pretrained and os.path.isfile("weights.pth"):
    # Replace the in-memory weights with the saved checkpoint.
    model.load_state_dict(torch.load("weights.pth"))
else:
    # Persist the current in-memory weights.
    torch.save(model.state_dict(), "weights.pth")

# Either way, export from whatever the model holds now.
state_dict = model.state_dict()

In [200]:
# Convert every tensor to nested Python lists for JSON export: cast to float16,
# widen back to Python floats and round to 3 decimals (presumably to keep the
# JSON file small).
weights_json = OrderedDict(
    (name, tensor.detach().cpu().half().numpy().astype('float').round(3).tolist())
    for name, tensor in state_dict.items()
)

In [201]:
# Write the exported weights to disk as JSON.
with open("weights.json", mode="w") as weights_file:
    json.dump(weights_json, fp=weights_file)

In [202]:
# Report the size of the exported JSON file in MiB.
json_size_bytes = os.path.getsize("weights.json")
print("Approx. JSON size: {:.2f}MB".format(json_size_bytes/1024/1024))

Approx. JSON size: 0.35MB
