# Generate strings of text using a character-level LSTM

## Setup

In [1]:
# imports
import numpy as np
import matplotlib as plt
import torch
from torch import nn
import torch.nn.functional as F
import string
import pandas as pd
import re
torch.manual_seed(1)

<torch._C.Generator at 0x11729abb0>

## Data preparation

### Load and pre-process the data

In [2]:
with open('./Data/timemachine.txt', 'r') as f:
    lines = f.readlines()
    
n_head = 35
t = [line.strip() for line in lines[n_head:] if len(line.strip())>0]
joined_text = " ".join(t)
my_text = [line.strip() + "." + "\n" for line in joined_text.split(".")]
my_text[:5] # check

['I The Time Traveller (for so it will be convenient to speak of him) was expounding a recondite matter to us.\n',
 'His grey eyes shone and twinkled, and his usually pale face was flushed and animated.\n',
 'The fire burned brightly, and the soft radiance of the incandescent lights in the lilies of silver caught the bubbles that flashed and passed in our glasses.\n',
 'Our chairs, being his patents, embraced and caressed us rather than submitted to be sat upon, and there was that luxurious after-dinner atmosphere when thought roams gracefully free of the trammels of precision.\n',
 'And he put it to us in this way--marking the points with a lean forefinger--as we sat and lazily admired his earnestness over this new paradox (as we thought it) and his fecundity.\n']

In [3]:
vocab = list(set(joined_text))
# vocab.sort()
vocab.append('\n')
vocab_dict = dict([[v, ind] for ind, v in enumerate(vocab)])
# vocab_dict
vocab_length = len(vocab)
vocab_weight = torch.zeros(vocab_length)
for char in joined_text:
    vocab_weight[vocab_dict[char]] += 1.0
vocab_weight[vocab_dict['\n']] = len(my_text)
vocab_weight = 1.0/vocab_weight

In [4]:
def get_one_hot_vector(input_string):
    out_vector = torch.zeros(len(input_string),1,vocab_length)
    for i,c in enumerate(input_string):

        out_vector[i,0,vocab_dict[c]] = 1
    return out_vector

print(my_text[0])
get_one_hot_vector(my_text[0])

I The Time Traveller (for so it will be convenient to speak of him) was expounding a recondite matter to us.



tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 1.]]])

## Modeling

# Predict function

In [5]:
# Init hidden and cell states
def predict(seed):
    with torch.no_grad():
        this_input = get_one_hot_vector(seed)
        sentence_length = 20
        out_string = ""
        # init_state
        out, (hn, cn) = model(this_input)
    #     for i in range(this_input.shape[0]):
    #         input = this_input[i,:,:].view(1,1,-1)
    #         out, hn_cn = model(input, (hn, cn))
    #         out_string += vocab[int(input.view(-1).argmax())]
        out = out[-1:,:,:]
        for i in range(sentence_length):
    #         hn = torch.randn(D*num_layers, N, H_cell)
    #         cn = torch.randn(D*num_layers, N, H_cell)
            out, (hn, cn) = model(out, (hn, cn))
            out_string += vec2string(out)
    print("< " + out_string + " >")

In [6]:
def vec2string(input):
    out_string = ""
    for i in range(len(input)):
        this_input = input[i,0,:].view(-1)
        out_string += vocab[int(this_input.argmax())]
    return out_string

## Define the model

In [7]:
class sentence_maker(nn.Module):
    def __init__(self, H_in, H_cell, num_layers=1, dropout = .0):
        super(sentence_maker, self).__init__()
        H_out = H_in
        self.D = 1
        self.H_cell = H_cell
        self.num_layers = num_layers
        self.lstm = nn.LSTM(H_in, H_cell, 
                            num_layers = num_layers, 
                            dropout = dropout)
        self.linear = nn.Linear(H_cell, H_out)
        self.logsoftmax = nn.LogSoftmax(2)
        
    def forward(self, input, states=None):
        if states==None:
            out, states = self.lstm(input)
        else:
            out, states = self.lstm(input, states)
        out = self.linear(out)
        out = self.logsoftmax(out)
        return out, states
    def begin_state(self, batch_size=1):
        return (torch.zeros((self.D * self.num_layers,
                             batch_size, self.H_cell)),
                torch.zeros((self.D * self.num_layers,
                             batch_size, self.H_cell)))

In [8]:
H_cell = 256 # hidden size
N = 1 # batch size
L = 8 # sequence length
D = 1 # unidirectional
num_layers = 2 # /!\ not yet inputted to LSTM function
H_in = vocab_length # input size

H_out = H_cell # output size, H_cell by default or project size if project_size>0


# lstm = nn.LSTM(H_in, H_cell, proj_size=proj_size)
model = sentence_maker(H_in, H_cell,num_layers=num_layers, dropout=0.0)

loss_func = nn.NLLLoss(weight=vocab_weight)
optimizer = torch.optim.Adam(model.parameters(),lr=1.0e-2)

hn = torch.zeros(D*num_layers, N, H_cell)
cn = torch.zeros(D*num_layers, N, H_cell)

## Train

In [9]:
n_epoch = 100
batch_size = 50
n_batch = len(my_text)//batch_size

for epoch in range(n_epoch):

    print(f"Epoch #{epoch:02d}: ", end="")
    loss = 0.0    
    i_batch = 0
    for i_sample in range(len(my_text)):
    # for i_sample in range(3):    
        hn.detach_()
        cn.detach_()
        sentence = my_text[i_sample]
        this_L = min(len(sentence),L+1)-1
        if this_L < L:
            I=0
        else:
            I = np.random.randint(0,len(sentence)-this_L) 
            
        # Get a random subsentence    
        
        this_input = get_one_hot_vector(sentence[I:I+this_L])
        this_target = get_one_hot_vector(sentence[I+1:I+1+this_L])

        # Forward/backward pass
        
        out, (hn, cn) = model(this_input)#, (hn, cn))
        
        loss += loss_func(out.view(this_L,H_in),this_target.argmax(dim=2).view(-1))
        if i_sample>0 and i_sample%batch_size==0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            i_batch+=1
#             print(i_batch, n_batch)
            if i_batch==n_batch:
                print(f"l={loss:.2e}")
                print("  in:     ", vec2string(this_input))
                print("  target: ", vec2string(this_target))
                print("  out:    ", vec2string(out))
#                 predict()
#             print(f"{i_batch:03d}/{n_batch:03d} l={loss:.2e}")
            loss = 0.0

print("Finished!")

Epoch #00: l=2.16e+02
  in:      p://pgla
  target:  ://pglaf
  out:     


s.
s.
Epoch #01: l=1.99e+02
  in:      please v
  target:  lease vi
  out:     
dddddwh
Epoch #02: 

KeyboardInterrupt: 

## Test

In [10]:
predict("I")

< nnnnnnnnnnnnnnnnnnnn >


In [87]:
out[-1,:,:].shape

IndexError: too many indices for tensor of dimension 2

In [94]:
out.shape

torch.Size([1, 1, 82])

In [23]:
vocab

[' ',
 '!',
 '"',
 '$',
 '%',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\n']