<a href="https://colab.research.google.com/github/bellomusodiq/machine_learning/blob/master/text_generation_chars_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torchtext
import numpy as np
import os

In [0]:
file_path = os.path.join('drive','My Drive', 'alice.txt')

In [0]:
a = set('hello world')

In [5]:
for i in sorted(a): print(i)

 
d
e
h
l
o
r
w


In [0]:
class Dictionary:
  def __init__(self):
    self.char2idx = {}
    self.idx2char = {}

  def get_chars(self, file_path):
    with open(file_path, mode='r') as f:
      for char in sorted(set(f.read())):
        if not char in self.char2idx:
          self.char2idx[char] = len(self.char2idx)
          self.idx2char[len(self.idx2char)] = char

In [0]:
dict_ = Dictionary()
dict_.get_chars(file_path)

In [8]:
dict_.idx2char

{0: '\n',
 1: ' ',
 2: '!',
 3: '"',
 4: "'",
 5: '(',
 6: ')',
 7: ',',
 8: '-',
 9: '.',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: '[',
 40: ']',
 41: '_',
 42: 'a',
 43: 'b',
 44: 'c',
 45: 'd',
 46: 'e',
 47: 'f',
 48: 'g',
 49: 'h',
 50: 'i',
 51: 'j',
 52: 'k',
 53: 'l',
 54: 'm',
 55: 'n',
 56: 'o',
 57: 'p',
 58: 'q',
 59: 'r',
 60: 's',
 61: 't',
 62: 'u',
 63: 'v',
 64: 'w',
 65: 'x',
 66: 'y',
 67: 'z'}

In [0]:
vocab_size = len(dict_.char2idx)
batch_size = 32
rnn_units = 65
rnn_layers = 1
n_epochs = 20
embedding_size = 32

In [0]:
class TextGenerator(nn.Module):

  def __init__(self, vocab_size, rnn_units, rnn_layers, embedding_size):
    super(TextGenerator, self).__init__()
    self.embed = nn.Embedding(vocab_size, embedding_size)
    self.lstm = nn.LSTM(embedding_size, rnn_units, rnn_layers, batch_first=True)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(.2)
    self.output = nn.Linear(rnn_units, vocab_size)

  def forward(self, x, h):
    embed = self.embed(x)
    output, (h_l, c_l) = self.lstm(embed)
    # output = self.dropout(output)
    output = self.output(output)
    return output, h

In [0]:
model = TextGenerator(vocab_size, rnn_units, rnn_layers, embedding_size).cuda()

In [35]:
print(model)

TextGenerator(
  (embed): Embedding(68, 32)
  (lstm): LSTM(32, 65, batch_first=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
  (output): Linear(in_features=65, out_features=68, bias=True)
)


In [0]:
hidden_layers = (
    torch.zeros(rnn_layers, batch_size, rnn_units),
    torch.zeros(rnn_layers, batch_size, rnn_units)
)

In [0]:
class AliceDataset(torch.utils.data.Dataset):

  def __init__(self, corpus):
    sequence_length = 100
    num_of_sequence = len(corpus) // (sequence_length + 1)
    num_of_chars = num_of_sequence * (sequence_length + 1)
    corpus = corpus[:num_of_chars]
    self.chunks = [corpus[i:i+sequence_length] for i in range(0, len(corpus), sequence_length+1)]
    self.inputs = torch.tensor(self.chunks[:-1], dtype=torch.int64)
    self.targets = torch.tensor(self.chunks[1:], dtype=torch.int64)

  def __getitem__(self, index):
    return self.inputs[index], self.targets[index]

  def __len__(self):
    return len(self.inputs)

In [0]:
corpus = []
with open(file_path, mode='r') as f:
  for char in f.read():
    corpus.append(dict_.char2idx[char])

In [0]:
dataset = AliceDataset(corpus)

In [0]:
data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [0]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.1)

In [44]:
for epoch in range(n_epochs):
  h = (
      torch.zeros(rnn_layers, batch_size, rnn_units).cuda(),
      torch.zeros(rnn_layers, batch_size, rnn_units).cuda()
  )
  for features, targets in data_loader:
    output, h = model(features.cuda(), h)
    optimizer.zero_grad()
    loss = criterion(output.transpose(1,2).cuda(), targets.cuda())
    state_h = h[0].detach()
    state_c = h[1].detach()
    h = state_h, state_c
    loss.backward()
    optimizer.step()
    torch.nn.utils.clip_grad_norm_(model.parameters(), .5)
  print('Epoch:', epoch+1, 'loss:', loss.item())

Epoch: 1 loss: 3.094210624694824
Epoch: 2 loss: 3.1157679557800293
Epoch: 3 loss: 3.1881818771362305
Epoch: 4 loss: 3.1403088569641113
Epoch: 5 loss: 3.1528947353363037
Epoch: 6 loss: 3.1644368171691895
Epoch: 7 loss: 3.256956100463867
Epoch: 8 loss: 3.235138177871704
Epoch: 9 loss: 3.138655185699463
Epoch: 10 loss: 3.257835865020752
Epoch: 11 loss: 3.2225089073181152
Epoch: 12 loss: 3.18119478225708
Epoch: 13 loss: 3.1715281009674072
Epoch: 14 loss: 3.136408567428589
Epoch: 15 loss: 3.1786770820617676
Epoch: 16 loss: 3.1374247074127197
Epoch: 17 loss: 3.145899534225464
Epoch: 18 loss: 3.356417655944824
Epoch: 19 loss: 3.135282278060913
Epoch: 20 loss: 3.1834447383880615


In [43]:
with torch.no_grad():
  with open('results.txt', 'w') as f:
    state = (
        torch.zeros(1, 1, rnn_units).cuda(),
        torch.zeros(1, 1, rnn_units).cuda(),
    )
    input_ = [0]
    
    for i in range(500):
      x = torch.tensor([input_], dtype=torch.int64).cuda()
      output, state = model(x, state)
      prob = output.squeeze(1).exp()
      word_id = torch.multinomial(prob, num_samples=1).item()
      input.append(word_id)

      word = dict_.idx2char[word_id]
      f.write(word)

      if (i+1) % 100 == 0:
        print('Sampled {}/{}'.format(i, 500))

AttributeError: ignored

In [0]:
with torch.no_grad():
  out = model(torch.tensor([[1,2,22,54]], dtype=torch.int64).cuda(), state)