In [43]:
import torch 
import torch.nn as nn
import os
import numpy as np
from torch.nn.utils import clip_grad_norm
from transformers import BertTokenizer, BertForPreTraining, AutoModel, AutoTokenizer

In [26]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer.tokenize('CHAPTER I. Down the Rabbit-Hole')

['CHAPTER', 'I', '.', 'Down', 'the', 'Rabbit', '-', 'Hole']

In [21]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = tokenizer.get_vocab()
        self.idx2word = {value:key for key, value in self.word2idx.items()}
    def __len__(self):
        return len(self.word2idx)

In [31]:
class TextProcess(object):
    def __init__(self):
        self.dictionary = Dictionary()
    def get_data(self, path, batch_size = 20):
        rep_tensor = []
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = tokenizer.tokenize(line)+ ['[SEP]']
                for word in words:
                    rep_tensor.append(self.dictionary.word2idx[word])
        rep_tensor = torch.LongTensor(rep_tensor)
        num_batches = rep_tensor.shape[0]//batch_size
        rep_tensor = rep_tensor[:num_batches*batch_size]
        rep_tensor = rep_tensor.view(batch_size, -1)
        return rep_tensor
    

In [32]:
embedd_size = 128
hidden_size = 1024
num_layers = 1
num_epochs = 1000
batch_size = 20
timesteps = 30
learning_rate = 0.002

In [33]:
corpus = TextProcess()

In [36]:
rep_tensor = corpus.get_data('alice.txt', batch_size)
vocab_size = len(corpus.dictionary)
print(vocab_size)

28996


In [35]:
print(rep_tensor.shape)

torch.Size([20, 2048])


In [37]:
class TextGenerator(nn.Module):
    def __init__(self, vocab_size, embedd_size,hidden_size,num_layers):
        super(TextGenerator, self).__init__()
        self.embed = nn.Embedding(vocab_size,embedd_size)
        self.lstm = nn.LSTM(embedd_size, hidden_size, num_layers, batch_first = True)
        self.linear = nn.Linear(hidden_size,vocab_size)
        self.dropout = nn.Dropout(p = 0.5)
    def forward(self,x, h):
        x1 = self.embed(x)
        out, (h,c) = self.lstm(x1)
        out= out.reshape(out.size(0)*out.size(1), out.size(2))
        out = self.dropout(out)
        out = self.linear(out)
        
        return out, (h,c)

In [38]:
model = TextGenerator(vocab_size, embedd_size,hidden_size,num_layers)

In [39]:
loss_fn = nn.CrossEntropyLoss()
optimizer =  torch.optim.Adam(model.parameters(), lr = learning_rate)

In [42]:
for epoch in range(num_epochs):
    states = (torch.zeros(num_layers, batch_size, hidden_size),torch.zeros(num_layers, batch_size, hidden_size))
    for i in range(0,rep_tensor.size(1) - timesteps, timesteps):
        inputs = rep_tensor[:, i:i+timesteps]
        target = rep_tensor[:, (i+1):i+1+timesteps]
        out, _= model(inputs, states)
        loss = loss_fn(out,target.reshape(-1))
        
        
        model.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(),0.5)
        optimizer.step()
        step = (i+1)//timesteps
        if not step%100:
            print(f"Epoch {epoch+1}/{num_epochs}, loss{loss.item}")
            print(target.shape)

  if sys.path[0] == "":


Epoch1/1000, loss<built-in method item of Tensor object at 0x00000213E7555318>
torch.Size([20, 30])
Epoch2/1000, loss<built-in method item of Tensor object at 0x00000213E667B8B8>
torch.Size([20, 30])


KeyboardInterrupt: 

In [None]:
with torch.no_grad():
    with open('results.txt',w) as f:
        states = (torch.zeros(num_layers, batch_size, hidden_size),torch.zeros(num_layers, batch_size, hidden_size))
        inputs = torch.randint(0, vocab_size,(1)).long().unsqueeze(1)
        for i in range(500):
            output,_ = model(inputs,state)
            prob = output.exp()
            word_id = torch.multinominal(prob,num_samples = 1).item()
            inputs.fill_(word_id)
            
            word = corpus.dictionary.idx2word[word_id]
            word = word == '[SEP]' ? '/n' : word
            f.write(word)
        