In [8]:
import string
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import re
import PyPDF2
from nltk.corpus import stopwords
import unidecode
import time

In [9]:
torch.cuda.is_available()

True

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
all_chars = string.printable
print(all_chars)
print(len(all_chars))

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

100


In [12]:
# class PdfToText: # extracts text from the pdf file
    
#     def __init__(self , pdf):
        
#         self.text = ''
        
#         pdf_file = open(pdf , 'rb')
#         read_pdf = PyPDF2.PdfFileReader(pdf_file , strict = False)
#         number_of_pages = read_pdf.getNumPages()
#         for i in range(number_of_pages):
#             page = read_pdf.getPage(i)
#             page_content = page.extractText()
            
#             self.text += page_content

In [13]:
with open('Data.txt') as f:
    data = f.read()

In [14]:
len(data)

776697

In [15]:
class TextPreprocessing:
    def __init__(self , text):
        self.text = text
        self.text = unidecode.unidecode(self.text)
        self.text = re.sub(' +',' ', self.text)

In [16]:
t1 = TextPreprocessing(data)

In [17]:
text = t1.text

In [18]:
# def remove_dup(seq): # Removes duplicate words from the dataset passed
#     seen = set()
#     seen_add = seen.add
#     return [x for x in seq if not (x in seen or seen_add(x))]

In [19]:
# text = " ".join(remove_dup(nltk.word_tokenize(t1.text)))

In [20]:
class CharGenRNN(nn.Module):
    def __init__(self , input_size , hidden_size , output_size , n_layers):
        
        super(CharGenRNN , self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(self.input_size, self.hidden_size)
        self.rnn = nn.LSTM(self.hidden_size , 
                           self.hidden_size , 
                           self.n_layers , 
                           batch_first = True)
        self.linear = nn.Linear(self.hidden_size , self.output_size)
   
    def forward(self , batch):
        batch = batch.long()
        out = self.embedding(batch)
        out , _ = self.rnn(out)
        out = self.linear(out)
        
        return out

In [1]:
class Generation:
    def __init__(self , text , all_chars):
        
        self.seq_len = 256 # predicting next character form the previous 32 characters
        self.batch_size = 126 # total of 16 , 32 seq in a batch
        
        self.all_chars = {j:i for i,j in enumerate(all_chars)}
        self.reverse_chars = {i:j for i,j in enumerate(all_chars)}
        
        self.text = text[:-(len(text) % (self.seq_len * self.batch_size))]

        self.input_size = len(all_chars)
        self.hidden_size = 256
        self.output_size = self.input_size
        self.n_layers = 1
        self.epochs = 1000
        self.print_every = 10
        self.generate_every = 50
        self.lr = 0.05
        
        self.rnn = CharGenRNN(self.input_size , 
                              self.hidden_size , 
                              self.output_size , 
                              self.n_layers).to(device)
        

    def get_batches(self):
        
        i = 0
        counter = 0    
        no_time = len(self.text) // (self.seq_len * self.batch_size)
        encoded_text = list(map(lambda x : self.all_chars[x] , self.text))
        
        while i != int(no_time)-1:
            
            x_list = []
            y_seq = []
            
            for _ in range(self.batch_size):
                x_list.append(encoded_text[counter:self.seq_len+counter])
                y_seq.append(encoded_text[counter + 1:self.seq_len+counter+1])
                counter += self.seq_len
            
            i += 1
            yield x_list,y_seq
            
            
    def show_batches(self):
        for i,j in self.get_batches():
            i , j = torch.Tensor(i).long() , torch.Tensor(j).long()
            print(i)
            print(j)
            break
            
    def init_loss(self):
        
        self.criterion = nn.CrossEntropyLoss()
        
        for epoch in range(self.epochs):
            for label,actual in self.get_batches():
                    
                label = torch.Tensor(label).float().to(device)
                actual = torch.Tensor(actual).long().to(device)
            
                y_pred = self.rnn(label)
                y_pred = y_pred.transpose(1,2)
                loss = self.criterion(y_pred,actual)
                
                print(loss.item())
                break
            break
            
    def train(self):
        
        time1 = time.time()
        
        self.iterations = []
        self.loss_ = []
        
        self.criterion = nn.CrossEntropyLoss()
        
        self.optimizer = optim.SGD(self.rnn.parameters() , lr = self.lr)
        
        for epoch in range(self.epochs):
            for label,actual in self.get_batches():

                    
                label = torch.Tensor(label).float().to(device)
                actual = torch.Tensor(actual).long().to(device)
            
                y_pred = self.rnn(label)
                y_pred = y_pred.transpose(1,2)
                
                loss = self.criterion(y_pred,actual)
                
                loss.backward()
                
                with torch.no_grad():
                    
                    self.optimizer.step()
                    self.optimizer.zero_grad()
                    
            self.iterations.append(epoch)
            self.loss_.append(loss)
            
            if epoch % self.print_every == 0:
                print(f"Loss after {epoch} iteration : {loss}")
                
            if epoch % self.generate_every == 0:
                final = time.time() - time1
                print(f"Time elapsed : {final}\n")
                print(f"Generated Text after {epoch} epoch : {self.generate_text()}\n")
                
                
        self.show_plot(self.iterations , self.loss_)
                
                
    def show_plot(self,x,y):
        plt.plot(x,y)
        plt.xlabel('Epoch/Iteration')
        plt.ylabel('Loss')
        plt.show()
        
    
    def generate_text(self , init_str = 'Hello Everybody , I am ',predict_len = 200):
        
        generated_list = [w for w in init_str]
        
        for _ in range(predict_len):
            
            tensor = torch.Tensor([self.all_chars[generated_list[-1].lower()]]).long().unsqueeze(0)
            out = self.rnn(tensor.to(device))
            prob = F.softmax(out.squeeze() , dim = 0)
            
            value , ind = torch.topk(prob , 3)
            index = np.random.choice(ind.tolist())
    
            char = self.reverse_chars[index]
            
            generated_list.append(char)
            
        return ''.join(generated_list)


In [22]:
g1 = Generation(text , all_chars)

In [23]:
g1.show_batches()

tensor([[55, 17, 14,  ..., 14, 27, 16],
        [94, 47, 18,  ...,  8, 96, 96],
        [47, 10, 23,  ..., 27, 28, 29],
        ...,
        [23, 13, 96,  ..., 24, 25, 21],
        [14, 78, 94,  ..., 73, 94, 10],
        [28, 94, 17,  ..., 10, 23, 20]])
tensor([[17, 14, 94,  ..., 27, 16, 94],
        [47, 18, 12,  ..., 96, 96, 47],
        [10, 23, 16,  ..., 28, 29, 74],
        ...,
        [13, 96, 27,  ..., 25, 21, 14],
        [78, 94, 10,  ..., 94, 10, 28],
        [94, 17, 14,  ..., 23, 20, 94]])


In [24]:
g1.init_loss()

4.621908664703369


In [None]:
g1.train()

Loss after 0 iteration : 4.385647296905518
5.490497827529907
Hello Everybody , I am nd hse n  n ndssen hser. hsennndsennnd     nds h n  h nder he nnde nnnds n n n  nnd n n nd nde nnn h   ndendssser.  nnde  nnnd nnd  h h hs hs ndersss hse  hsend n nnnn nd   nnn nndss hs  nn  h   her  
10.848368883132935
Hello Everybody , I am t t tee haeer t h t heeere  t h that h tee h heerth h the teer t   haerer t ter t h  haerer the ther heert  tha   he tha hathaer  th t  ha hae thae ha t ha ha hee  ha h h haee h terereeeertert tert h 
16.172104358673096
Hello Everybody , I am at thater tere ae aerter t te t t t therter thert aerte a ate aee t athat t a a    a  teeeer a ae a  ath ae a atereere tha   te  t   aee  th ate  t tee a    te at a   atee  a aert ae the  t ate teeer 
21.688166618347168
Hello Everybody , I am th a h t a aeerter a a h atee t a aerthathath thae t the te atee at aerert ae aeertha a h at a hat ath h heerth the t hateert ther hee a t haereeerertereere teer thaeree te t atha tha aee

In [None]:
g1.generate_text()