In [None]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from sklearn.metrics import accuracy_score
import time
device=torch.device('cuda ' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:


def preprocessing(word):
   # replace digits with no space
     word = re.sub(r"\d", '', word)
   # Replace all runs of whitespaces with no space
     word = re.sub(r"\s+", '', word)
     return word

lines=[]
with open('/content/drive/MyDrive/alice.txt','r') as f:
    for line in f:
        line=line.lower().split()
    
        for i in line:
          i=re.findall(r"[\w]+|[.,!?;]", i) #we separate punctuation marks from words
          lines.append(i)

text=np.concatenate(np.array(lines,dtype=object))

In [34]:
for i,s in enumerate(text):
    text[i]=preprocessing(s)

In [35]:
word_list=[]
for i in text:
    if i not in word_list:
        word_list.append(i)
        
one_hot_dict={w:i+1 for i,w in enumerate(word_list) }

In [36]:
for i,s in enumerate(text):
  text[i]=one_hot_dict[s]

In [37]:
text[:300] #text list preview

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '5', '21', '22',
       '23', '15', '24', '25', '11', '26', '27', '28', '29', '30', '31',
       '32', '33', '5', '34', '18', '19', '9', '35', '22', '36', '37',
       '31', '38', '39', '28', '40', '41', '37', '22', '23', '42', '43',
       '5', '44', '15', '45', '34', '22', '46', '8', '47', '39', '28',
       '40', '48', '49', '30', '9', '50', '41', '18', '51', '52', '53',
       '54', '53', '30', '55', '22', '56', '5', '57', '58', '59', '18',
       '60', '13', '61', '23', '62', '22', '63', '5', '64', '15', '65',
       '45', '66', '67', '68', '69', '70', '5', '71', '15', '72', '73',
       '23', '74', '5', '75', '22', '76', '77', '45', '78', '6', '79',
       '80', '81', '82', '83', '17', '18', '3', '84', '9', '25', '49',
       '13', '85', '41', '86', '87', '88', '89', '8', '90', '37', '49',
       '13', '91', '92', '15', '5', '93', '11', '94', '5', '6', '95'

In [38]:
#model hyper parameters
batch_size=32
timestep=30 #each time steps occuring 30 words len

vocab_size=len(one_hot_dict)+1 #extra 1 for padding
embed_size=128.  #Input features to the LSTM
hidden_size=512  #Number of LSTM units

rep_tensor=torch.LongTensor(text.astype('int')) #text list converting to  tensor 

num_batch=rep_tensor.shape[0]//batch_size #net number of batches

rep_tensor=rep_tensor[:num_batch*batch_size] #rep tensor 

rep_tensor=rep_tensor.view(32,-1) 

In [44]:
num_batches=rep_tensor.shape[1]//timestep
print(num_batches)

32


In [43]:
rep_tensor.shape

torch.Size([32, 986])

In [42]:
rep_tensor #preview

tensor([[   1,    2,    3,  ...,   23,  146,   20],
        [ 337,   11,  158,  ...,    8,   11,  158],
        [  22,   41,  324,  ...,   99,   97,   99],
        ...,
        [   5, 1210,   22,  ...,   87,   36,   37],
        [ 705,  217,  343,  ...,  225,   86,  259],
        [   5, 1093,   22,  ...,  147,   30,   68]])

In [None]:
class textGenerator(nn.Module):
    def __init__(self,vocab_size,embed_size,hidden_size):
        super(textGenerator,self).__init__()

        self.embed=nn.Embedding(vocab_size,embed_size)

        self.fc=nn.Linear(hidden_size,vocab_size)

        self.lstm=nn.LSTM(input_size=embed_size,
                          hidden_size=hidden_size,
                          num_layers=1,
                          batch_first=True)
        
        self.drop=nn.Dropout(0.3)

        self.relu=nn.ReLU()

    
    def forward(self,x):
        input=x.clone()
        
        # Perform Word Embedding 
        x=self.embed(x)
        #x = x.view(batch_size,timesteps,embed_size)
        x,_=self.lstm(x)
        # (batch_size*timesteps, hidden_size)
        x=x.contiguous().view(-1, hidden_size)
        x=self.drop(x)
        out=self.fc(x)
        #Decode hidden states of all time steps
        out=out.view(input.shape[0],input.shape[1],vocab_size)
        
        return out[:,-1]
    
    
    

In [None]:

model=textGenerator(vocab_size,embed_size,hidden_size)
optimizer=torch.optim.Adam(model.parameters())
criterion=nn.CrossEntropyLoss()

train_loss=[]
train_l=0

model.train()

for epoch in range(10):
    t0 = time.time()

    for i in range(0 ,rep_tensor.shape[1]-timestep):
    
        inputs=rep_tensor[:,i:i+timestep]
        labels=rep_tensor[:,i+timestep]

        outputs=model(inputs)
        loss = criterion(outputs, labels.reshape(-1))
        
        train_l+=loss
        
        model.zero_grad()
        loss.backward()
        optimizer.step()
        acc=accuracy_score(torch.argmax(outputs,dim=1),labels)
        
    print('epoch {:} acc {:},  seconds {:.2f}'.format(epoch,acc,(time.time() - t0)/60))
        
        

    
    

KeyboardInterrupt: ignored

In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/textGen.pt') #save to model 

In [None]:
model.load_state_dict(torch.load('/content/drive/MyDrive/textGen.pt'))#load to model 


<All keys matched successfully>

In [26]:
text_n=np.concatenate(np.array(lines[-10:],dtype=object))
for i,s in  enumerate(text_n):
    text_n[i]=one_hot_dict[preprocessing(s)]

text_n

array(['18', '51', '575', '608', '22', '23', '5', '2573', '2393', '1637',
       '3', '5', '228'], dtype='<U6')

In [27]:
model.eval()
with torch.no_grad():

  with open('/content/drive/MyDrive/result.txt','w'):

    input=torch.tensor(text_n.astype('int').reshape(1,-1))
    for i in range(1000):

        output=model(input)

        output_item=torch.argmax(output,dim=1)

        output=torch.cat((input.reshape(1,-1),output_item.reshape(1,-1) ),1).reshape(1,-1)

        input=output.clone()

    listt=[]
    for i in output[0]:
      listt.append(word_list[i-1]+' ')
    end=' '.join(listt)


In [28]:
end

'her  own  child  life  ,  and  the  happy  summer  days  .  the  end  of  the  party  went  back  to  the  game  .  chapter  ix  .  the  mock  turtle  s  story  you  can  t  think  how  glad  i  am  to  go  in  ?  she  went  on  ,  what  am  i  to  do  with  this  creature  when  i  get  it  home  ?  when  it  grunted  again  ,  so  violently  ,  that  she  looked  down  into  its  face  in  some  alarm  .  this  time  there  could  be  no  sort  of  chance  of  her  ever  getting  out  of  the  room  again  ,  no  wonder  she  felt  unhappy  .  it  was  much  pleasanter  at  home  ,  thought  poor  alice  ,  when  one  wasn  t  the  look  of  the  lobster  quadrille  ?  it  s  the  most  curious  thing  ,  and  alice  was  too  much  frightened  to  say  a  word  ,  but  she  did  not  venture  to  go  near  the  house  till  she  had  brought  herself  down  to  nine  inches  high  .  chapter  vi  .  pig  and  pepper  for  a  minute  or  two  she  stood  looking  at  the  house  ,  

In [None]:
len(end.split())

524

In [25]:
end




'failure  thought  snatch  think  no  cant  tunnel  no  considering  shoes  it  happened  sort  seemed  sister  question  air  altogether  get  once  no  learnt  mabel  worth  ropewill  one  seemed  made  down  wondered  measure  having  show  learnt  care  much  home  ending  having  is  mice  learnt  pleaded  aloud  rabbithole  arms  was  quietly  sister  lap  having  through  get  duchess  what  rabbithole  curiouser  angry  having  rabbithole  savage  occasionally  alternately  having  rabbithole  iii  cried  adding  no  be  worth  seemed  what  end  ache  notebook  well  remarkable  beginning  thought  escape  frightened  aloud  was  thought  girl  altered  hot  finger  others  learnt  pop  world  time  sure  rabbithole  doubtful  dried  aloud  having  how  bank  get  duchess  what  end  ache  notebook  well  off  trotting  filled  every  likely  ask  hoarse  climb  dipped  sitting  rabbithole  judge  dates  no  the  driest  having  dipped  respectful  was  for  pop  time  well  h