Output --> a layer with one neuron per vocabulary word
TODO: take a closer look at the LSTM layer


In [None]:
import string
from keras.preprocessing.text import Tokenizer
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

# Preprocessing

In [None]:

def clean_doc(doc):
    """Split raw text into lowercase, purely alphabetic word tokens.

    Double hyphens are treated as word separators, ASCII punctuation is
    stripped from every whitespace-delimited token, and any token that is
    not entirely alphabetic after stripping (numbers, empty strings, mixed
    alphanumerics) is discarded.

    Args:
        doc: The raw document text.

    Returns:
        A list of cleaned tokens in their original order.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in doc.replace('--', ' ').split():
        word = raw.translate(strip_punctuation)
        # isalpha() is False for '' as well, so tokens that were pure
        # punctuation disappear here too
        if word.isalpha():
            cleaned.append(word.lower())
    return cleaned

# load doc into memory
def load_doc(filename):
    """Return the full contents of the text file at *filename* as one string.

    The file is opened read-only in text mode with the platform's default
    encoding.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's entire text.

    Raises:
        OSError: If the file cannot be opened or read (for example
            FileNotFoundError when the path does not exist).
    """
    # the context manager closes the handle even when read() raises
    with open(filename, 'r') as file:
        return file.read()


# Corpus file; a relative path, so it is resolved against the current working directory.
in_filename = 'republic_plato_dataset.txt'
# Read the raw text, then reduce it to a list of cleaned, lowercase word tokens.
doc = load_doc(in_filename)
tokens = clean_doc(doc)
# print(tokens[:200])
# Report the corpus size and the number of distinct tokens.
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

In [None]:

# organize into sequences of tokens
length = 50 + 1  # 50 context words followed by the 1 word to predict
# Slide a window of `length` tokens across the corpus one token at a time and
# store each window as a space-joined line. The stop value is
# len(tokens) + 1 because range() excludes it: without the + 1 the final
# window (the one ending on the very last token) would never be produced.
# When the corpus has fewer than `length` tokens the result is an empty list.
sequences = [
    ' '.join(tokens[end - length:end])
    for end in range(length, len(tokens) + 1)
]
print('Total Sequences: %d' % len(sequences))

In [None]:
# len(sequences[1].split()),sequences[1],type(sequences[1]),type(sequences) #every sequence has 51 words 50+1

In [None]:
# Fit a Keras Tokenizer on the text lines so every distinct word gets an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences) # from here on `sequences` holds lists of integer word ids instead of text lines

In [None]:
# len(sequences[0]),len(sequences),sequences[0]

# Building Model

In [271]:

class LSTM(nn.Module):
    """Next-word predictor: embedding -> LSTM -> linear -> log-softmax.

    Args:
        embedding_dim: Length of the vector learned for each word id.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of distinct word ids. It sets both the number of
            rows in the embedding table and the width of the output layer,
            so every id fed in or used as a target must be < vocab_size.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        # one trainable vector of size embedding_dim per word id
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # consumes one embedding per time step, emits hidden states of size hidden_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # maps a hidden state to one score per vocabulary entry
        self.hidden2next = nn.Linear(hidden_dim, vocab_size)

    def forward(self, sentence):
        """Score every vocabulary entry as the word that follows *sentence*.

        Args:
            sentence: Long tensor of word ids, either 0-dim (a single word)
                or 1-D (a sequence of words in reading order).

        Returns:
            Tensor of shape (1, vocab_size) holding log-probabilities for
            the next word, computed from the LSTM output at the last time
            step.

        Raises:
            ValueError: If *sentence* is empty or has more than one
                dimension (batched input is not supported).
        """
        if sentence.dim() > 1:
            raise ValueError(
                "expected a single word id or a 1-D sequence of word ids, "
                f"got a tensor with {sentence.dim()} dimensions"
            )
        if sentence.numel() == 0:
            raise ValueError("cannot predict the next word of an empty sequence")

        embeds = self.word_embeddings(sentence)
        # nn.LSTM (batch_first=False) wants (seq_len, batch, input_size).
        # Keeping the embedding width as the last axis lets a 0-dim input
        # become seq_len=1 and a 1-D input become seq_len=len(sentence);
        # flattening everything into one step would break for sequences.
        lstm_in = embeds.view(-1, 1, embeds.size(-1))
        lstm_out, _ = self.lstm(lstm_in)
        # lstm_out[-1] is the output after the whole sequence was read: (1, hidden_dim)
        final_layer = self.hidden2next(lstm_out[-1])
        # log-softmax pairs with nn.NLLLoss to give a cross-entropy objective
        next_word_scores = F.log_softmax(final_layer, dim=1)
        return next_word_scores
embedding_dim = 100
hidden_dim = 100
# Keras' Tokenizer numbers words starting at 1 (0 is reserved), so the ids in
# `sequences` run from 1 to len(tokenizer.word_index) inclusive. The embedding
# table and the output layer therefore need len(word_index) + 1 slots;
# sizing them with len(set(tokens)) leaves the largest id out of range.
vocab_size = len(tokenizer.word_index) + 1
model = LSTM(embedding_dim, hidden_dim, vocab_size)
# negative log-likelihood on log-softmax outputs is the cross-entropy loss
loss_function = nn.NLLLoss()
# plain SGD; the training loop steps it once per sequence
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [None]:
# Number of distinct tokens and total number of tokens in the corpus.
len(set(tokens)),len(tokens)

# Training

In [273]:
# Mean training loss of each epoch, stored as plain floats so the autograd
# graph of every step can be freed and the values can be plotted directly.
loss_epochs = []
train_sequences = sequences[:1000]  # train on the first 1000 sequences only
for epoch in range(5):  # running the model for 5 epochs
    running_loss = 0.0
    for seq in train_sequences:
        # gradients accumulate across backward() calls, so reset them per step
        model.zero_grad()
        sentence_in = torch.tensor(seq[:50], dtype=torch.long)  # first 50 words are the context
        next_word = torch.tensor(seq[50], dtype=torch.long)  # 51st word is the target

        # Every model(word) call starts from a fresh recurrent state, so when
        # the context is fed one word per call only the final call influences
        # the loss; the earlier calls are wasted work. Calling once with the
        # last context word yields the same scores and gradients.
        # NOTE(review): this means the prediction depends on the last context
        # word only. Passing the whole context (`model(sentence_in)`) needs a
        # forward() that accepts a sequence - confirm before switching.
        next_word_score = model(sentence_in[-1])

        loss = loss_function(next_word_score[0], next_word)  # computing loss
        loss.backward()  # backprop to compute the gradients
        optimizer.step()  # update the model parameters
        running_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError when there is nothing to train on
    loss_epochs.append(running_loss / max(len(train_sequences), 1))
    print(f"epoch {epoch} completed !")  

epoch 0 completed !
epoch 1 completed !
epoch 2 completed !
epoch 3 completed !
epoch 4 completed !


In [296]:
test = sequences[456]
# show the full sample (context words plus the true next word) as text
print(tokenizer.sequences_to_texts([test])[0])
with torch.no_grad():  # inference only, no gradients needed
    inputs = torch.tensor(test[:50], dtype=torch.long)
    # Every model(word) call starts from a fresh recurrent state, so feeding
    # the context one word per call leaves only the final call's scores;
    # a single call on the last context word produces the same result.
    # NOTE(review): the prediction therefore depends on the last context word
    # only. Passing the whole context (`model(inputs)`) needs a forward()
    # that accepts a sequence - confirm before switching.
    tag_scores = model(inputs[-1])
    idx = torch.argmax(tag_scores[0])  # id with the highest log-probability
    # invert the tokenizer's word -> id mapping to turn the id back into a word
    temp = {val: key for key, val in tokenizer.word_index.items()}
    # id 0 is never assigned to a word by the tokenizer, so fall back to a
    # placeholder instead of raising KeyError if it ever wins the argmax
    print('Prediction is', temp.get(int(idx), '<unknown>'))

of pleasures and desires into necessary and unnecessary these and other great forms of thought are all of them to be found in the republic and were probably first invented by plato the greatest of all logical truths and the one of which writers on philosophy are most apt to lose
Prediction is be


'imaginary'