In [150]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

### Preprocessing

In [151]:
def read_prepare_data(file_path):
    with open(file_path, 'r') as f:
        data = f.read().strip()
        data = re.sub("[^a-zA-Z0-9 ,.:? \n]", "", data)
        data = data.lower().split('\n')
        data = [line for line in data if line]
    return data

file_path = "/kaggle/input/shekspier-data/Shekspier.txt"
data = read_prepare_data(file_path)
print(data[0:4])

['first citizen:', 'before we proceed any further, hear me speak.', 'all:', 'speak, speak.']


### Tokenizer

In [152]:
def init_tokenizer():
    # initialize the tokenizer
    tkn = Tokenizer()
    tkn.fit_on_texts(data)
    # +1 to handle the words the tokenizer has not seen before
    nmu_unique_words = len(tkn.word_index) + 1
    return (tkn, nmu_unique_words)
tokenizer, nmu_unique_words = init_tokenizer()

def tokenize_data_padding(raw_data, tokenizer, max_length_input=None):
    seq = tokenizer.texts_to_sequences(raw_data)
    seq = [sample for sample in seq if len(sample) >= 2]
    max_length = max([len(sample) for sample in seq])
    if max_length_input != None:
        max_length = max_length_input
    data = pad_sequences(seq, padding = 'pre', maxlen= max_length)
    return data
data = tokenize_data_padding(data, tokenizer)

### Define X and y, and apply one-hot encoding for y

In [153]:
X = data[:,:-1]
y = data[:,-1]

input_size = X.shape[1]
y = to_categorical(y, nmu_unique_words)

In [154]:
X.shape, y.shape

((27307, 15), (27307, 12751))

### Customize dataset and Data loader

In [155]:
"""
inherit from Dataset to make sure it is
compatible with Pytorch training process
"""
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X) # Return total number of samples

    def __getitem__(self, idx):

        X_sample = self.X[idx,:]
        y_sample = self.y[idx,:]
        return X_sample, y_sample

In [156]:
train_data = CustomDataset(X, y)
train_loader = DataLoader(dataset=train_data, batch_size=128, shuffle=True, drop_last=True, pin_memory=True)

In [157]:
input_size = 15
vocab_size = nmu_unique_words
embedding_dim = 100
hidden_size = 128
num_layers = 1
output_size = nmu_unique_words
learning_rate = 0.001
num_epochs = 100

In [158]:
is_cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda_available else "cpu")
device

device(type='cuda')

### Build the model

In [159]:
class LanguageModel(nn.Module):
    def __init__(self,vocab_size, embedding_dim, hidden_size, num_layers, output_size):
        super(LanguageModel, self).__init__()
        self.input_size = input_size        
        self.hidden_size = hidden_size        
        self.num_layers = num_layers
        self.output_size = output_size        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        self.embedding_layer = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.dropout = nn.Dropout(0.1)
        self.lstm = nn.LSTM(self.embedding_dim, self.hidden_size, self.num_layers)
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, x):
        embedded_x = self.embedding_layer(x)
        h0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size).to(device)
        output,_ = self.lstm(embedded_x, (h0.detach(), c0.detach()))
        output = self.dropout(output[:,-1,:])
        output = self.fc(output)
        return output
    
model = LanguageModel(vocab_size, embedding_dim, hidden_size, num_layers, output_size)
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

### Train 

In [160]:
model.to(device)
train_acc = []
train_loss = []
for epoch in range(num_epochs):
    model.train()
    for i, (sentence, label) in enumerate(train_loader):
        sentence = sentence.to(device)
        label = label.to(device)
        output = model(sentence)
        l = loss(output, label)
        l.backward()
        train_loss.append(l)
        optimizer.step()
        optimizer.zero_grad()
    loss_per_epoch = sum(train_loss)/len(train_loss)
    print(f"epoch: {epoch}/{num_epochs} / loss: {loss_per_epoch:.2f}")
    
        

epoch: 0/100 / loss: 8.00
epoch: 1/100 / loss: 7.56
epoch: 2/100 / loss: 7.29
epoch: 3/100 / loss: 7.09
epoch: 4/100 / loss: 6.92
epoch: 5/100 / loss: 6.76
epoch: 6/100 / loss: 6.62
epoch: 7/100 / loss: 6.49
epoch: 8/100 / loss: 6.37
epoch: 9/100 / loss: 6.25
epoch: 10/100 / loss: 6.15
epoch: 11/100 / loss: 6.05
epoch: 12/100 / loss: 5.95
epoch: 13/100 / loss: 5.86
epoch: 14/100 / loss: 5.78
epoch: 15/100 / loss: 5.70
epoch: 16/100 / loss: 5.62
epoch: 17/100 / loss: 5.55
epoch: 18/100 / loss: 5.49
epoch: 19/100 / loss: 5.42
epoch: 20/100 / loss: 5.37
epoch: 21/100 / loss: 5.31
epoch: 22/100 / loss: 5.26
epoch: 23/100 / loss: 5.21
epoch: 24/100 / loss: 5.16
epoch: 25/100 / loss: 5.12
epoch: 26/100 / loss: 5.07
epoch: 27/100 / loss: 5.03
epoch: 28/100 / loss: 5.00
epoch: 29/100 / loss: 4.96
epoch: 30/100 / loss: 4.92
epoch: 31/100 / loss: 4.89
epoch: 32/100 / loss: 4.86
epoch: 33/100 / loss: 4.83
epoch: 34/100 / loss: 4.80
epoch: 35/100 / loss: 4.77
epoch: 36/100 / loss: 4.75
epoch: 37/1

### Test

In [169]:
def word_generator(input_text_arrays, num_pred_words, tokenizer):
    for i in range(num_pred_words):
        my_sample = tokenize_data_padding(input_text_arrays, tokenizer, max_length_input=15)
        my_sample = torch.tensor(my_sample).to(device)
        model.eval()
        prediction = model(my_sample)
        prediction_index = torch.argmax(prediction).item()

        for k in tokenizer.word_index.keys():
            if tokenizer.word_index[k] == prediction_index:
                input_text_arrays[0] = input_text_arrays[0] + ' ' + k
    
    return input_text_arrays
    

my_sample = ["all you need to know is"]
word_generator(my_sample, 5, tokenizer)

['all you need to know is this night approaches for me']