In [60]:
import torch
import torch.utils.data.dataloader
import torch.utils.data.dataset
import torch.nn as nn

In [61]:
class Config_class:
    
    def __init__(self,config):
        self.config = config


class LM(nn.Module):
    
    def __init__(self,config):
        super().__init__()
        self.config = config
        # This will tak token ids an make enbedding out of it
        self.embedding = nn.Embedding(num_embeddings=self.config['vocab_size'],embedding_dim=self.config['embedding'],padding_idx=989)
        self.LSTM = nn.LSTM(num_layers=self.config['num_layers'],input_size=self.config['embedding'],hidden_size=self.config['hidden'])
        self.convert_vocab = nn.Linear(in_features=self.config['hidden'],out_features=self.config['vocab_size'])
        self.softmax = nn.Softmax(dim=1)
    def forward(self,token_seq):
        x = self.embedding(token_seq)
        out,_ = self.LSTM(x) # (h_n,c_n)
        convert_output = self.convert_vocab(out)
        prob = self.softmax(convert_output)
        return prob
    

### Loading dataset and train test split

In [62]:
pp_text = open('../data/clean_pp.txt').readlines() 

In [63]:
import random
from random import shuffle
random.seed(42)
shuffle(pp_text)
l = len(pp_text)
trl = int(0.7*l)
vrl = int(0.15*l)
train = pp_text[:trl]
val = pp_text[trl:trl+vrl]
test = pp_text[trl+vrl:]

### Making vocabulary Reserve for [PAD] , [UNK] , [STR] , [END]

In [64]:
vocab = set()
for sent in train:
    for word in sent.split():
        vocab.add(word)

In [65]:
len(vocab)

5925

In [66]:
res = 4
word_idx = {k:v+res for (v,k) in enumerate(vocab)}
word_idx['[PAD]'] = 0
word_idx['[STR]'] = 1
word_idx['[END]'] = 2
word_idx['[UNK]'] = 3

In [67]:
vocab = set(word_idx.keys())

In [68]:
len(vocab)

5929

In [69]:
idx_word =  { v:k for k,v in word_idx.items() }

In [70]:
idx_word[3]

'[UNK]'

In [71]:
def tokenizer(text):
    tokens = []
    text = text.strip('\n')
    text = idx_word[1]+' '+text+' '+idx_word[2]
    # print(text)
    for word in text.split():
        if word in word_idx.keys():
            tokens.append(word_idx[word])
        else:
            tokens.append(word_idx['[UNK]']) # update []
    return tokens

In [72]:
from torch.utils.data import Dataset
class Dataset(Dataset):
    def __init__(self,data,vocab_size):
        self.data = data
        self.vocab_size = vocab_size
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        # print(self.data[idx])
        out = tokenizer(self.data[idx])
        inputs = out[:-1]# Last token is [END] will not include in data
        labels = out[1:]# First token is [STR] will never be predected
        # print(out)
        return torch.tensor(inputs),torch.nn.functional.one_hot(torch.tensor(labels),num_classes=self.vocab_size)

In [73]:
train_dataset = Dataset(train,len(vocab))
val_dataset = Dataset(val,len(vocab))
test_dataset = Dataset(test,len(vocab))

In [74]:
a,b = train_dataset[0]

In [75]:
a.shape

torch.Size([61])

In [76]:
b.shape

torch.Size([61, 5929])

In [77]:
len(train_dataset)

1553

In [78]:
len(val)

332

In [79]:
len(test_dataset)

334

In [80]:
from torch.utils.data import DataLoader
train_dl =  DataLoader(train_dataset,shuffle=True)
val_dl = DataLoader(val_dataset)
test_dl = DataLoader(test_dataset)

In [81]:
LM_config = Config_class({
    'vocab_size': len(vocab),
    'embedding':128,
    'hidden':256,
    'lr':0.0001,
    'num_layers':1
})

In [82]:
LM_config.config

{'vocab_size': 5929,
 'embedding': 128,
 'hidden': 256,
 'lr': 0.0001,
 'num_layers': 1}

In [83]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
LM_1 = LM(config=LM_config.config)
LM_1.to(device)
loss = nn.CrossEntropyLoss()
optmizer = torch.optim.Adam(params = LM_1.parameters())

In [91]:
def validate(dataloader, model, loss_fn,which='Validation'):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    preplex = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device).to(torch.float32)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            preplex += preplixity(y,pred)
            # correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    preplex /= size

    print(f"{which} Preplexity: \n : {(preplex):>0.5f}, Avg loss: {test_loss:>8f} \n")

In [95]:
def preplixity(ground_truth,predicted):
    # print("Ground truth , Predictions",ground_truth.shape,predicted.shape)
    index_gt = torch.argmax(ground_truth,dim=2)
    # print("Ground truth Index: ",index_gt)
    row = torch.arange(index_gt.shape[1]).to(device)
    probablity = predicted[0,row,index_gt]
    # print(probablity,probablity.shape)
    power_p = torch.pow(probablity,-1/probablity.shape[1])
    # print(power_p,power_p.shape)
    preplex = torch.prod(power_p)
    # print(preplex.shape)
    # print(preplex)
    return preplex

    

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    preplex = 0
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device).to(torch.float32)
        pred = model(X)
        loss = loss_fn(pred, y)
        preplex += preplixity(y,pred)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # break
        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    preplex /= size
    print(f"Train Preplexity: \n : {(preplex):>0.3f}")

In [96]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, LM_1, loss, optmizer)
    validate(val_dl, LM_1, loss)
print("Done!")

Epoch 1
-------------------------------
loss: 0.310914  [    1/ 1553]
loss: 0.000279  [  101/ 1553]
loss: 0.026408  [  201/ 1553]
loss: 0.010933  [  301/ 1553]
loss: 0.002659  [  401/ 1553]
loss: 0.025449  [  501/ 1553]
loss: 0.010780  [  601/ 1553]
loss: 0.010211  [  701/ 1553]
loss: 0.003753  [  801/ 1553]
loss: 0.013730  [  901/ 1553]
loss: 0.100449  [ 1001/ 1553]
loss: 0.006877  [ 1101/ 1553]
loss: 0.094690  [ 1201/ 1553]
loss: 0.000279  [ 1301/ 1553]
loss: 0.000279  [ 1401/ 1553]
loss: 0.041520  [ 1501/ 1553]
Train Preplexity: 
 : 2753.985
Validation Preplexity: 
 : 3547.36279, Avg loss: 0.035459 

Epoch 2
-------------------------------
loss: 0.000279  [    1/ 1553]
loss: 0.132817  [  101/ 1553]
loss: 0.029474  [  201/ 1553]
loss: 0.053709  [  301/ 1553]
loss: 0.024944  [  401/ 1553]
loss: 0.013766  [  501/ 1553]
loss: 0.024752  [  601/ 1553]
loss: 0.059752  [  701/ 1553]
loss: 0.044096  [  801/ 1553]
loss: 0.106627  [  901/ 1553]
loss: 0.033465  [ 1001/ 1553]
loss: 0.034014  [ 1