In [1]:
import torch
import torch.utils.data.dataloader
import torch.utils.data.dataset
import torch.nn as nn
import wandb

In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbss[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

### Loading dataset and train test split

In [3]:
pp_text = open('../data/clean_pp.txt').readlines() 
import random
from random import shuffle
random.seed(42)
shuffle(pp_text)
l = len(pp_text)
trl = int(0.7*l)
vrl = int(0.15*l)
train_sents = pp_text[:trl]
val_sents = pp_text[trl:trl+vrl]
test_sents = pp_text[trl+vrl:]

## Making Proper Tokenization

### Making vocabulary Reserve for [PAD] , [UNK] , [STR] , [END]

#### Load Glove embedding

In [4]:
vocab_embeddings = dict()
with open('../../Embedding/glove.6B.50d.txt','rt') as fi:
    full_content = fi.read().strip().split('\n')
for i in range(len(full_content)):
    i_word = full_content[i].split(' ')[0]
    i_embeddings = [float(val) for val in full_content[i].split(' ')[1:]]
    vocab_embeddings[i_word] = i_embeddings

In [5]:
vocab_dim = 50

#### Train Vocab

In [6]:
train_vocab = set()
for sent in train_sents:
    for word in sent.split():
        train_vocab.add(word)

In [7]:
train_vocab = list(train_vocab)
len(train_vocab)

5925

In [8]:
res = 4
word_idx = {k:v+res for (v,k) in enumerate(train_vocab)}
word_idx['[PAD]'] = 0
word_idx['[UNK]'] = 1
word_idx['[STR]'] = 2
word_idx['[END]'] = 3

train_vocab.insert(0,'[PAD]')
train_vocab.insert(1,'[UNK]')
train_vocab.insert(2,'[STR]')
train_vocab.insert(3,'[END]')

train_vocab[:5]

['[PAD]', '[UNK]', '[STR]', '[END]', 'extracts']

In [9]:
idx_word =  { v:k for k,v in word_idx.items() }
idx_word[0],idx_word[1],idx_word[2]

('[PAD]', '[UNK]', '[STR]')

In [10]:
from tqdm import tqdm
import numpy as np

## Embedding for SPECIAL Token 
unk_embedding = np.array([0 for i in range(vocab_dim)],dtype=np.float64) # Average of all tokens
start_embedding = np.random.rand(vocab_dim)
end_embedding = np.random.rand(vocab_dim)
pad_embedding = np.zeros(vocab_dim)

print(unk_embedding.shape,start_embedding.shape,pad_embedding.shape)


vocab_set = set(train_vocab)
glove_set = set(vocab_embeddings.keys())
vocab_interset = 0
interset = vocab_set.intersection(glove_set)

for word in tqdm(interset,desc='UNK Prep'):
    vocab_interset +=1
    unk_embedding += np.array(vocab_embeddings[word])

unk_embedding = unk_embedding/vocab_interset
print(len(vocab_set),vocab_interset,vocab_interset/(len(vocab_set)))

(50,) (50,) (50,)


UNK Prep: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5687/5687 [00:00<00:00, 209213.04it/s]

5929 5687 0.9591836734693877





In [11]:
len(interset)

5687

In [12]:
final_vocab = []
train_embedding = []
UNK_list = []
count = 0
for word in train_vocab:
    if word in glove_set:
        final_vocab.append(word)
        train_embedding.append(vocab_embeddings[word])
    elif word == '[PAD]':
        final_vocab.append(word)
        train_embedding.append(pad_embedding)
    elif word == '[UNK]':
        final_vocab.append(word)
        train_embedding.append(unk_embedding)
    elif word == '[STR]':
        final_vocab.append(word)
        train_embedding.append(start_embedding)
    elif word == '[END]':
        final_vocab.append(word)
        train_embedding.append(end_embedding)
    else: # Ever thing else is [UNK]:
        count+=1
        UNK_list.append(word)
        # train_embedding.append(unk_embedding)

In [13]:
word_idx = {k:v for (v,k) in enumerate(final_vocab)}
idx_word =  { v:k for k,v in word_idx.items() }

In [14]:
final_vocab[:15],train_vocab[:15]

(['[PAD]',
  '[UNK]',
  '[STR]',
  '[END]',
  'extracts',
  'failure',
  'witnessing',
  'xvii',
  'cost',
  'longer',
  'eleven',
  'appointment',
  'another',
  'spars',
  'comply'],
 ['[PAD]',
  '[UNK]',
  '[STR]',
  '[END]',
  'extracts',
  'failure',
  'witnessing',
  'xvii',
  'cost',
  'longer',
  'eleven',
  'appointment',
  'another',
  'spars',
  'comply'])

In [15]:
len(train_embedding),len(final_vocab)

(5691, 5691)

In [16]:
final_vocab =set(final_vocab)
def tokenizer(text):
    tokens = []
    text = text.strip('\n')
    text = idx_word[2]+' '+text+' '+idx_word[3]
    # print(text)
    for word in text.split():
        if word in final_vocab:
            tokens.append(word_idx[word])
        else:
            tokens.append(word_idx['[UNK]']) # update []
    return tokens

In [17]:
class LM(nn.Module):
    
    def __init__(self,config,embedding):
        super().__init__()
        self.config = config
        # This will tak token ids an make enbedding out of it
        # Embedding are from glove and should be frezzed from learning
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embedding).float())
        self.LSTM = nn.LSTM(num_layers=self.config['num_layers'],input_size=self.config['embedding'],hidden_size=self.config['hidden'])
        self.convert_vocab = nn.Linear(in_features=self.config['hidden'],out_features=self.config['vocab_size'])
        self.softmax = nn.Softmax(dim=1)
    def forward(self,token_seq):
        x = self.embedding(token_seq)
        out,_ = self.LSTM(x) # (h_n,c_n)
        convert_output = self.convert_vocab(out)
        prob = self.softmax(convert_output)
        return prob

words which are not in glove will become unk

In [18]:
from torch.utils.data import Dataset
class Dataset(Dataset):
    def __init__(self,data,vocab_size):
        self.data = data
        self.vocab_size = vocab_size
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        # print(self.data[idx])
        out = tokenizer(self.data[idx])
        inputs = out[:-1]# Last token is [END] will not include in data
        labels = out[1:]# First token is [STR] will never be predected
        # print(out)
        return torch.tensor(inputs),torch.nn.functional.one_hot(torch.tensor(labels),num_classes=self.vocab_size)

In [19]:
len(final_vocab)

5691

In [20]:
train_dataset = Dataset(train_sents,len(final_vocab))
val_dataset = Dataset(val_sents,len(final_vocab))
test_dataset = Dataset(test_sents,len(final_vocab))

In [21]:
a,b = train_dataset[0]

In [22]:
a.shape

torch.Size([61])

In [23]:
b.shape

torch.Size([61, 5691])

In [24]:
len(train_dataset)

1553

In [25]:
len(val_dataset)

332

In [26]:
len(test_dataset)

334

In [27]:
from torch.utils.data import DataLoader
train_dl =  DataLoader(train_dataset,shuffle=True)
val_dl = DataLoader(val_dataset)
test_dl = DataLoader(test_dataset)

In [36]:
LM_config = {
    'epoch':10,
    'vocab_size': len(final_vocab),
    'embedding':50,
    'hidden':100,
    'lr':0.00001,
    'num_layers':1
}

In [37]:
LM_config

{'epoch': 10,
 'vocab_size': 5691,
 'embedding': 50,
 'hidden': 100,
 'lr': 1e-05,
 'num_layers': 1}

In [38]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
LM_1 = LM(config=LM_config,embedding=np.array(train_embedding))
LM_1.to(device)
loss = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params = LM_1.parameters(),lr=LM_config['lr'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=opt,factor=0.1,threshold=1e-01,patience=1)

In [39]:
def validate(dataloader, model, loss_fn,which='Validation'):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    preplex = 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device).to(torch.float32)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            preplex += preplixity(y,pred)
            # correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    preplex /= size

    print(f"{which} Preplexity: \n : {(preplex):>0.5f}, Avg loss: {test_loss:>8f} \n")
    return (preplex,test_loss)

In [40]:
def preplixity(ground_truth,predicted):
    # print("Ground truth , Predictions",ground_truth.shape,predicted.shape)
    index_gt = torch.argmax(ground_truth,dim=2)
    # print("Ground truth Index: ",index_gt)
    row = torch.arange(index_gt.shape[1]).to(device)
    probablity = predicted[0,row,index_gt]
    # print(probablity,probablity.shape)
    power_p = torch.pow(probablity,-1/probablity.shape[1])
    # print(power_p,power_p.shape)
    preplex = torch.prod(power_p)
    return preplex


def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    preplex = 0
    avg_loss = 0 
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device).to(torch.float32)
        pred = model(X)
        loss = loss_fn(pred, y)
        preplex += preplixity(y,pred)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # break
        avg_loss += loss.item()
        if batch % 300 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    preplex /= size
    avg_loss /= size
    print(f"\n Train Preplexity: {(preplex):>0.3f} \n loss: {avg_loss:>7f} ")
    return (preplex,avg_loss)

In [43]:
# Implement early stoping
# Implement Learning rate schedular
# Implement something else
# Implement Batch size concept

In [42]:
epochs = LM_config['epoch']
with wandb.init(project="pytorch-demo", config=LM_config):
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        tp,tl = train(train_dl, LM_1, loss,opt)
        vp,vl = validate(val_dl, LM_1, loss)
        scheduler.step(vp)
        wandb.log({'validate_preplex':vp , 'validate_loss':vl,'train_preplex':tp, 'train_loss':tl})
    print("Done!")

Epoch 1
-------------------------------
loss: 0.021069  [    1/ 1553]
loss: 0.054107  [  301/ 1553]
loss: 0.000580  [  601/ 1553]
loss: 0.294791  [  901/ 1553]
loss: 0.001889  [ 1201/ 1553]
loss: 0.025106  [ 1501/ 1553]

 Train Preplexity: 60.196 
 loss: 0.048914 
Validation Preplexity: 
 : 49.91350, Avg loss: 0.037964 

Epoch 2
-------------------------------
loss: 0.003475  [    1/ 1553]
loss: 0.059711  [  301/ 1553]
loss: 0.014885  [  601/ 1553]
loss: 0.079935  [  901/ 1553]
loss: 0.034370  [ 1201/ 1553]
loss: 0.009828  [ 1501/ 1553]

 Train Preplexity: 60.034 
 loss: 0.048913 
Validation Preplexity: 
 : 49.77551, Avg loss: 0.037964 

Epoch 3
-------------------------------
loss: 0.000571  [    1/ 1553]
loss: 0.059710  [  301/ 1553]
loss: 0.014884  [  601/ 1553]
loss: 0.041381  [  901/ 1553]
loss: 0.055966  [ 1201/ 1553]
loss: 0.179573  [ 1501/ 1553]

 Train Preplexity: 59.863 
 loss: 0.048913 
Validation Preplexity: 
 : 49.62623, Avg loss: 0.037963 

Epoch 4
-----------------------

0,1
train_loss,█▆▃▁▁▁▁▁▁▁
train_preplex,█▆▃▁▁▁▁▁▁▁
validate_loss,█▅▂▁▁▁▁▁▁▁
validate_preplex,█▅▂▁▁▁▁▁▁▁

0,1
train_loss,0.04891
train_preplex,59.72309
validate_loss,0.03796
validate_preplex,49.59132
