In [1]:
import torch
import torch.utils.data.dataloader
import torch.utils.data.dataset
import torch.nn as nn
import wandb

In [2]:
# wandb.login()

### Loading dataset and train test split

In [4]:
pp_text = open('../data/clean_pp.txt').readlines() 
import random
from random import shuffle
random.seed(42)
shuffle(pp_text)
l = len(pp_text)
trl = int(0.7*l)
vrl = int(0.15*l)
train_sents = pp_text[:trl]
val_sents = pp_text[trl:trl+vrl]
test_sents = pp_text[trl+vrl:]

## Making Proper Tokenization

### Making vocabulary Reserve for [PAD] , [UNK] , [STR] , [END]

#### Load Glove embedding

In [5]:
vocab_embeddings = dict()
with open('../../Embedding/glove.6B.50d.txt','rt') as fi:
    full_content = fi.read().strip().split('\n')
for i in range(len(full_content)):
    i_word = full_content[i].split(' ')[0]
    i_embeddings = [float(val) for val in full_content[i].split(' ')[1:]]
    vocab_embeddings[i_word] = i_embeddings

In [6]:
vocab_dim = 50

#### Train Vocab

In [7]:
train_vocab = set()
for sent in train_sents:
    for word in sent.split():
        train_vocab.add(word)

In [8]:
train_vocab = list(train_vocab)
len(train_vocab)

5925

In [9]:
res = 4
word_idx = {k:v+res for (v,k) in enumerate(train_vocab)}
word_idx['[PAD]'] = 0
word_idx['[UNK]'] = 1
word_idx['[STR]'] = 2
word_idx['[END]'] = 3

train_vocab.insert(0,'[PAD]')
train_vocab.insert(1,'[UNK]')
train_vocab.insert(2,'[STR]')
train_vocab.insert(3,'[END]')

train_vocab[:5]

['[PAD]', '[UNK]', '[STR]', '[END]', 'noise']

In [10]:
idx_word =  { v:k for k,v in word_idx.items() }
idx_word[0],idx_word[1],idx_word[2]

('[PAD]', '[UNK]', '[STR]')

In [11]:
from tqdm import tqdm
import numpy as np

## Embedding for SPECIAL Token 
unk_embedding = np.array([0 for i in range(vocab_dim)],dtype=np.float64) # Average of all tokens
start_embedding = np.random.rand(vocab_dim)
end_embedding = np.random.rand(vocab_dim)
pad_embedding = np.zeros(vocab_dim)

print(unk_embedding.shape,start_embedding.shape,pad_embedding.shape)


vocab_set = set(train_vocab)
glove_set = set(vocab_embeddings.keys())
vocab_interset = 0
interset = vocab_set.intersection(glove_set)

for word in tqdm(interset,desc='UNK Prep'):
    vocab_interset +=1
    unk_embedding += np.array(vocab_embeddings[word])

unk_embedding = unk_embedding/vocab_interset
print(len(vocab_set),vocab_interset,vocab_interset/(len(vocab_set)))

(50,) (50,) (50,)


UNK Prep: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5687/5687 [00:00<00:00, 208769.92it/s]

5929 5687 0.9591836734693877





In [12]:
len(interset)

5687

In [13]:
final_vocab = []
train_embedding = []
UNK_list = []
count = 0
for word in train_vocab:
    if word in glove_set:
        final_vocab.append(word)
        train_embedding.append(vocab_embeddings[word])
    elif word == '[PAD]':
        final_vocab.append(word)
        train_embedding.append(pad_embedding)
    elif word == '[UNK]':
        final_vocab.append(word)
        train_embedding.append(unk_embedding)
    elif word == '[STR]':
        final_vocab.append(word)
        train_embedding.append(start_embedding)
    elif word == '[END]':
        final_vocab.append(word)
        train_embedding.append(end_embedding)
    else: # Ever thing else is [UNK]:
        count+=1
        UNK_list.append(word)
        # train_embedding.append(unk_embedding)

In [14]:
word_idx = {k:v for (v,k) in enumerate(final_vocab)}
idx_word =  { v:k for k,v in word_idx.items() }

In [15]:
final_vocab[:15],train_vocab[:15]

(['[PAD]',
  '[UNK]',
  '[STR]',
  '[END]',
  'noise',
  'sincerity',
  'duties',
  'astonished',
  'devoid',
  'pronounce',
  'awoke',
  'devoted',
  'leave',
  'dispense',
  'apologizing'],
 ['[PAD]',
  '[UNK]',
  '[STR]',
  '[END]',
  'noise',
  'sincerity',
  'duties',
  'astonished',
  'devoid',
  'pronounce',
  'awoke',
  'devoted',
  'leave',
  'dispense',
  'apologizing'])

In [16]:
len(train_embedding),len(final_vocab)

(5691, 5691)

In [17]:
final_vocab =set(final_vocab)
def tokenizer(text):
    tokens = []
    text = text.strip('\n')
    text = idx_word[2]+' '+text+' '+idx_word[3]
    # print(text)
    for word in text.split():
        if word in final_vocab:
            tokens.append(word_idx[word])
        else:
            tokens.append(word_idx['[UNK]']) # update []
    return tokens

In [18]:
class LM(nn.Module):
    
    def __init__(self,config,embedding):
        super().__init__()
        self.config = config
        # This will tak token ids an make enbedding out of it
        # Embedding are from glove and should be frezzed from learning
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embedding).float())
        self.LSTM = nn.LSTM(num_layers=self.config['num_layers'],input_size=self.config['embedding'],hidden_size=self.config['hidden'])
        self.convert_vocab = nn.Linear(in_features=self.config['hidden'],out_features=self.config['vocab_size'])
        self.softmax = nn.Softmax(dim=1)
    def forward(self,token_seq):
        x = self.embedding(token_seq)
        out,_ = self.LSTM(x) # (h_n,c_n)
        convert_output = self.convert_vocab(out)
        prob = self.softmax(convert_output)
        return prob

words which are not in glove will become unk

In [19]:
from torch.utils.data import Dataset
class Dataset(Dataset):
    def __init__(self,data,vocab_size):
        self.data = data
        self.vocab_size = vocab_size
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        # print(self.data[idx])
        out = tokenizer(self.data[idx])
        inputs = out[:-1]# Last token is [END] will not include in data
        labels = out[1:]# First token is [STR] will never be predected
        # print(out)
        return torch.tensor(inputs),torch.nn.functional.one_hot(torch.tensor(labels),num_classes=self.vocab_size)

In [20]:
len(final_vocab)

5691

In [21]:
train_dataset = Dataset(train_sents,len(final_vocab))
val_dataset = Dataset(val_sents,len(final_vocab))
test_dataset = Dataset(test_sents,len(final_vocab))

In [22]:
a,b = train_dataset[0]

In [23]:
a.shape

torch.Size([61])

In [24]:
b.shape

torch.Size([61, 5691])

In [25]:
len(train_dataset)

1553

In [26]:
len(val_dataset)

332

In [27]:
len(test_dataset)

334

In [28]:
from torch.utils.data import DataLoader
train_dl =  DataLoader(train_dataset,shuffle=True)
val_dl = DataLoader(val_dataset)
test_dl = DataLoader(test_dataset)

In [29]:
x = torch.tensor([43,4455,546,5,55])

In [30]:
x.tolist()

[43, 4455, 546, 5, 55]

In [31]:
def token_sent(token):
    # print(token.shape)
    token = token.tolist()[0]
    # print(token)
    for i in token:
        print(idx_word[i],end=' ')

In [32]:
def validate(dataloader, model, loss_fn,which='Validation'):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    preplex = 0
    with torch.no_grad():
        for X, y in dataloader:
            # token_sent(X)
            X, y = X.to(device), y.to(device).to(torch.float32)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            preplex += preplixity(y,pred)
            # correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    preplex /= size

    print(f"{which} Preplexity: \n : {(preplex):>0.5f}, Avg loss: {test_loss:>8f} \n")
    return (preplex,test_loss)

In [33]:
def preplixity(ground_truth,predicted):
    # print("Ground truth , Predictions",ground_truth.shape,predicted.shape)
    index_gt = torch.argmax(ground_truth,dim=2)
    # print("Ground truth Index: ",index_gt)
    row = torch.arange(index_gt.shape[1]).to(device)
    probablity = predicted[0,row,index_gt]
    # print(probablity,probablity.shape)
    power_p = torch.pow(probablity,-1/probablity.shape[1])
    # print(power_p,power_p.shape)
    preplex = torch.prod(power_p)
    # print('\t',preplex.item())
    return preplex


def train(dataloader, model, loss_fn, optimizer,sch):
    size = len(dataloader.dataset)
    model.train()
    preplex = 0
    avg_loss = 0
    
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device).to(torch.float32)
        pred = model(X)
        loss = loss_fn(pred, y)
        preplex += preplixity(y,pred)
        optimizer.zero_grad()
        # if batch%10 == 9:
        loss.backward()
        optimizer.step()
        # break
        avg_loss += loss.item()
        if batch % 300 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    preplex /= size
    avg_loss /= size
    print(f"\n Train Preplexity: {(preplex):>0.3f} \n loss: {avg_loss:>7f} ")
    return (preplex,avg_loss)

In [33]:
# Implement early stoping
# Implement Learning rate schedular
# Implement something else
# Implement Batch size concept

In [34]:
LM_config = {
    'epoch':10,
    'vocab_size': len(final_vocab),
    'embedding':50,
    'hidden':100,
    'lr':0.00001,
    'num_layers':1
}

In [35]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
LM_1 = LM(config=LM_config,embedding=np.array(train_embedding))
LM_1.to(device)
loss = nn.CrossEntropyLoss()
opt = torch.optim.Adam(params = LM_1.parameters(),lr=LM_config['lr'])
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=opt,max_lr=0.001,steps_per_epoch=LM_config['epoch'],epochs=LM_config['epoch'])

In [36]:
float("inf")>2

True

In [37]:
best_p = float('inf')
epochs = LM_config['epoch']
with wandb.init(project="pytorch-demo", config=LM_config):
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        print(scheduler.get_last_lr()[0])
        tp,tl = train(train_dl, LM_1, loss,opt,scheduler)
        vp,vl = validate(val_dl, LM_1, loss)
        # scheduler.step(vp)
        wandb.log({'validate_preplex':vp,
                   'validate_loss':vl,
                   'train_preplex':tp, 
                   'train_loss':tl,
                   "lr": scheduler.get_last_lr()[0]
                  })
        scheduler.step()
        if vp < best_p:
            best_p = vp
            torch.save(LM_1.state_dict(),'../data/LM_5.pth')
            print("model saved")
    print("Done!")

[34m[1mwandb[0m: Currently logged in as: [33mbss[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1
-------------------------------
3.9999999999999996e-05
loss: 0.005859  [    1/ 1553]
loss: 0.017158  [  301/ 1553]
loss: 0.029256  [  601/ 1553]
loss: 0.071162  [  901/ 1553]
loss: 0.031794  [ 1201/ 1553]
loss: 0.099930  [ 1501/ 1553]

 Train Preplexity: 59.787 
 loss: 0.048913 
Validation Preplexity: 
 : 49.29222, Avg loss: 0.037962 

model saved
Epoch 2
-------------------------------
4.281378056590743e-05
loss: 0.000567  [    1/ 1553]
loss: 0.120519  [  301/ 1553]
loss: 0.014882  [  601/ 1553]
loss: 0.009824  [  901/ 1553]
loss: 0.108095  [ 1201/ 1553]
loss: 0.519451  [ 1501/ 1553]

 Train Preplexity: 58.699 
 loss: 0.048908 
Validation Preplexity: 
 : 47.98090, Avg loss: 0.037955 

model saved
Epoch 3
-------------------------------
5.122213325915845e-05
loss: 0.004633  [    1/ 1553]
loss: 0.080913  [  301/ 1553]
loss: 0.068259  [  601/ 1553]
loss: 0.014130  [  901/ 1553]
loss: 0.086847  [ 1201/ 1553]
loss: 0.004606  [ 1501/ 1553]

 Train Preplexity: 56.202 
 loss: 0.048897

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
lr,▁▁▁▂▂▃▄▅▇█
train_loss,███▇▇▅▄▃▂▁
train_preplex,▂▁▁▁▁▁▂▄▆█
validate_loss,███▇▆▅▄▃▂▁
validate_preplex,▁▁▁▁▁▂▃▄▅█

0,1
lr,0.00025
train_loss,0.04827
train_preplex,145.32487
validate_loss,0.03737
validate_preplex,164.10207


In [38]:
scheduler.get_last_lr()[0]

0.00029516394846410075

In [40]:
LM_1.load_state_dict(torch.load('../data/LM_5.pth'))

<All keys matched successfully>

In [41]:
LM_1.eval()

LM(
  (embedding): Embedding(5691, 50)
  (LSTM): LSTM(50, 100)
  (convert_vocab): Linear(in_features=100, out_features=5691, bias=True)
  (softmax): Softmax(dim=1)
)

In [42]:
def my_token_sent(token):
    # print(token.shape)
    token = token.tolist()[0]
    # print(token)
    out = ''
    for i in token:
        out += idx_word[i]+' '
    return out.strip()
def generate_p(dataloader, model, loss_fn,which='Validation_LM_2'):
    file = open(f'../data/{which}.txt','w')
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    preplex = 0
    with torch.no_grad():
        for X, y in dataloader:
            out = my_token_sent(X)
            X, y = X.to(device), y.to(device).to(torch.float32)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            pout = preplixity(y,pred) 
            preplex += pout
            file.write(f"{out}\t{pout}\n")
            # correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    preplex /= size
    file.close()
    print(f"{which} Preplexity: \n : {(preplex):>0.5f}, Avg loss: {test_loss:>8f} \n")
    return (preplex,test_loss)

In [44]:
for t in range(1):
    # print(f"Epoch {t+1}\n-------------------------------")
    vp,vl = generate_p(train_dl, LM_1, loss,which='train_LM_1')
    # print("Done!")

train_LM_1 Preplexity: 
 : 51.27480, Avg loss: 0.048829 



In [45]:
for t in range(1):
    # print(f"Epoch {t+1}\n-------------------------------")
    vp,vl = generate_p(test_dl, LM_1, loss,which='test_LM_1')
    # print("Done!")

test_LM_1 Preplexity: 
 : 47.60085, Avg loss: 0.043497 



In [45]:
data = {'test_data':test_sents,
 'word_idx':word_idx,
 'idx_word': idx_word,
 'final_vocab':final_vocab,
 'LM_config':LM_config,
 'embedding': np.array(train_embedding)}

In [46]:
import pickle
file = open('../data/LM_5_data.pkl','wb')
pickle.dump(data,file)
file.close()

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pickle
import pdb

def tokenizer(text,idx_word,word_idx,final_vocab):
    tokens = []
    text.lower()
    text = text.strip('\n')
    text = idx_word[2]+' '+text+' '+idx_word[3]
    # print(text)
    for word in text.split():
        if word in final_vocab:
            tokens.append(word_idx[word])
        else:
            tokens.append(word_idx['[UNK]']) # update []
    return tokens



class Dataset(Dataset):
    def __init__(self,data,vocab_size):
        self.data = data
        self.vocab_size = vocab_size
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        # print(self.data[idx])
        out = tokenizer(self.data[idx])
        inputs = out[:-1]# Last token is [END] will not include in data
        labels = out[1:]# First token is [STR] will never be predected
        # print(out)
        return torch.tensor(inputs),torch.nn.functional.one_hot(torch.tensor(labels),num_classes=self.vocab_size)




class LM(nn.Module):
    
    def __init__(self,config,embedding):
        super().__init__()
        self.config = config
        # This will tak token ids an make enbedding out of it
        # Embedding are from glove and should be frezzed from learning
        self.embedding = nn.Embedding.from_pretrained(torch.from_numpy(embedding).float())
        self.LSTM = nn.LSTM(num_layers=self.config['num_layers'],input_size=self.config['embedding'],hidden_size=self.config['hidden'])
        self.convert_vocab = nn.Linear(in_features=self.config['hidden'],out_features=self.config['vocab_size'])
        self.softmax = nn.Softmax(dim=1)
    def forward(self,token_seq):
        x = self.embedding(token_seq)
        out,_ = self.LSTM(x) # (h_n,c_n)
        convert_output = self.convert_vocab(out)
        prob = self.softmax(convert_output)
        return prob

def sent_probablity(ground_truth,pred):
    print("Ground truth , Predictions",ground_truth.shape,pred.shape)
    index_gt = torch.argmax(ground_truth,dim=1)
    
    print("Ground truth Index: ",index_gt)
    # pdb.set_trace()
    row = torch.arange(index_gt.shape[0]).to(device)
    probablity = pred[row,index_gt]
    print(probablity,probablity.shape)
    # print(power_p,power_p.shape)
    prob = torch.prod(probablity)
    return prob



device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

file = open('../data/LM_5_data.pkl','rb')
config = pickle.load(file)
final_vocab = config['final_vocab']
word_idx = config['word_idx']
idx_word = config['idx_word']
embedding = config['embedding']
LM_config = config['LM_config']
#test_data = config['test_data'] # list of strings
checkpoint = torch.load('../data/LM_1.pth')
LM_1 = LM(LM_config,embedding).to(device)
LM_1.load_state_dict(checkpoint)
LM_1.eval()

LM(
  (embedding): Embedding(5691, 50)
  (LSTM): LSTM(50, 100)
  (convert_vocab): Linear(in_features=100, out_features=5691, bias=True)
  (softmax): Softmax(dim=1)
)

In [9]:
def preplixity(ground_truth,predicted):
    print("Ground truth , Predictions",ground_truth.shape,predicted.shape)
    index_gt = torch.argmax(ground_truth,dim=1)
    # print("Ground truth Index: ",index_gt)
    row = torch.arange(index_gt.shape[0]).to(device)
    probablity = predicted[row,index_gt]
    # print(probablity,probablity.shape)
    power_p = torch.pow(probablity,-1/probablity.shape[0])
    # print(power_p,power_p.shape)
    preplex = torch.prod(power_p)
    return preplex

for sent in val_sents:
    tokens = tokenizer(sent,idx_word,word_idx,final_vocab)
    tok_sent = torch.tensor(tokens[:-1]).to(device)
    ground_truth = torch.nn.functional.one_hot(torch.tensor(tokens[1:]),num_classes=len(final_vocab)).to(device)
    pred = LM_1(tok_sent)
    print(sent,' ',preplixity(ground_truth,pred))


Ground truth , Predictions torch.Size([25, 5691]) torch.Size([25, 5691])
elizabeth shook her head over this letter it convinced her that accident only could discover to mr bingley her sister is being in town
   tensor(5121.7280, device='cuda:0', grad_fn=<ProdBackward0>)
Ground truth , Predictions torch.Size([35, 5691]) torch.Size([35, 5691])
after a few minutes reflection however she continued i do remember his boasting one day at netherfield of the implacability of his resentments of his having an unforgiving temper his disposition must be dreadful
   tensor(4447.1997, device='cuda:0', grad_fn=<ProdBackward0>)
Ground truth , Predictions torch.Size([84, 5691]) torch.Size([84, 5691])
to catherine and lydia neither the letter nor its writer were in any degree interesting it was next to impossible that their cousin should come in a scarlet coat and it was now some weeks since they had received pleasure from the society of a man in any other colour as for their mother mr collins is letter 