In [58]:
import json
import copy
import numpy as np
import random
import math
import torch

In [2]:
import torch.nn as nn
# from torch.utils.data import DataLoader,Dataset,WeightedRandomSampler
from torchtext.legacy.data import Field, BucketIterator
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy.data import TabularDataset

In [272]:
from torch.utils.tensorboard import SummaryWriter
from seqeval.metrics import accuracy_score

In [3]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)

In [4]:
# train_path = 'ptrain.jsonl'
# with open(train_path) as f:
#     train_data = [json.loads(line) for line in f]

In [5]:
# val_path = 'val.jsonl'
# with open(val_path) as f:
#     val_data = [json.loads(line) for line in f]

In [6]:
# train_data_use = copy.deepcopy(train_data)

# val_data_use = copy.deepcopy(val_data)

In [7]:
# def removeUnwanted(data_set):
#     for each in data_set:
#         each['tokens'].pop(0)
#         each['tokens'].pop()

#         each['labels'].pop(0)
#         each['labels'].pop()

In [8]:
# removeUnwanted(train_data_use)
# removeUnwanted(val_data_use)

In [156]:
tokens = Field(lower = True, pad_token = '<pad>', unk_token = '<unk>', batch_first = True, use_vocab = True, fix_length = 49)
labels = Field(pad_token = '<pad>', unk_token = '<unk>', batch_first = True, use_vocab = True, fix_length = 49)

In [157]:
fields = {'tokens':('tokens',tokens), 'labels':('labels', labels)}
train_dataset, val_dataset, test_dataset = TabularDataset.splits(path = "./", format = "json", 
                                                                 train = "ptrain_topk_overfit.jsonl", 
                                                                 validation = "val_topk_overfit.jsonl", 
                                                                 test = "test_topk.jsonl", 
                                                                 fields = fields)

In [158]:
print(len(train_dataset))
# train_dataset = train_dataset[:100]

100


In [159]:
train_dataset[0].tokens

['mr',
 'charnley',
 'said',
 'either',
 'the',
 'council',
 'was',
 'also',
 'considering',
 'up',
 'whether',
 'to',
 'install',
 'speed',
 'the',
 'cameras',
 'along',
 'the',
 'road',
 ',']

In [160]:
tokens.build_vocab(train_dataset, vectors = "glove.6B.50d")
labels.build_vocab(train_dataset, vectors = "glove.6B.50d")

In [161]:
print(f"tokens.vocab.vectors.shape = {tokens.vocab.vectors.shape}")
print(f"edits.vocab.vectors.shape = {labels.vocab.vectors.shape}")
print()

tokens.vocab.vectors.shape = torch.Size([1131, 50])
edits.vocab.vectors.shape = torch.Size([22, 50])



In [127]:
print(f"tokens.vocab.itos[0] = {tokens.vocab.itos[1]}")
print(f"tokens.vocab.stoi['the'] = {tokens.vocab.stoi['body']}")
print(f"edits.vocab.stoi['$keep'] = {labels.vocab.stoi['$KEEP']}")
print(f"edits.vocab.itos[0] = {labels.vocab.itos[2]}")
print(f"edits.vocab.stoi[' '] = {labels.vocab.stoi[' ']}")
# index 0: <unk> , 1:<pad> for both labels and tokens

tokens.vocab.itos[0] = <pad>
tokens.vocab.stoi['the'] = 0
edits.vocab.stoi['$keep'] = 2
edits.vocab.itos[0] = $KEEP
edits.vocab.stoi[' '] = 0


In [128]:
print(f"len(tokens.vocab.freqs.keys()) = {len(tokens.vocab.freqs.keys())}")  #doesnt include <pad> and <unk> token
print(f"len(edits.vocab.freqs.keys()) = {len(labels.vocab.freqs.keys())} \n") #doesnt include <pad> and <unk> token
c1 = labels.vocab.freqs
c1

len(tokens.vocab.freqs.keys()) = 1129
len(edits.vocab.freqs.keys()) = 20 



Counter({'$KEEP': 2122,
         '$DELETE': 70,
         '$REPLACE_.': 16,
         '$REPLACE_of': 5,
         '$APPEND_.': 10,
         '$REPLACE_,': 14,
         '$TRANSFORM_AGREEMENT_SINGULAR': 11,
         '$APPEND_the': 10,
         '$REPLACE_to': 8,
         '$APPEND_to': 4,
         '$APPEND_,': 10,
         '$REPLACE_the': 5,
         '$TRANSFORM_VERB_VBN_VB': 5,
         '$REPLACE_in': 1,
         '$TRANSFORM_VERB_VBZ_VB': 4,
         '$APPEND_of': 2,
         '$APPEND_a': 5,
         '$TRANSFORM_VERB_VBG_VB': 1,
         '$APPEND_and': 2,
         '$TRANSFORM_AGREEMENT_PLURAL': 3})

In [290]:
def getAccuracy(pred, y):

    pred_1 = torch.argmax(pred, dim = 1)
    #(N,49)
    N = pred_1.size(0)
    
    acc = 0.0
    
    for i in range(N):

        c = y[i] == 1   #geting sentence length (find first padding value so we can trime it from there)
        c = c.nonzero()
        try:
            sent_len = c[0].item()
        except:
            sent_len = len(y[i]) - 1
        
        use_y = y[i][:sent_len + 1]
        use_pred = pred_1[i][:sent_len+1]
        
        acc += accuracy_score(use_y, use_pred).item()
        
#         acc += accuracy_score(y[i], pred_1[i]).item()
    
    return acc
        

In [291]:
def train(model, train_dataloader, optimizer, loss_fn, epoch):
    model.train()
    train_loss = 0
    train_accuracy = 0
    for batch, dat in enumerate(train_dataloader):
        X = dat.tokens
        y = dat.labels
        #move to GPU
        X,y = X.to(device), y.to(device)

        
        # Compute prediction error
        pred,_ = model(X)

#         print(pred.size(), " pred shpe")
#         print("y size: ", y.size())
        
        
        loss = loss_fn(pred, y)
        train_loss += loss.item()
        train_accuracy += getAccuracy(pred, y)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        

        #UNCOMMENT
#         if batch%50 == 49:
        if batch%2 == 1:
            print("BATCH: ", batch, " LOSS: ", loss.item())
            writer.add_scalar('training loss batchwise',
                                    train_loss / batch,
                                    epoch * len(train_dataloader) + batch)

    train_loss = train_loss/len(train_dataloader)
    
#     print("ret: ", ret)
    train_accuracy = (100. * train_accuracy) / len(train_dataloader.dataset)
    
    return train_loss, train_accuracy

In [292]:
def evaluate(model, val_dataloader,loss_fn):
    model.eval()
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for dat in val_dataloader:
            X = dat.tokens
            y = dat.labels
            
            #move to GPU
            X,y = X.to(device), y.to(device)
        
            pred,_ = model(X)

            loss = loss_fn(pred, y)
            
            val_loss += loss.item()
            val_accuracy += getAccuracy(pred, y)
            

    avg_val_loss = val_loss/len(val_dataloader)

    val_accuracy = (100.* val_accuracy) / len(val_dataloader.dataset)
    
    return avg_val_loss, val_accuracy

In [424]:
class SimpleRNN(nn.Module):
    def __init__(self, embed_dim,hidden_dim, n_classes, vocab_size, pad_idx, embedd_weights):
        super().__init__()
        
        self.embedding = nn.Embedding.from_pretrained(embedd_weights, padding_idx = pad_idx, freeze = True)
        
        self.rnn = nn.RNN(input_size = embed_dim, hidden_size = hidden_dim, num_layers = 2, bidirectional = True, batch_first=True)
        
        self.linear = nn.Linear(hidden_dim*2, n_classes)  #when using bidirectional rnn
        self.embed_dim = embed_dim
        
    def forward(self,x):
        x = self.embedding(x)
        #(N,L,embed_dim)
#         print("X: ", x.size())
        
        op1, h_n = self.rnn(x)
#         print("op1: ", op1.size(), " h_n: ", h_n.size())
        #op1 (N,L,n_classes)
        
#         op2 = op1.view(-1, self.embed_dim)
#         print("op2: ", op2.size())
        #op2 (N*L, embed_dim)
        
        op1 = self.linear(op1)

        op1 = torch.transpose(op1, 1,2)
        # (N,n_classes, L) -- needed for crossentropy
#         print("op1: ", op1.size())
        
        
        return op1, h_n
        

In [425]:
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [426]:
# train_dataset = train_dataset[:100]
# val_dataset = val_dataset[:50]
len(train_dataset)

100

In [427]:
batch_size = 16
train_dataloader, val_dataloader= BucketIterator.splits((train_dataset, val_dataset), batch_size = batch_size, device = device, 
                                 sort_key=lambda x: len(x.tokens), sort_within_batch = True)

In [428]:
train_dataloader.create_batches()

# Loop through BucketIterator.
print('PyTorchText BuketIterator\n')
for batch in train_dataloader.batches:

    # Let's check batch size.
    print('Batch size: %d\n'% len(batch))
    print('LABEL\tLENGTH\tTEXT'.ljust(10))
  
  # Print each example.
    for example in batch:
        print('%d\t'.ljust(10) % (len(example.tokens)))
#         print("here: ", example.labels)
    print("\n")
    
#     print(batch.tokens)
  
  # Only look at first batch. Reuse this code in training models.
#     break

PyTorchText BuketIterator

Batch size: 4

LABEL	LENGTH	TEXT
44	       
45	       
45	       
49	       


Batch size: 16

LABEL	LENGTH	TEXT
26	       
26	       
26	       
27	       
27	       
28	       
28	       
29	       
29	       
29	       
29	       
30	       
30	       
30	       
30	       
30	       


Batch size: 16

LABEL	LENGTH	TEXT
14	       
15	       
15	       
15	       
15	       
16	       
16	       
17	       
17	       
17	       
17	       
17	       
17	       
18	       
18	       
18	       


Batch size: 16

LABEL	LENGTH	TEXT
21	       
21	       
21	       
23	       
23	       
23	       
23	       
23	       
24	       
24	       
24	       
24	       
24	       
24	       
25	       
25	       


Batch size: 16

LABEL	LENGTH	TEXT
18	       
18	       
18	       
19	       
19	       
19	       
19	       
19	       
19	       
20	       
20	       
21	       
21	       
21	       
21	       
21	       


Batch size: 16

LABEL	LENGTH	TEXT
31	       
3

In [429]:
# batch = next(iter(train_dataloader))
# batch.labels

In [430]:
batch = next(iter(train_dataloader))
for each in batch:
    print(each[1][0])   #[1] - labels , [0] - first example
    print(len(each[1][0]))
    break
    

tensor([ 3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 13,  2,  2,  2,
         2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])
49


In [431]:
pretrained_embeddings = tokens.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([1131, 50])


In [432]:
EMBEDDING_DIM = 50
MX_LEN_SEN = 49   #####CHANGE TO mx_len_for_pad  AFTER CHECKING FOR OVERFITTING
n_classes = len(labels.vocab)
vocab_size = len(tokens.vocab)
pad_idx = tokens.vocab.stoi[tokens.pad_token]
hidden_dim = n_classes

model = SimpleRNN(EMBEDDING_DIM, hidden_dim, n_classes, vocab_size, pad_idx, pretrained_embeddings).to(device)

lr = 0.01
weight_decay = 0.001
amsgrad = False
# class_weights = class_weights
ignore_index = 1
# class_weights_des = 'method 2'


# optimizer = optim.Adam(model.parameters(), lr = 0.01)
optimizer = optim.Adam(model.parameters(), lr = lr , weight_decay = weight_decay, amsgrad = amsgrad)
loss_fn = nn.CrossEntropyLoss(ignore_index = ignore_index)

In [433]:
model.embedding.weight.data.copy_(pretrained_embeddings)  #copying the embedding matrix for embedding layer of model

UNK_IDX = tokens.vocab.stoi[tokens.unk_token]  #this doesnt include pad and unk token so we r initializing that as well
PAD_IDX = tokens.vocab.stoi[tokens.pad_token] 

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)


In [434]:
###################################################################################################
# CHANGE EVERY TIME U CHANGE HYPERPARAMETERS
writer = SummaryWriter('rnn_runs/exp3_trial')

In [435]:
NUM_EPOCHS = 150
best_val_loss = float('inf')


for i in range(NUM_EPOCHS): #chnage rahe to (N, 2*N) if u r running same thing again to incree no. of epochs
#     train_loss, train_accuracy = train(model, initial_train_dataloader, optimizer, loss_fn, i)

    #UNCOMMENT
    train_loss, train_accuracy = train(model, train_dataloader, optimizer, loss_fn, i)
    
#     print("_______________________________________***************_______________________________")
    val_loss, val_accuracy = evaluate(model, val_dataloader,loss_fn)


    print("\n ---------------------------------------------------------------------\n")
    print(" EPOCH ", i)
    print("train_loss: ", train_loss, " train_accuracy: ", train_accuracy)
    
    #UNCOMMENT
    
#     print("train_loss: ", avg_train_loss, " train_accuracy: ", train_accuracy)
    print("val_loss: ", val_loss, " val_accuracy: ", val_accuracy)
    print("\n ---------------------------------------------------------------------\n")
    
    
    writer.add_scalars('LOSS', { 'Train' : train_loss,
                                'Val' : val_loss
                                } , i)
    
    writer.add_scalars('ACCURACY', { 'Train': train_accuracy,
                                    'val': val_accuracy
                                    }, i)
    
    
    #UNCOMMENT
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'RNN_grammer_trial_3.pt')

BATCH:  1  LOSS:  2.2657148838043213
BATCH:  3  LOSS:  1.0584861040115356
BATCH:  5  LOSS:  0.32621344923973083

 ---------------------------------------------------------------------

 EPOCH  0
train_loss:  1.3359597538198744  train_accuracy:  71.51307164132595
val_loss:  0.5732477381825447  val_accuracy:  83.38955837488174

 ---------------------------------------------------------------------

BATCH:  1  LOSS:  0.26775190234184265
BATCH:  3  LOSS:  0.6906896829605103
BATCH:  5  LOSS:  1.0083794593811035

 ---------------------------------------------------------------------

 EPOCH  1
train_loss:  0.5649869740009308  train_accuracy:  86.10221639275551
val_loss:  0.6188370101153851  val_accuracy:  83.38955837488174

 ---------------------------------------------------------------------

BATCH:  1  LOSS:  0.6625056266784668
BATCH:  3  LOSS:  0.3678681552410126
BATCH:  5  LOSS:  0.4778642952442169

 ---------------------------------------------------------------------

 EPOCH  2
train_

In [379]:
len(train_dataloader)

7

In [436]:
model.eval()
crct_class = 0
incrct_class = 0
other_class = 0
for dat in train_dataloader:
    X = dat.tokens
    Y = dat.labels
#     print("here: ",len(X))
    
    for i in range(len(X)):
    #move to GPU
        x,y = X[i].to(device), Y[i].to(device)


        # Compute prediction error

        x = x.unsqueeze(0)
        y = y.unsqueeze(0)

#         print(x.size(),": x")
#         print("y: ", y.size())

        check_pred,_ = model(x)
#         print("check_pred",check_pred.size())
        check_pred = torch.argmax(check_pred, dim = 1)
    #     print(check_pred)

    #     print(y.size())
    #     break

        for i in range(len(y[0])):
            if y[0][i] not in [1,0,2]:
                other_class += 1
                if y[0][i] == check_pred[0][i] :
                    crct_class += 1

                else:
                    incrct_class += 1

print("crct_class", crct_class)
print("incrct_class", incrct_class)
print("other_class", other_class)

crct_class 183
incrct_class 3
other_class 186


In [322]:
train_dataset[0].tokens

['mr',
 'charnley',
 'said',
 'either',
 'the',
 'council',
 'was',
 'also',
 'considering',
 'up',
 'whether',
 'to',
 'install',
 'speed',
 'the',
 'cameras',
 'along',
 'the',
 'road',
 ',']

In [60]:
rnn = nn.RNN(10, 20, 2)
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)
output, hn = rnn(input)

In [61]:
output.size()

torch.Size([5, 3, 20])

In [247]:
a = torch.tensor([0,3,1,2,3,1,4,6,6,6])
c = a == 1
c = c.nonzero()
c[0].item()

2

In [225]:
x = torch.randn(2, 3,5,7)
x
y = torch.transpose(x, 1, 3)
y.size()

torch.Size([2, 7, 5, 3])