<a href="https://colab.research.google.com/github/ashishagg70/ContextBasedQA/blob/master/ContextBasedQAV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget "https://data.deepai.org/squad1.1.zip"

--2021-04-20 06:19:38--  https://data.deepai.org/squad1.1.zip
Resolving data.deepai.org (data.deepai.org)... 138.201.36.183
Connecting to data.deepai.org (data.deepai.org)|138.201.36.183|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9152254 (8.7M) [application/x-zip-compressed]
Saving to: ‘squad1.1.zip’


2021-04-20 06:19:38 (42.5 MB/s) - ‘squad1.1.zip’ saved [9152254/9152254]



In [2]:
!unzip squad1.1.zip

Archive:  squad1.1.zip
  inflating: dev-v1.1.json           
  inflating: train-v1.1.json         


In [3]:
import json
from collections import defaultdict

import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
import tqdm.notebook  # `import tqdm` alone does not load the notebook submodule used by the training loop
from nltk.tokenize import word_tokenize
from torchtext.vocab import GloVe

nltk.download('punkt')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# DataLoader hints for the selected device. The check has to look at `device.type`:
# `device` is a torch.device object, and comparing the object itself with the string
# "cuda" does not match, which left the GPU branch unreachable.
if device.type == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
cuda


In [4]:
# Load both SQuAD splits and keep only the top-level 'data' list of each file.
# The `with` blocks close the file handles as soon as parsing is done, and the explicit
# encoding keeps the read independent of the platform's default locale.
with open('train-v1.1.json', 'r', encoding='utf-8') as file:
    train_data = json.load(file)['data']
with open('dev-v1.1.json', 'r', encoding='utf-8') as file2:
    dev_data = json.load(file2)['data']

In [5]:
# Number of character slots per word: char_encode allocates vectors of this length and
# CharEmbedding reshapes its input to rows of this width.
MAX_WORD_LENGTH = 40
# Hidden size (per direction) of the LSTM layers; together with glove.dim it also sets
# the width of the highway layer.
D = 100

In [None]:
# Pretrained GloVe vectors ('840B' release, 300 dimensions) loaded through torchtext.
# The recorded output below shows the archive being fetched into .vector_cache/ (~2.18G).
glove = GloVe('840B',300)

.vector_cache/glove.840B.300d.zip:  35%|███▍      | 753M/2.18G [02:33<07:04, 3.35MB/s]

In [None]:
# Character vocabulary, populated by char_encode. Characters are numbered from 1, so a
# defaultdict(int) lookup of a character that was never registered yields 0.
char_to_index, index_to_char = defaultdict(int), defaultdict(str)

# Running counters; num_words is fixed by the size of the GloVe vocabulary.
num_contexts = num_questions = num_characters = 0
num_words = len(glove)

# Encoded paragraphs and questions (character-level and word-level views).
char_contexts, char_questions = [], []
word_contexts, word_questions = [], []

# answers[q] holds the (start, end) token spans of question q;
# input_data holds (context index, question index) pairs.
answers, input_data = [], []

In [None]:
# i=1
# for key in glove.itos:
#     word_to_index[key]=i
#     index_to_word[i]=key
#     i+=1

In [None]:
def char_encode(text, is_train=True):
    """Encode every token of `text` as a fixed-length vector of character indices.

    Args:
        text: iterable of tokens (strings).
        is_train: when true, a character that is not yet in `char_to_index` is
            registered with the next free index (indices start at 1). When false,
            unknown characters are encoded as 0 and the vocabulary is left untouched.

    Returns:
        A list with one int64 NumPy array of length MAX_WORD_LENGTH per token.
        Slots past the end of a short token stay 0; tokens longer than
        MAX_WORD_LENGTH are truncated to their first MAX_WORD_LENGTH characters.
    """
    global num_characters

    text_encoding = []
    for word in text:
        # Integer dtype: these values are embedding indices, not measurements.
        encoding = np.zeros(MAX_WORD_LENGTH, dtype=np.int64)
        # Truncate up front instead of relying on an IndexError being swallowed, so
        # characters beyond the limit are never added to the vocabulary.
        for position, char in enumerate(word[:MAX_WORD_LENGTH]):
            # .get() does not insert the key, unlike indexing a defaultdict.
            code = char_to_index.get(char, 0)
            if is_train and code == 0:
                num_characters += 1
                code = num_characters
                char_to_index[char] = code
                index_to_char[code] = char
            encoding[position] = code
        text_encoding.append(encoding)
    return text_encoding

In [None]:
def convert_to_lower(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def perform_word_tokenization(text):
    """Split `text` into a list of tokens by delegating to NLTK's `word_tokenize`."""
    tokens = word_tokenize(text)
    return tokens

def word_encode(text, is_train=True):
    """Map every token of `text` to its index in the GloVe vocabulary.

    Args:
        text: iterable of tokens (strings).
        is_train: accepted so the signature matches char_encode; it has no effect
            here because the word vocabulary is fixed by GloVe.

    Returns:
        A list of ints, one per token. Tokens that are not in `glove.stoi` get
        `num_words`, the index of the extra row that WordEmbedding appends after
        the pretrained vectors.
    """
    unknown_index = num_words
    # A plain dict lookup with a default replaces the former bare `except`, which
    # would also have hidden unrelated errors.
    return [glove.stoi.get(word, unknown_index) for word in text]


In [None]:
def text_preprocess(text, is_train=True):
    """Lower-case, tokenize and encode a raw string.

    Args:
        text: raw context or question string.
        is_train: forwarded to both encoders; char_encode uses it to decide whether
            unseen characters may be added to the character vocabulary.

    Returns:
        (char_encoding, word_encoding) as produced by char_encode and word_encode
        for the token list of `text`.
    """
    tokens = perform_word_tokenization(convert_to_lower(text))
    # Pass is_train to both encoders so they are always called with the same mode.
    word_encoding = word_encode(tokens, is_train)
    char_encoding = char_encode(tokens, is_train)
    return char_encoding, word_encoding

def get_answer_indices(context,start,answer):
    """Turn a character-level answer start into token-level (start, end) indices.

    The context is lower-cased and tokenized only up to and including the first
    character of the answer; the last token of that prefix is taken as the answer's
    first token. The end index is derived from the number of tokens in the
    lower-cased answer text.

    NOTE(review): the prefix and the answer are tokenized separately from the full
    context, so this presumably assumes both tokenize the same way in isolation;
    confirm for answers that start or end inside a token.
    """
    lowered_context = convert_to_lower(context)
    prefix_tokens = perform_word_tokenization(lowered_context[:start + 1])
    answer_tokens = perform_word_tokenization(convert_to_lower(answer))
    first_token = len(prefix_tokens) - 1
    last_token = first_token + len(answer_tokens) - 1
    return first_token, last_token

In [None]:
# Walk article -> paragraph -> question over the training split and fill the
# module-level lists. Each paragraph is encoded once; every question stores the
# positions of its paragraph and of itself so collate_fn can look them up later.
for article in train_data:
    for paragraph in article['paragraphs']:
        context_text = paragraph['context']
        context_chars, context_words = text_preprocess(context_text)
        char_contexts.append(context_chars)
        word_contexts.append(context_words)
        num_contexts += 1
        for qa in paragraph['qas']:
            question_chars, question_words = text_preprocess(qa['question'])
            char_questions.append(question_chars)
            word_questions.append(question_words)
            num_questions += 1
            # The counters were just incremented, so "-1" is the index of the entry
            # appended above.
            input_data.append((num_contexts - 1, num_questions - 1))
            # One (start, end) token span per reference answer of this question.
            answers.append([
                get_answer_indices(context_text, a['answer_start'], a['text'])
                for a in qa['answers']
            ])

In [None]:
# +1 because character indices start at 1; index 0 is the padding slot
# (CharEmbedding is built with padding_idx=0).
char_vocab_size = len(char_to_index)+1
# +1 for the extra index `num_words` that word_encode assigns to words missing from GloVe.
word_vocab_size = num_words+1

In [None]:
class CharEmbedding(nn.Module):
    """Character-level word encoder: embed characters, convolve, ReLU, max-pool.

    Input is an index tensor whose last dimension has MAX_WORD_LENGTH entries per
    word; output has one `word_embedding_size`-dimensional vector per word.
    """

    def __init__(self, vocab_size, embedding_dim=8, cnn_kernel_size=5, word_embedding_size=100):
        super().__init__()
        # Index 0 is the padding character (padding_idx=0).
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.cnn = nn.Conv1d(embedding_dim, word_embedding_size, cnn_kernel_size)
        # The convolution leaves MAX_WORD_LENGTH - kernel + 1 positions; the pooling
        # window spans all of them, reducing each channel to a single value.
        conv_positions = MAX_WORD_LENGTH - cnn_kernel_size + 1
        self.maxpool = nn.MaxPool1d(kernel_size=conv_positions)

    def forward(self, x):
        n_batch = x.shape[0]
        # Flatten to one row per word so the CNN treats every word independently.
        chars = x.view(-1, MAX_WORD_LENGTH)
        # Conv1d expects channels before positions, hence the transpose.
        features = self.embedding(chars).transpose(1, 2)
        features = F.relu(self.cnn(features))
        pooled = self.maxpool(features)
        # Restore the batch dimension; pooled.shape[1] is the number of CNN channels.
        return pooled.view(n_batch, -1, pooled.shape[1])

In [None]:
class WordEmbedding(nn.Module):
    """Lookup table built from the pretrained GloVe vectors plus one all-zero row.

    The extra last row is the one selected by the out-of-vocabulary index
    (`num_words`) that word_encode produces.
    """

    def __init__(self):
        super().__init__()
        unknown_row = torch.zeros(1, glove.dim)
        table = torch.cat((glove.vectors, unknown_row), dim=0)
        self.embedding = nn.Embedding.from_pretrained(table)

    def forward(self, x):
        return self.embedding(x)

In [None]:
class HighwayNetworkLayer(nn.Module):
    """Gated mix of the input and a two-layer ReLU transform of it.

    Operates on vectors of width glove.dim + D (word embedding concatenated with
    the character-level embedding) and returns a tensor of the same shape.
    """

    def __init__(self):
        super().__init__()
        width = glove.dim + D
        self.transform = nn.Sequential(
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
        )
        self.gate = nn.Sequential(
            nn.Linear(width, width),
            nn.Sigmoid(),
        )

    def forward(self, x):
        transformed = self.transform(x)
        # Per-feature gate in (0, 1): 1 keeps the transform, 0 passes the input through.
        gate = self.gate(x)
        return gate*transformed + (1-gate)*x

In [None]:
class ContextualEmbedding(nn.Module):
    """Single-layer bidirectional LSTM over the highway outputs.

    Takes batch-first sequences of width glove.dim + D and returns the LSTM's
    per-position outputs (both directions concatenated, width 2*D).
    """

    def __init__(self):
        super().__init__()
        self.bilstm = nn.LSTM(
            input_size=glove.dim + D,
            hidden_size=D,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )

    def forward(self, x):
        # Only the per-position outputs are used; the final (h, c) state is dropped.
        outputs, _final_state = self.bilstm(x)
        return outputs

In [None]:
class AttentionFlowLayer(nn.Module):
    """Bidirectional attention between context (H) and query (U) encodings.

    A single linear unit scores every (context position, query position) pair from
    [h; u; h*u]. The scores drive context-to-query and query-to-context attention,
    and the result G concatenates [H; U~; H*U~; H*H~] along the feature axis.
    """

    def __init__(self):
        super().__init__()
        self.alpha = nn.Linear(6*D,1)

    def forward(self, H, U):
        # H : contextual embedding of context, U : contextual embedding of query
        T, J = H.shape[1], U.shape[1]

        # Lay out all T*J pairs along dim 1: each context vector is repeated J times
        # in a row while the query sequence is tiled T times.
        H_pairs = torch.repeat_interleave(H, J, dim=1)
        U_pairs = U.repeat(1, T, 1)
        assert(H_pairs.shape==U_pairs.shape)

        pair_features = torch.cat((H_pairs, U_pairs, H_pairs*U_pairs), dim=-1)
        S = self.alpha(pair_features).view(-1, T, J)

        # Context-to-query: for each context position, a weighted sum of query vectors.
        U_tilde = torch.matmul(F.softmax(S, dim=-1), U)

        # Query-to-context: weight context positions by their best score over the
        # query, then repeat the single attended vector for all T positions.
        best_scores, _ = torch.max(S, dim=-1)
        Q2C_att = F.softmax(best_scores, dim=-1).unsqueeze(1)
        H_tilde = torch.matmul(Q2C_att, H).repeat(1, T, 1)

        return torch.cat((H, U_tilde, H*U_tilde, H*H_tilde), dim=-1)


In [None]:
class ModellingLayer(nn.Module):
    """Two-layer bidirectional LSTM over the attention output G (width 8*D -> 2*D)."""

    def __init__(self):
        super().__init__()
        self.bilstm = nn.LSTM(
            input_size=8*D,
            hidden_size=D,
            num_layers=2,
            bidirectional=True,
            batch_first=True,
        )

    def forward(self, G):
        # Keep the per-position outputs and drop the final (h, c) state.
        outputs, _final_state = self.bilstm(G)
        return outputs

In [None]:
class OutputLayer(nn.Module):
    """Produce log-probabilities over context positions for answer start and end.

    Start scores come from a linear unit over [G; M]. End scores use the same
    recipe after M has been passed through one more bidirectional LSTM. Both are
    normalised with LogSoftmax over the last (position) dimension.
    """

    def __init__(self):
        super().__init__()
        self.dense1 = nn.Linear(10*D, 1)
        self.dense2 = nn.Linear(10*D, 1)
        self.bilstm = nn.LSTM(2*D, D, 1, bidirectional=True, batch_first=True)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, G, M):
        # squeeze(-1) drops the size-1 output of the linear unit, leaving one score
        # per context position.
        start_scores = self.dense1(torch.cat((G, M), dim=-1)).squeeze(-1)
        start = self.softmax(start_scores)

        M2, _final_state = self.bilstm(M)
        end_scores = self.dense2(torch.cat((G, M2), dim=-1)).squeeze(-1)
        end = self.softmax(end_scores)

        return start, end

In [None]:
class BiDAF(nn.Module):
    """End-to-end model: embeddings -> highway -> contextual LSTM -> attention ->
    modelling LSTM -> start/end output layer.

    The same embedding, highway and contextual layers are shared between the
    context and the query.
    """

    def __init__(self):
        super().__init__()
        # The character-level width is tied to D explicitly: the highway and
        # contextual layers are sized glove.dim + D, so relying on CharEmbedding's
        # default of 100 would break as soon as D is changed.
        self.char_emb_layer=CharEmbedding(char_vocab_size,embedding_dim=8,word_embedding_size=D)
        self.word_emb_layer=WordEmbedding()
        self.highway = HighwayNetworkLayer()
        self.cont_emb_layer=ContextualEmbedding()
        self.att_layer = AttentionFlowLayer()
        self.modelling_layer = ModellingLayer()
        self.output_layer = OutputLayer()

    def _embed(self, chars, words):
        """Run one sequence (context or query) through the shared embedding stack."""
        char_emb = self.char_emb_layer(chars)
        word_emb = self.word_emb_layer(words)
        # Character features first, then word vectors, along the feature axis.
        combined = torch.cat((char_emb, word_emb), dim=-1)
        return self.cont_emb_layer(self.highway(combined))

    def forward(self,context_char,context_word,query_char,query_word):
        """Return the (start, end) outputs of OutputLayer for a batch.

        Args:
            context_char, query_char: character index tensors for CharEmbedding.
            context_word, query_word: word index tensors for WordEmbedding.
        """
        context_cont_emb = self._embed(context_char, context_word)
        query_cont_emb = self._embed(query_char, query_word)

        g = self.att_layer(context_cont_emb,query_cont_emb)
        m = self.modelling_layer(g)
        return self.output_layer(g,m)

In [None]:
# char_batch_context = [char_contexts[0],char_contexts[0]]
# char_batch_query = [char_questions[0],char_questions[1]]
# word_batch_context=[word_contexts[0],word_contexts[0]]
# word_batch_query = [word_questions[0],word_questions[1]]

In [None]:
# char_batch_context_t = torch.LongTensor(nn.utils.rnn.pad_sequence([torch.LongTensor(sent) for sent in char_batch_context],batch_first=True))
# char_batch_query_t = torch.LongTensor(nn.utils.rnn.pad_sequence([torch.LongTensor(sent) for sent in char_batch_query],batch_first=True))
# word_batch_context_t = torch.LongTensor(nn.utils.rnn.pad_sequence([torch.LongTensor(sent) for sent in word_batch_context],batch_first=True))
# word_batch_query_t = torch.LongTensor(nn.utils.rnn.pad_sequence([torch.LongTensor(sent) for sent in word_batch_query],batch_first=True))

In [None]:
def collate_fn(idx):
    """Build one padded training batch from a list of indices into `input_data`.

    Args:
        idx: list of sample indices as delivered by the DataLoader.

    Returns:
        (context_char, context_word, query_char, query_word, ans_start, ans_end, idx):
        the first four are int64 tensors padded with 0 to the longest sequence in
        the batch (batch first), ans_start / ans_end are int64 tensors holding the
        first reference answer span of every question, and idx is returned unchanged.

    NOTE(review): word sequences are padded with 0, which is also a regular GloVe
    index; nothing visible here masks padded positions. Confirm this is intended.
    """
    context_char, context_word, query_char, query_word = [], [], [], []
    ans_start, ans_end = [], []
    for sample_id in idx:
        c, q = input_data[sample_id]
        # Stack the per-word arrays into a single 2-D array before handing them to
        # torch; this avoids the slow list-of-ndarrays tensor construction path.
        context_char.append(torch.as_tensor(np.asarray(char_contexts[c]), dtype=torch.long))
        context_word.append(torch.as_tensor(word_contexts[c], dtype=torch.long))
        query_char.append(torch.as_tensor(np.asarray(char_questions[q]), dtype=torch.long))
        query_word.append(torch.as_tensor(word_questions[q], dtype=torch.long))
        # Only the first reference answer is used as the training target.
        start, end = answers[q][0]
        ans_start.append(start)
        ans_end.append(end)

    # pad_sequence already returns int64 tensors here, so no re-wrapping is needed.
    context_char = nn.utils.rnn.pad_sequence(context_char, batch_first=True)
    context_word = nn.utils.rnn.pad_sequence(context_word, batch_first=True)
    query_char = nn.utils.rnn.pad_sequence(query_char, batch_first=True)
    query_word = nn.utils.rnn.pad_sequence(query_word, batch_first=True)
    ans_start = torch.as_tensor(ans_start, dtype=torch.long)
    ans_end = torch.as_tensor(ans_end, dtype=torch.long)

    return context_char,context_word,query_char,query_word,ans_start,ans_end,idx

In [None]:
# Number of (context, question) pairs per mini-batch handed to the DataLoader.
batch_size = 32
# num_workers = 1

In [None]:
# The "dataset" is simply the range of positions in input_data: the loader shuffles
# and batches those integers, and collate_fn turns each batch of positions into
# padded tensors.
train_loader = torch.utils.data.DataLoader(
    dataset=range(len(input_data)),
    shuffle=True,
    batch_size=batch_size,
    collate_fn=collate_fn,
    # num_workers and pin_memory are deliberately left at the DataLoader defaults:
    # num_workers=num_workers,
    # pin_memory=pin_memory
)

In [None]:
# Build the full model and move its parameters to the device selected above.
bidaf = BiDAF().to(device)

In [None]:
# Number of full passes over train_loader.
epochs = 12

# Learning rate passed to the Adadelta optimizer below.
learning_rate = 0.5

# The training loop prints the losses every `print_every` batches.
print_every = 50

# Optimizes every parameter returned by bidaf.parameters().
optimizer = torch.optim.Adadelta(bidaf.parameters(),lr=learning_rate)

# Negative log-likelihood; matches the LogSoftmax outputs produced by OutputLayer.
criterion = nn.NLLLoss()


In [None]:
# Switch the model to training mode before optimizing.
bidaf.train()

for epoch in range(epochs):
    # tqdm iterates the DataLoader directly; wrapping it in iter() first is redundant.
    # Note that `tqdm.notebook` has to be imported explicitly in the imports cell,
    # because a bare `import tqdm` does not load that submodule.
    for batch_idx, _data in enumerate(tqdm.notebook.tqdm(train_loader)):
        context_char,context_word,query_char,query_word,ans_start,ans_end,idx = _data
        # Move every tensor of the batch to the model's device (idx is not moved).
        context_char = context_char.to(device)
        context_word = context_word.to(device)
        query_char = query_char.to(device)
        query_word = query_word.to(device)
        ans_start = ans_start.to(device)
        ans_end = ans_end.to(device)

        optimizer.zero_grad()
        start,end = bidaf(context_char,context_word,query_char,query_word)
        # One NLL term for the start position and one for the end position.
        loss1 = criterion(start,ans_start)
        loss2 = criterion(end,ans_end)
        loss = loss1+loss2
        loss.backward()
        optimizer.step()

        # .item() yields plain Python numbers, so logging needs no no_grad() context.
        if batch_idx % print_every == 0:
            print("epoch = {},batch = {},loss={},loss_1={},loss_2={}".format(epoch,batch_idx,loss.item(),loss1.item(),loss2.item()))
        