In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json
/kaggle/input/stanford-question-answering-dataset/train-v1.1.json


In [2]:
import logging
import os

import torch
import torch.nn as nn
import torch.nn.functional as F


from transformers.modeling_bert import BertLayerNorm, BertPreTrainedModel, BertModel
from transformers.activations import get_activation


class MLPWithLayerNorm(nn.Module):  ##diff
    def __init__(self, config, input_size):
        super(MLPWithLayerNorm, self).__init__()
        self.config = config
        self.linear1 = nn.Linear(input_size, config.hidden_size)
        self.non_lin1 = get_activation(self.config.hidden_act)
        self.layer_norm1 = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.linear2 = nn.Linear(config.hidden_size, config.hidden_size)
        self.non_lin2 = get_activation(self.config.hidden_act)
        self.layer_norm2 = BertLayerNorm(config.hidden_size, eps=1e-12)

    def forward(self, hidden):
        return self.layer_norm2(self.non_lin2(self.linear2(self.layer_norm1(self.non_lin1(self.linear1(hidden))))))


class QG_SimpleBertModel(BertPreTrainedModel):
    ''' model for QG with bert as encoder and a simple decoder of MLP layers
    '''
    def __init__(self,config,bert_type_or_path, vocab_size):
        super().__init__(config)
        self.config= config
        self.vocab_size= vocab_size
        self.main_encoder= BertModel.from_pretrained(bert_type_or_path)  # out: last_HS, pooled_out, all_HS, attention(opt)
        self.mlp_input_size=  config.hidden_size  # CR 
        self.mlp = MLPWithLayerNorm(config, self.mlp_input_size) #ad

#         if config.embedding_size != config.hidden_size:
#             self.embeddings_project = nn.Linear(config.hidden_size, config.embedding_size)   # when embedding size and hidden size are not same
        
        self.decoder = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)

        self.decoder.weight= self.main_encoder.get_input_embeddings().weight

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        ques= None
        ):
        last_hs, pooled= self.main_encoder(input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds)

        hidden_states= self.mlp(last_hs)

        
        
        logits= self.decoder(hidden_states)
        assert logits.size(-1)== self.vocab_size 
        loss = None
        if ques!= None:
            pre_labels= logits.view(-1, self.vocab_size)

            ques_labels= ques.view(-1)

            loss= F.cross_entropy(
                pre_labels,
                ques_labels,
                size_average=False,
                ignore_index= -1, #label tp be ignored
                reduce=True #
            )

        return loss, logits

In [3]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
from transformers import BertConfig
import json
import spacy


def get_tokenizer(model_type_path):
    tok = BertTokenizer.from_pretrained(model_type_path, do_lower_case=True)
    return tok

def get_mask_ids(tokens, max_seq_length):
    """attention Mask id for padding 1 for original 0 for padded"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def get_segment_ids(tokens, max_seq_length):
    """Segments id : 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
                current_segment_id = 1
    assert current_segment_id ==1
    return segments + [0] * (max_seq_length - len(tokens))

def get_token_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids



def process_input(text, ans, ques, tokenizer, max_seq_len):
    text_token = tokenizer.tokenize(text)
    ans_token= tokenizer.tokenize(ans)
    ques_token= tokenizer.tokenize(ques)

    if len(text_token) > max_seq_len-3 -len(ans_token):
        text_token = text_token[:max_seq_len- 3-len(ans_token)]

    return text_token, ans_token, ques_token
    
def pad_ques(ques, max_seq_len, padding =0):
    if len(ques)> max_seq_len:
        raise IndexError("len of ques {} greater than max_seq_len{}".format(len(ques), max_seq_len))

    req_len= max_seq_len- len(ques)
    ques += [padding]* req_len

    return ques



def convert_to_input(text, ans, ques, tokenizer, max_seq_len):
    text_token, ans_token, ques_token= process_input(text,ans,ques, tokenizer, max_seq_len)

    allToken= ["[CLS]"] + text_token  + ["[SEP]"] + ans_token + ["[SEP]"]
    ques_token= ["[CLS]"] + ques_token  + ["[SEP]"]
    ids= get_token_ids(allToken, tokenizer, max_seq_len)
    mask_ids = get_mask_ids(allToken, max_seq_len)
    segment_ids = get_segment_ids(allToken, max_seq_len)

    que_ids= get_token_ids(ques_token, tokenizer, len(ques_token))

    que_ids= pad_ques(que_ids, max_seq_len) #CR padding
    assert len(que_ids)== max_seq_len
    return ids, mask_ids, segment_ids, que_ids


def generate_context_list(text):
    ''' Extract chunks with spacy from text paragraph
    '''
    nlp= spacy.load('en_core_web_sm') #this will download spacy english core model
    nlpDoc= nlp(text)
    lst= []
    for chunk in nlpDoc.noun_chunks:
        lst.append(chunk)

    return lst







class QGDataset(Dataset):
    def __init__(self, args):
        super().__init__()
        self.args= args
        self.examples= []
        self.tokenizer= get_tokenizer(args.bert_model)
        self.max_seq_len= self.args.max_seq_len

        if args.squad_path== None and args.inferenceMode == False:
            raise ValueError("invaild path to squad 1.0 data or wrong inference mode")

        if args.squad_path != None:
            with open(args.squad_path) as f:
                json_data= json.load(f)
                json_data = json_data['data']

            for data in json_data:
                for para in data['paragraphs']:
                    con = para['context']
                    qas= para['qas']
                    for xs in qas:
                        cur_ans= xs['answers'][0]['text']
                        cur_ans_offset= xs['answers'][0]['answer_start']
                        que= xs['question']
                        ex= {
                            'text': con,
                            'ans': cur_ans,
                            'ans_offset': cur_ans_offset,
                            'ques': que
                        }
                        self.examples.append(ex)
            del json_data
        
        else:
            if args.inferenceFile == None:
                raise ValueError("wrong file for inference")
            
            with open(args.inferenceFile) as f:
                json_data= json.load(f)
            
            for item in json_data:
                paragraph= None
                contextList= None
                if 'paragraph' in item.keys():
                    paragraph = item['paragraph']
                
                else:
                    raise KeyError("no text para graph is found. worong format of inference file")
                
                if 'context_list' in  item.keys():
                    contextList= item['context_list']
                
                else:
                    contextList = generate_context_list(paragraph)
                
                for context in contextList:
                    cur_ex= {
                        'text': paragraph,
                        'ans': context,
                        'ques': "a dummy ques to avoid None error"
                    }

                    self.examples.append(cur_ex)
                

        if args.occu:
            self.examples= self.examples[:args.occu]


    def __len__(self):
        return len(self.examples)

    def __getitem__(self,idx):
        cur_ex= self.examples[idx]
        ids, mask_id, seg_id, ques= convert_to_input(cur_ex['text'], cur_ex['ans'], cur_ex['ques'], self.tokenizer, self.max_seq_len)

        exm= {
            'ids': torch.tensor(ids, dtype= torch.long),
            'mask_ids': torch.tensor(mask_id, dtype= torch.long),
            'segment_ids': torch.tensor(seg_id, dtype= torch.long),
            'ques': torch.tensor(ques, dtype= torch.long)
        }
#         print(exm)
        return exm
    


In [4]:

from transformers import BertConfig 

bert_model= 'bert-base-uncased'
max_seq_len =256

class trainDataArgs:
    bert_model= bert_model
    max_seq_len = max_seq_len
    squad_path= '../input/stanford-question-answering-dataset/train-v1.1.json' #CR
    inferenceMode= False  # T/F for inference
    inferenceFile= None  # if nferenceMode = True: a path to json file consisting of list of dict with keys 'paragraph' as mandatory key and 'context_list' as option  [{'paragraph':' a string', 'context_list': ['list' , 'of', 'context', 'i.e answers'] }, {}, . . .]

    occu = 50000

class validDataArgs:
    bert_model= bert_model
    max_seq_len =max_seq_len
    squad_path= '../input/stanford-question-answering-dataset/dev-v1.1.json' #CR
    inferenceMode= False  # T/F for inference
    inferenceFile= None  # if nferenceMode = True: a path to json file consisting of list of dict with keys 'paragraph' as mandatory key and 'context_list' as option  [{'paragraph':' a string', 'context_list': ['list' , 'of', 'context', 'i.e answers'] }, {}, . . .]
    occu= 5000

class inferenceArgs:
    bert_model= bert_model
    max_seq_len =max_seq_len
    squad_path= '/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json' #CR
    inferenceMode= False  # T/F for inference
    inferenceFile= None  # CR
    # if nferenceMode = True: a path to json file consisting of list of dict with keys 'paragraph' as mandatory key and 'context_list' as option  [{'paragraph':' a string', 'context_list': ['list' , 'of', 'context', 'i.e answers'] }, {}, . . .]
    occu= 32
    infereceModelPath= '/kaggle/input/qg-bert-en-dec/model_2'


class trainingConfig:
    bert_model = bert_model
    bert_config = BertConfig.from_pretrained(bert_model)
    max_seq_len= max_seq_len
    train_batch_size= 16
    valid_batch_size= 16
    ignore_label =0
    num_workers=0
    epochs =5
    learningRate = 5e-5
    save_dir = '/kaggle/working/'

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




In [5]:
import os
import torch
from tqdm.notebook import tqdm
import pandas as pd
import torch.nn as nn
import numpy as np
import time, random

from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import AdamW, BertConfig 
from transformers import get_linear_schedule_with_warmup
from transformers import EncoderDecoderModel

# from process import QGDataset
# from model import QG_SimpleBertModel
# from arguments import trainDataArgs, trainingConfig, validDataArgs
from nltk.translate.bleu_score import sentence_bleu

class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def get_blue_score(orig, pre):
#     print("orignal blue ->>>>:" , orig,"\n Predicted blue->>>>", pre)
    orig_tok= orig.split()
    pre_tok= pre.split()[:len(orig_tok)]
#     print("orignal blue ->>>>:" , orig_tok,"\n Predicted blue->>>>", pre_tok)
    ref= [orig_tok]
    score= sentence_bleu(ref, pre_tok)
    return score



def predict(evalData, batch_size, device, model,  ignore_label=-1, worker=0):
    model.eval()
    
    tokenizer = evalData.tokenizer
    vocab_size = tokenizer.vocab_size
    evalDataLoader = DataLoader(evalData,batch_size=batch_size, num_workers= worker)
    
    tdl = tqdm(evalDataLoader, total= len(evalDataLoader))
    total_acc= AverageMeter()
    predictions= []
    for idx,batch in enumerate(tdl):
    
        ids = batch['ids'].to(device, dtype=torch.long)
        mask_ids = batch['mask_ids'].to(device, dtype=torch.long)
        seg_ids = batch['segment_ids'].to(device, dtype=torch.long)
        ques = batch['ques'].to(device, dtype=torch.long)
        with torch.no_grad():
            loss, logits= model(
                input_ids= ids,
                attention_mask= mask_ids,
                decoder_input_ids= ids,
                # decoder_inputs_embeds= model.get_input_embeddings().weight,
                token_type_ids= seg_ids,
                masked_lm_labels = ques
            )[:2]

        logits= logits.view(-1, vocab_size)
        # orig_ques= ques.view(-1)
        logits = logits.detach().cpu().numpy()
        orig_ques = ques.detach().cpu().numpy()
        pred_ques = np.argmax(logits, axis=1).flatten().squeeze()
        pred_ques = np.reshape(pred_ques,(batch_size,-1))
        cur_pre= []
        for i in range(orig_ques.shape[0]):
            cur_orignal_ques= tokenizer.decode(list(orig_ques[i]), skip_special_tokens=True)
            cur_pred_ques= list(pred_ques[i])
            try:
                cur_len= cur_pred_ques.index(102) # find first sep token
            except ValueError:
                cur_len= len(cur_pred_ques)-1

            cur_pred_ques = cur_pred_ques[:cur_len+1]
            cur_pred_ques= tokenizer.decode(cur_pred_ques, skip_special_tokens=True)
            # print("orignal ->>>>:" , cur_orignal_ques,"\n Predicted->>>>", cur_pred_ques)
            cur_acc= get_blue_score(cur_orignal_ques, cur_pred_ques)
            cur_pre.append(cur_pred_ques)
            total_acc.update(cur_acc)

        predictions += cur_pre
        tdl.set_postfix(status= 'valid',accu= total_acc.avg)

    return total_acc.avg, predictions


def train(trainData, validData, device, train_config):
    
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    tokenizer= trainData.tokenizer
    vocab_size= tokenizer.vocab_size
    model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
    batch_size= train_config.train_batch_size
    trainDataloader= DataLoader(trainData, batch_size= train_config.train_batch_size, num_workers= train_config.num_workers)

    param_optimizer = list(model.named_parameters())  #get parameter of models
    no_decay = [
        "bias",
        "LayerNorm.bias",
        "LayerNorm.weight"
    ] ##doubt layers to be not decayed #issue
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer if not any(
                    nd in n for nd in no_decay
                )
            ], 
         'weight_decay': 0.001
        },
        {
            'params': [
                p for n, p in param_optimizer if any(
                    nd in n for nd in no_decay
                )
            ], 
            'weight_decay': 0.0
        },
    ]    
    optimizer =AdamW(
        optimizer_parameters, 
        lr= train_config.learningRate
    )
    total_len= trainData.__len__()

    num_steps= total_len/train_config.train_batch_size*train_config.epochs
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_steps
    )

    
    model.to(device)
    print("total epochs: ", train_config.epochs)
    best_valid_acc=0
    for epoch_i in range(0, train_config.epochs):
#         print("")
#         print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, train_config.epochs))
#         print('Training...')
        t0 = time.time()
        total_loss = AverageMeter()
        total_acc = AverageMeter()

        tdl = tqdm(trainDataloader, total=len(trainDataloader))

        model.train()

        
        for idx,batch in enumerate(tdl):

            ids= batch['ids'].to(device, dtype= torch.long)
            mask_ids= batch['mask_ids'].to(device, dtype= torch.long)
            seg_ids= batch['segment_ids'].to(device, dtype= torch.long)
            ques= batch['ques'].to(device, dtype= torch.long)
            
            model.zero_grad()

            loss, logits= model(
                input_ids= ids,
                attention_mask= mask_ids,
                decoder_input_ids= ids,
#                 decoder_inputs_embeds= model.get_input_embeddings().weight,
                token_type_ids= seg_ids,
                masked_lm_labels = ques
            )[:2]


            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()


            logits= logits.view(-1, vocab_size)
            # orig_ques= ques.view(-1)
            logits = logits.detach().cpu().numpy()
            orig_ques = ques.detach().cpu().numpy()
            pred_ques = np.argmax(logits, axis=1).flatten().squeeze()
            pred_ques = np.reshape(pred_ques,(batch_size,-1))
#             print("shape of orig and pred batch: ",orig_ques.shape, pred_ques.shape)
            for i in range(orig_ques.shape[0]):
                cur_orignal_ques= tokenizer.decode(list(orig_ques[i]), skip_special_tokens=True)
                cur_pred_ques= tokenizer.decode(list(pred_ques[i]), skip_special_tokens=True)
                cur_acc= get_blue_score(cur_orignal_ques, cur_pred_ques)
                total_acc.update(cur_acc)

            tdl.set_postfix(accu= total_acc.avg)

            total_loss.update(loss.item(), mask_ids.size(0))
            
            

            tdl.set_postfix(status= 'train', epochs= epoch_i,loss= total_loss.avg, accu= total_acc.avg)

        if validData:
            valid_accu, prediciton= predict(validData, train_config.valid_batch_size, device, model, ignore_label= train_config.ignore_label, worker= train_config.num_workers)
        if epoch_i in [1,3,4]:
            torch.save(model, train_config.save_dir+"model_{}".format(epoch_i)) #save whole model after epoch






# if __name__ == "__main__":
def main():
    if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
        device = torch.device("cuda")

        print('There are %d GPU(s) available.' % torch.cuda.device_count())

        print('We will use the GPU:', torch.cuda.get_device_name(0))

    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    trainData= QGDataset(trainDataArgs)
    validData= QGDataset(validDataArgs)
    train(trainData, validData, device, trainingConfig)

In [6]:
import gc
torch.cuda.empty_cache()
gc.collect()

20

In [7]:
main()

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


total epochs:  5


HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().





HBox(children=(FloatProgress(value=0.0, max=313.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3125.0), HTML(value='')))