In [1]:
!apt install libpoppler-cpp-dev

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libpoppler-cpp-dev is already the newest version (0.62.0-2ubuntu2.12).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [2]:
!pip install pdftotext
!pip install transformers
!python -m nltk.downloader punkt
!pip install sentencepiece

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import pdftotext 
import re

import itertools
from nltk import sent_tokenize
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from transformers import AutoModelForQuestionAnswering

import random

In [4]:
import pdftotext 
import re

class TextProcessingAndContextCreation :

    @classmethod
    def get_context_chunks(cls, file_name) :
        
        # Get the list of strings where each string is the text content present
        # each page of the pdf document
        raw_text = cls._get_raw_text_from_pdf(file_name)

        # Remove the index part of the doc; for this, page number of the first
        # main content page is required
        first_page_number = cls._get_the_first_page_number(raw_text)
        if first_page_number == -1 :
            print("Error encountered")
            return []

        # Get only the content pages
        raw_text = raw_text[first_page_number:]

        # The first content page has the title of the Act. Remove it
        cls._remove_header_of_first_page(raw_text)

        # Delete the footer notes, that are generally the references from other
        # acts or some points giving additional info about those acts
        cls._delete_footer_notes(raw_text)

        # Words like "CHAPTER" are of no use for context. Remove them
        cls._remove_captial_words(raw_text)

        # Join the content of all the pages to get a single string
        text = cls._join_all_pages(raw_text)

        # Split the text into context chunks at the bold points present in the Act.
        text = cls._split_the_text_on_bold_points(text)

        # Remove unwanted newline, spaces, unrecognized quotes, and other symbols.
        cls._filter_and_get_plain_text(text)

        # For every chunk we extracted before, if the size of the chunk goes beyond
        # the limit, split that chunk as well with some defined procedure. Check
        # the function for more details
        contexts =  cls._get_the_contexts(text)
        # return the contexts
        return contexts

    @classmethod
    def _get_raw_text_from_pdf(cls, file_name) :
        """Read the PDF at `file_name` and return its pages as a list of strings."""
        # The file is opened in binary mode and handed to pdftotext
        with open(file_name, "rb") as pdf_file :
            document = pdftotext.PDF(pdf_file)

        # Iterating the PDF object yields one item per page; each is stringified
        return [str(page) for page in document]

    @classmethod
    def _get_the_first_page_number(cls, raw_text) :

        # Use the string 'ACT,' present in the title of the Act to mark the title
        # present in the content pages and remove it.
        # If the word 'ACT,' is not found, use the year (DDDD) to mark the title
        index = -1
        try :
            # Get the index of the string 'ACT,' present in the title of the first
            # index page
            index = raw_text[0].split().index('ACT,')
        except :
            # If 'ACT,' is not found, use year
            year_regex = r"[0-9]{4}"
            for i in range(len(raw_text[0].split())) :
                if (re.match(year_regex, raw_text[0].split()[i])) is not None :
                    index = i
                    break

        # If both 'ACT,' and year is not found, report error
        if index == -1 :
            return -1

        # Get the title of the Act using the index calculated.
        # A typical title is like 'EDUCATION ACT, 2002'
        match_text_page_zero = " ".join(raw_text[0].split()[:index + 1])
        # Match the title with the later pages and when it is found, that one is
        # the first content page.
        for i in range(1, len(raw_text)) :
            page_match_text = " ".join(raw_text[i].split()[:index + 1])
            if match_text_page_zero == page_match_text :
                return i

        # Failed
        return -1

    @classmethod
    def _remove_header_of_first_page(cls, raw_text) :

        # Remove the header present on the first content page.
        # The title/header ends with a date enclosed in square brackets
        # Use regex to match it and perform its deletion
        temp = re.match(r"(.|\n)*?\[.*\]", raw_text[0])
        raw_text[0] = raw_text[0][temp.end():]

    @classmethod
    def _delete_footer_notes(cls, raw_text) :

        # Remove the footer of the form "1. ...\n2. ..."
        footer_regex = r"1\..*"
        # For every page
        for i in range(len(raw_text)) :
            # Split on newline
            temp = raw_text[i].split('\n')[:-2]
            # Match the mentioned regex; j is the starting index of the footer
            for j in range(len(temp) - 1, -1, -1) :
                if re.match(footer_regex, temp[j]) is not None :
                    break
            # Discard the footer
            if j != 0 :
                temp = temp[:j]
            # Rejoin using the newline character
            raw_text[i] = "\n".join(temp)

    @classmethod
    def _remove_captial_words(cls, raw_text) :

        # Split each page using \n (newline) to get each line.
        # If the string is uppercase, remove it
        # Rejoin the text
        for i in range(len(raw_text)) :
            raw_text[i] = raw_text[i].split('\n')
            raw_text[i] = list(filter(lambda x: not (x.isupper()), raw_text[i]))
            raw_text[i] = "\n".join(raw_text[i])

    @classmethod
    def _join_all_pages(cls, raw_text) :

        # Join all the pages into one long string
        concatenated_text = []
        for i in raw_text :
            concatenated_text.append(i)
        concatenated_text = "\n".join(concatenated_text)
        return concatenated_text

    @classmethod
    def _split_the_text_on_bold_points(cls, text) :

        # The starting of each point has the form "1. <text> ...
        # Use it for splitting the continuous text into pointwise text
        split_regex = r"[0-9]+[A-Z]*\.[^\n]"
        split_text = re.split(split_regex, text)
        return split_text

    @classmethod
    def _filter_and_get_plain_text(cls, text) :

        # Get rid off unwanted symbols
        for i in range(len(text)) :
            text[i] = re.sub(r"(“|”|’)", "\"", text[i])
            text[i] = re.sub(r"\n", " ", text[i])
            text[i] = re.sub(r" +", " ", text[i])
            text[i] = re.sub(r"––", "- ", text[i])
            text[i] = re.sub(r"—", "- ", text[i])

    @classmethod
    def _get_the_contexts(cls, text) :

        # For each bold point in the list
        contexts = []
        for i in range(len(text)) :
            # If the section is less than 20 words, it is not a good candidate
            if (len(text[i].split()) < 20) :
                continue
            # If the section is greater than 20 words and less than 350 words,
            # it is a good candidate
            elif (len(text[i].split()) >= 20 and len(text[i].split()) <= 350) :
                contexts.append(text[i])
            # If the section is greater than 350 words, break that section further
            # using some rules
            else :
                temp = cls._get_contexts_greater_than_max_size(text[i])
                for i in temp :
                    contexts.append(" ".join(i))

        # return contexts
        return contexts

    @classmethod
    def _get_contexts_greater_than_max_size(cls, text) :

        ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', \
                          'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', \
                          'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx' \
                          'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl' \
                          'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', \
                          'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx']

        # Regex for points. The points are often nested. The hierarchy of the points
        # is like this : Points starting with (1), (2) are the top ones, then (a), (b)
        # comes below it, then (i), (ii) and then (A) (B).
        level_wise_regex = [r"\([0-9]+\)", r"\([a-z]{1,2}\)", r"\((" + "|".join(ROMAN_NUMERALS) + r")\)" , r"\([A-Z]\)"]
        punctuations = ",-.:;"
        end_markers = ".;"
        current_index = 0
        regex_match_flag = 0
        max_context_length = 0

        contexts = []
        text = text.split()

        # Traverse through the text
        while (current_index < len(text)) :

            regex_match_flag = 0
            # Find the end index of the chunk considering the limits
            max_context_length = min(current_index + 350, len(text))
            context = text[current_index:max_context_length]
            # Check if this is the last chunk
            if max_context_length == len(text) :
                contexts.append(context)
                current_index += len(context)
            # If it is not the last chunk.
            else :
                # Find the possible end of that chunk by backtraversing. Find
                # the start of the current point by matching the regex and add
                # the context upto that index.
                # Backtrack half the length of current context.
                i = len(context) - 1
                while i > (len(context) // 2) :

                    # Try matching one of the regular expression
                    if (re.match(level_wise_regex[0], context[i]) is not None) or \
                        (re.match(level_wise_regex[1], context[i]) is not None) or \
                        (re.match(level_wise_regex[2], context[i]) is not None) or \
                        (re.match(level_wise_regex[3], context[i]) is not None) :
                        # The previous word should end with punctuation
                        if i != 0 and context[i-1][-1] in punctuations :
                            # Record the context, move the current index and the
                            # set the regex match flag to skip the next checking
                            context = context[:i]
                            contexts.append(context)
                            current_index += i
                            regex_match_flag = 1
                            break
                    i -= 1

                # If no regex is matched, backtrack and check for endmarker punctuation
                # like full-stop.
                if not regex_match_flag :
                    i = len(context) - 1
                    while (i > 0) :
                        if context[i][-1] in end_markers :
                            context = context[:i + 1]
                            contexts.append(context)
                            current_index += (i + 1)
                            break
                        i -= 1

            # Context less than 20 words are discarded.
            if len(contexts[-1]) < 20 :
                del contexts[-1]

        # Return the contexts
        return contexts
 

In [5]:
import itertools

from nltk import sent_tokenize

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Default checkpoints used by QGPipeline.
#   "model"     : name of the question-generation checkpoint.
#   "ans_model" : list of answer-extraction checkpoint names; QGPipeline loads
#                 every entry, so more than one can be listed.
PIPELINE_SETTINGS = {
    # Alternative question-generation checkpoint: "valhalla/t5-base-qg-hl"
    "model": "mrm8488/t5-base-finetuned-question-generation-ap",    # Question generation model
    "ans_model": ["valhalla/t5-base-qa-qg-hl"]                      # Answer extraction models
}

class QGPipeline:

    def __init__(self, pipeline_settings: dict = PIPELINE_SETTINGS, use_cuda: bool = True) :
        """Load the question-generation and answer-extraction models.

        pipeline_settings : dict with a 'model' entry (question-generation
                            checkpoint) and an 'ans_model' entry (list of
                            answer-extraction checkpoints).
        use_cuda          : run on "cuda" when it is available, else on "cpu".
        """
        # Question-generation model and its (slow) tokenizer
        qg_checkpoint = pipeline_settings['model']
        self.model = AutoModelForSeq2SeqLM.from_pretrained(qg_checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(qg_checkpoint, use_fast=False)

        # One model/tokenizer pair per answer-extraction checkpoint, kept in
        # two parallel lists
        self.ans_model = []
        self.ans_tokenizer = []
        for ans_checkpoint in pipeline_settings['ans_model'] :
            self.ans_model.append(AutoModelForSeq2SeqLM.from_pretrained(ans_checkpoint))
            self.ans_tokenizer.append(AutoTokenizer.from_pretrained(ans_checkpoint, use_fast=False))

        # Pick the device and move every model onto it
        if torch.cuda.is_available() and use_cuda :
            self.device = "cuda"
        else :
            self.device = "cpu"
        self.model.to(self.device)
        for extractor in self.ans_model :
            if extractor is not self.model :
                extractor.to(self.device)


    def __call__(self, text : str):
        """Generate question/answer pairs for `text`.

        Returns a list of {'question': ..., 'answer': ...} dicts, or an empty
        list when no answer span is extracted.
        """
        # Splitting on whitespace and re-joining with single spaces removes
        # newlines, tabs and leading/trailing spaces
        input_text = " ".join(text.split())

        # Candidate answer spans for the cleaned text
        answers = self._extract_answers(input_text)
        if len(answers) == 0:
            return []

        # One question per answer span; pair them up positionally
        questions = self._generate_questions(answers, input_text)
        return [
            {'question': question, 'answer': answer}
            for question, answer in zip(questions, answers)
        ]


    def _extract_answers(self, context):
        """Extract candidate answer spans from `context`.

        Every sentence of the context is highlighted in turn and passed to
        each answer-extraction model. Returns the distinct answers in the
        order they were first produced, so repeated runs give the same
        ordering (the previous list(set(...)) ordering could change between
        interpreter sessions). Returns an empty list when the context yields
        no sentences.
        """
        # Each input is of the form "s1 s2 si <hl> ans_sent <hl> sj sn". Here
        # 'si' represents the ith sentence.
        inputs = self._prepare_inputs_for_ans_extraction(context)
        if not inputs :
            # Nothing to tokenize
            return []

        # Convert the samples to token ids and move them to the device once
        inputs = self._tokenize(inputs, padding=True, truncation=True)
        input_ids = inputs['input_ids'].to(self.device)
        attention_mask = inputs['attention_mask'].to(self.device)

        answers = []
        for ans_model, ans_tokenizer in zip(self.ans_model, self.ans_tokenizer) :
            # The attention mask tells the model which ids are padding.
            # Maximum answer length considered is 64
            outs = ans_model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=64,
            )

            for ids in outs :
                # Special tokens are kept while decoding because '<sep>' is
                # needed to find the end of the first answer
                decoded = ans_tokenizer.decode(ids, skip_special_tokens=False)
                first_answer = decoded.split('<sep>')[0]
                # Delete the <pad> tokens in the answer
                answers.append(first_answer.replace("<pad> ", ""))

        # Drop duplicates while keeping first-seen order
        return list(dict.fromkeys(answers))


    def _prepare_inputs_for_ans_extraction(self, text):
        """Build one answer-extraction sample per sentence of `text`.

        Each sample starts with "extract answers:", contains every sentence of
        the paragraph with exactly one of them wrapped in <hl> markers (the
        sentence the model should focus on), and ends with " </s>".
        """
        # nltk's sentence tokenizer splits the paragraph into sentences
        sentences = sent_tokenize(text)

        samples = []
        for focus in range(len(sentences)) :
            sample = "extract answers:"
            for position, sentence in enumerate(sentences) :
                # Only the sentence in focus gets the highlight markers
                piece = "<hl> %s <hl>" % sentence if position == focus else sentence
                sample = ("%s %s" % (sample, piece)).strip()

            # End marker of the sample
            samples.append(sample + " </s>")

        return samples


    def _tokenize(self, inputs, padding=True, truncation=True, add_special_tokens=True, max_length=512):
        """Convert a list of input strings to token-id tensors.

        inputs             : list of strings.
        padding            : when true, every input is padded up to
                             `max_length` tokens; when false, no padding.
        truncation         : forwarded to the tokenizer; inputs longer than
                             `max_length` are truncated when enabled.
        add_special_tokens : forwarded to the tokenizer.
        max_length         : padding/truncation length (512 by default).

        Returns the tokenizer output with PyTorch tensors. The tokenizer of
        the first answer-extraction model is used for every caller.
        """
        # The deprecated `pad_to_max_length` keyword was passed here as well;
        # it is redundant because `padding` already states the strategy.
        return self.ans_tokenizer[0].batch_encode_plus(
            inputs,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            padding="max_length" if padding else False,
            return_tensors="pt"
        )


    def _generate_questions(self, answers, context):
        """Generate the questions for the list `answers` given `context`.

        Each answer is run through the question-generation model separately;
        the decoded outputs are returned in the same order as `answers`, with
        the "question: " prefix removed.
        """
        raw_questions = []
        for answer in answers :
            # Input has the form "answer: 'ans_text'  context: 'context_text'"
            prompt = "answer: %s  context: %s </s>" % (answer, context)
            encoded = self._tokenize([prompt], padding=True, truncation=True)

            # Beam search with 4 beams; maximum question length is 64.
            # About num_beams -> https://huggingface.co/blog/how-to-generate#beam-search
            generated = self.model.generate(
                input_ids=encoded['input_ids'].to(self.device),
                attention_mask=encoded['attention_mask'].to(self.device),
                max_length=64,
                num_beams=4,
            )

            # Special tokens are skipped while decoding
            for ids in generated :
                raw_questions.append(self.tokenizer.decode(ids, skip_special_tokens=True))

        # Remove the 'question: ' token present in the output
        return [question.replace("question: ", "") for question in raw_questions]

In [6]:
#
#-----------Question Answering Module-----------
#
import torch
import numpy as np
import transformers
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
        

class QA() :

    def __init__(self) :
        """Load the question-answering model and its tokenizer."""
        checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
        self.model_file = checkpoint
        self.model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        # answer_batch asks the tokenizer for offset mappings, so a fast
        # tokenizer is required
        assert isinstance(self.tokenizer, transformers.PreTrainedTokenizerFast)
        
    #Function to process a batch of contexts
    def answer_batch(self, questions, contexts, best_size=20, max_answer_length=100) :
        max_length = 480    #max length of input(question + context)
        doc_stride = 128    #length of overlap between consecutive features of the same example

        #Tokenize the question and context pair
        #Encode the words into word embeddings / encodings
        encodings = self.tokenizer(
                questions,
                contexts,
                truncation="only_second",
                max_length=max_length,
                stride=doc_stride,
                return_overflowing_tokens=True,
                return_offsets_mapping=True,
                return_attention_mask=True,
                padding="max_length"
            )
        
        #Sending the inputs to the cuda device
        cuda_device = torch.device("cuda")
        input_ids = torch.tensor(encodings.input_ids, device=cuda_device)
        token_type_ids=torch.tensor(encodings.token_type_ids, device=cuda_device)
        attention_mask=torch.tensor(encodings.attention_mask, device=cuda_device)
              
        #Running the QA model and fetching the output start and end scores of the tokens
        scores = self.model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask);
        all_start_logits = (scores.start_logits).cpu()
        all_end_logits = (scores.end_logits).cpu()

        #Releasing the GPU Memory
        del input_ids
        del token_type_ids
        del attention_mask
        del scores

        #Finding the mapping between the features and the contexts to extract the answer from the respective context
        context_mapping = encodings.pop("overflow_to_sample_mapping")
        offset_mappings = encodings.pop("offset_mapping")
        context_features = dict()
        for i, c in enumerate(context_mapping):
            if(c not in context_features.keys()):
                context_features[c] = list()
            context_features[c].append(i)

        #Finding the best valid answers from all the features for all contexts
        best_answers = list()
        for c, context in enumerate(contexts):
            #list of valid answers in the context
            valid_answers = []
            #Feature indexes associated with the context
            feature_indices = context_features[c]

            #Looping through all features of the context
            for feature_index in feature_indices:
                # We grab the predictions of the model for this feature.
                start_logits = all_start_logits[feature_index]
                end_logits = all_end_logits[feature_index]

                #Fetching the offset mapping to find the answer in the context
                offset_mapping = offset_mappings[feature_index]

                #Finding the best best_size start and end logits.
                start_indexes = np.argsort(start_logits.detach().numpy())[-1 : -best_size - 1 : -1].tolist()
                end_indexes = np.argsort(end_logits.detach().numpy())[-1 : -best_size - 1 : -1].tolist()
                #Going through all combinations and processing all possible answers!
                for start_index in start_indexes:
                    for end_index in end_indexes:
                        # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                        # to part of the input_ids that are not in the context.
                        if (
                            start_index >= len(offset_mapping)
                            or end_index >= len(offset_mapping)
                            or offset_mapping[start_index] is None
                            or offset_mapping[end_index] is None
                        ) : continue

                        # Don't consider answers with a length that is either < 0 or > max_answer_length.
                        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                            continue

                        #Finding the valid answer along with its score (start_logit + end_logit)
                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        valid_answers.append(
                            {
                                "score": start_logits[start_index] + end_logits[end_index],
                                "text": context[start_char: end_char]
                            }
                        )
                  
            #Finding the best answer for the batch
            if len(valid_answers) > 0:
                best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
            #In the very rare edge case there may not be a single non-null prediction,
            #hence, we create a fake prediction to avoid failure.
            else:
                best_answer = {"text": "<no_answer>", "score": 0.0}
            best_answers.append(best_answer)

        #Returning the best batch of answers            
        return best_answers

    #Function to find the best possible answer among all contexts
    def answer(self, questions, contexts) :
        #Sending batch of 32 contexts to the model for answering
        batch_size = 32
        cnt_batches = len(contexts)//batch_size + (1 if len(contexts)%batch_size != 0 else 0)
        best_anss = []

        #Calling the cuda device to run the model for the batch
        self.model.cuda()

        for b in range(cnt_batches):
            #Clearing the cuda cache to run the incoming batch          
            torch.cuda.empty_cache()

            #predicting the best answer for the given batch
            with torch.no_grad() :
                result = self.answer_batch(questions[b*batch_size : (b+1)*batch_size], contexts[b*batch_size : (b+1)*batch_size])
            
            #appending the best batch answers to list of best answers for the question
            best_anss = best_anss + result

        #Finding the best answer among all the best batch answers
        answer = sorted(best_anss, key=lambda x: x["score"], reverse=True)[0]["text"]
        return answer

In [7]:
class Main :
    """Entry points that tie context creation, question generation and
    question answering together."""

    # Both pipelines are created once, when the class body is executed, and
    # shared by every classmethod below
    qg = QGPipeline()
    qa = QA()

    @classmethod
    def get_contexts_given_the_doc(cls, doc_name) :
        """Return the list of context strings extracted from the PDF `doc_name`."""
        return TextProcessingAndContextCreation.get_context_chunks(doc_name)


    @classmethod
    def get_questions_given_the_contexts(cls, context_list) :
        """Return a list of [question, context] pairs, one for every question
        generated from every context of `context_list`."""
        return [
            [generated['question'], context]
            for context in context_list
            for generated in cls.qg(context)
        ]


    @classmethod
    def get_answer_given_the_question_and_context_list(cls, question_context_list) :
        """Answer every [question, context] pair on its own and return the
        answers in the same order."""
        return [
            cls.qa.answer([pair[0]], [pair[1]])
            for pair in question_context_list
        ]


    @classmethod
    def get_answer_for_single_question_given_context_list(cls, question, context_list) :
        """Ask `question` against every context of `context_list` and return
        the single best answer."""
        # The question is repeated so that it lines up with each context
        return cls.qa.answer([question] * len(context_list), context_list)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1230.0, style=ProgressStyle(description…

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1187795641.0, style=ProgressStyle(descr…




The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=629.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891612585.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=31.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=90.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [9]:
# Generate questions for the first five contexts of the chosen Act and answer
# each of them against its own context.
# Alternative document: "Sexual Harassment Act, 2013.pdf"
doc_name = "The Consumer Protection Act, 2019.pdf"
contexts = Main.get_contexts_given_the_doc(doc_name)[:5]
generated_questions_and_contexts = Main.get_questions_given_the_contexts(contexts)
answers = Main.get_answer_given_the_question_and_context_list(generated_questions_and_contexts)

  f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."


In [10]:
# Print every generated question followed by its predicted answer
for question_and_context, predicted_answer in zip(generated_questions_and_contexts, answers) :
    print(question_and_context[0], predicted_answer, sep="\n")

Who enacted the Act in the Seventieth Year of the Republic of India?
Parliament
Whose interests is the Act to protect?
consumers
What date shall the Consumer Protection Act, 2019 come into force?
such date1 as the Central Government may, by notification, appoint and different dates may be appointed for different States and for different provisions of this Act
What is the only state that is excluded from the scope of the Consumer Protection Act, 2019?
Jammu and Kashmir
What may this act be called?
Consumer Protection Act, 2019
What does the Consumer Protection Act apply to?
all goods and services
What does "advertisement" mean?
any audio or visual publicity, representation, endorsement or pronouncement made by means of light, sound, smoke, gas, print, electronic media, internet or website
What term means any person who-- if any--knows that the goods are unsafe to the public?
consumer
What does the expression "buy any goods" mean?
offline or online transactions through electronic means o

In [13]:
# Answer one hand-written question against every context of the document.
# Alternative document: "Sexual Harassment Act, 2013.pdf"
# Alternative question: 'What does "employee" mean without the knowledge of the principal employer?'
doc_name = "The Consumer Protection Act, 2019.pdf"
question = 'What does "advertisement" mean?'
contexts = Main.get_contexts_given_the_doc(doc_name)
answer = Main.get_answer_for_single_question_given_context_list(question, contexts)
print(answer)

any audio or visual publicity, representation, endorsement or pronouncement made by means of light, sound, smoke, gas, print, electronic media, internet or website
