### Setup

In [19]:
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

In [20]:
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr]
!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz
!tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin
!pip install 'farm-haystack[faiss]'
!pip install Pillow==9.0.0
!pip install xlsxwriter
clear_output()

### Import Libraries

In [21]:
from haystack.nodes import PDFToTextConverter, PreProcessor, QuestionGenerator
from haystack.document_stores.faiss import FAISSDocumentStore
from haystack.nodes import FARMReader
from haystack.pipelines import QuestionAnswerGenerationPipeline
import time
import tqdm
import xlsxwriter

In [22]:
from google.colab import files
uploaded = files.upload()

document = "/content/CRISP-DM.pdf"

Saving CRISP-DM.pdf to CRISP-DM.pdf


### Pre-processing

In [23]:
pdf_converter = PDFToTextConverter(
    remove_numeric_tables=True,
    valid_languages=["en"]
)

converted = pdf_converter.convert(file_path=document, meta={"company": "Company_1", "processed":False})

In [24]:
preprocessor = PreProcessor(split_by="word", split_length=200, split_overlap=10)
preprocessed = preprocessor.process(converted)

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

In [25]:
timestr = time.strftime("%Y%m%d-%H%M%S")

document_store = FAISSDocumentStore(
    sql_url='sqlite:///'+timestr+'_document_store.db', faiss_index_factory_str="Flat", return_embedding=True)
document_store.delete_documents()
document_store.write_documents(preprocessed)

Writing Documents:   0%|          | 0/6 [00:00<?, ?it/s]

### Update embedding

In [26]:
reader = FARMReader(
        model_name_or_path='deepset/tinyroberta-squad2', use_gpu=True)
question_generator = QuestionGenerator()

Using sep_token, but it is not set yet.


In [27]:
pipeline = QuestionAnswerGenerationPipeline(question_generator, reader)

### Generate Question

In [28]:
def question_generator(document_store, pipeline):
    row = 1
    column = 0
    timestr = time.strftime("%Y%m%d-%H%M%S")
    filename_excel = timestr + '.xls'

    workbook = xlsxwriter.Workbook(filename_excel)
    worksheet = workbook.add_worksheet('Sheet 1')
    worksheet.write(0, 0, 'Question')
    worksheet.write(0, 1, 'Answer')
    worksheet.write(0, 2, 'Context')

    for idx, document in enumerate(tqdm.tqdm(document_store)):
        res = pipeline.run(documents=[document])
        for i in range(0, len(res['queries'])):
            query = res['queries'][i]
            worksheet.write(row, column, query)

            answer = res['answers'][i][0].answer
            worksheet.write(row, column + 1, answer)

            contexts = res['answers'][i][0].context
            worksheet.write(row, column + 2, contexts)
            row += 1
            
    workbook.close()

    return filename_excel

In [29]:
question_generator(document_store, pipeline)

0it [00:00, ?it/s]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.68 Batches/s]
1it [00:01,  1.23s/it]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  4.09 Batches/s]
2it [00:03,  1.56s/it]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.19 Batches/s]
3it [00:04,  1.32s/it]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.16 Batches/s]
4it [00:05,  1.44s/it]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  8.39 Batches/s]
5it [00:06,  1.32s/it]
Inferencing Samples:   0%|          | 0/1 [00:00<?, ? Batches/s][A
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  6.60 Batches/s]
6it [00:07,  1.32s/i

'20221028-023706.xls'