# Question Answering with PDFs and Transformers

### Imports

In [None]:
from pathlib import Path
from typing import Union
from pypdf import PdfReader
from transformers import pipeline
import gradio as gr

### Retrieval using PyPDF

In [None]:
def retrieve_text_from_pdf(pdf_file: Union[str, Path]) -> str:
    """Read a PDF and return the text of all its pages as one string.

    Args:
        pdf_file: Location of the PDF to read. The value is handed to
            ``PdfReader`` unchanged.

    Returns:
        The extracted text of every page, concatenated in page order with
        no separator between pages. A page that yields no text contributes
        an empty string.
    """
    reader = PdfReader(pdf_file)

    # Join once instead of growing a string with ``+=`` for every page.
    # ``or ""`` keeps a page without extractable text from breaking the
    # concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)

### Actual extraction of the text

In [None]:
# Extract the press release text once and keep it in a global so the cells
# below can reuse it. The path is relative, so the PDF has to be in the
# current working directory.
pdf_text_all = retrieve_text_from_pdf("Uber-Q2-24-Earnings-Press-Release.pdf")

In [None]:
# Sanity check: preview the first 300 characters of the extracted text to
# confirm that the PDF was actually read.
pdf_text_all[:300]

### Creating a pipeline to answer questions using a pre-trained model

In [None]:
# Build the question-answering pipeline once, backed by the
# "deepset/tinyroberta-squad2" model. The resulting callable is reused for
# every question asked below.
answers_questions = pipeline(task="question-answering", model="deepset/tinyroberta-squad2")

### Testing

In [None]:
# Question Theme: Operating cash flow
# The question is passed first and the full extracted PDF text second.
answers_questions("What is the operating cash flow?", pdf_text_all)

In [None]:
# Question Theme: Active Platform Consumers
# Asked against the same extracted text, checking a numeric fact.
answers_questions("What was the total number of monthly active platform consumers?", pdf_text_all)

In [None]:
# Question Theme: Expansion
# Asked against the same extracted text, checking a location-style fact.
answers_questions("Where did UberX expand to?", pdf_text_all)

### Creating an app to display the model output

In [None]:
# Displaying model output
def extract_answer_from_output(pdf_file, question):
    """Answer a question about a PDF and return only the answer text.

    Args:
        pdf_file: The PDF to read, forwarded to ``retrieve_text_from_pdf``.
            ``None`` is treated as "no document provided".
        question: The question to ask about the document.

    Returns:
        The value stored under ``"answer"`` in the pipeline result, or a
        short hint for the user when the document, the question, or the
        extractable text is missing.
    """
    # Validate the inputs before doing any work, so a missing upload or a
    # blank question gives a readable message instead of an exception.
    if pdf_file is None:
        return "Please upload a PDF document first."
    if not question or not question.strip():
        return "Please type a question about the uploaded document."

    document_text = retrieve_text_from_pdf(pdf_file)
    # A PDF without extractable text gives the model no context to search.
    if not document_text.strip():
        return "No text could be extracted from the uploaded PDF."

    # Keyword arguments make the question/context roles explicit.
    result = answers_questions(question=question, context=document_text)
    return result["answer"]

In [None]:
# Setup for app
# File upload restricted to ".pdf" files; its value becomes the handler's
# `pdf_file` argument.
pdf_input = gr.File(file_types=[".pdf"], label="Upload a PDF document and ask a question about the content.")
# Free-text box whose value becomes the handler's `question` argument.
question = gr.Textbox(label="Type a question about the uploaded document here.")
# Wire both inputs, in the handler's parameter order, to
# extract_answer_from_output, show the returned answer as text, and start
# the app.
gr.Interface(fn=extract_answer_from_output, inputs=[pdf_input, question], outputs="text").launch()