In [None]:
from pathlib import Path
from typing import Union
from pypdf import PdfReader
from transformers import pipeline
import gradio as gr

In [2]:
# Retrieval function - text from PDF
def retrieve_text_from_pdf(pdf_file):
    """Reads the PDF from the provided path amd returns a string"""

    reader = PdfReader(pdf_file)

    all_text = ""
    for page in reader.pages:
        all_text += page.extract_text()
    return all_text

In [3]:
# Actual PDF text
pdf_text_all = retrieve_text_from_pdf("Uber-Q2-24-Earnings-Press-Release.pdf")

In [4]:
# Verify text has been read using index
pdf_text_all[:300]

' \n \n \n1    \nUber Announces Results for Second Quarter  2024  \n \nGross Bookings grew  19% year-over-year and 21% year-over-year on a constant currency basis  \nIncome from operations of $796 million ; Adjusted EBITDA of $1.6 billion , up 71% year-over-year \nOperating cash flow of $1.8 billion ; Free c'

In [5]:
# Create pipeline to answer questions with pre-trained model
answers_questions = pipeline(task="question-answering", model="deepset/tinyroberta-squad2")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [6]:
# Question Theme: Operating cash flow
answers_questions("What is the operating cash flow?", pdf_text_all)

{'score': 0.8730255365371704,
 'start': 279,
 'end': 291,
 'answer': '$1.8 billion'}

In [7]:
# Question Theme: Active Platform Consumers
answers_questions("What was the total number of monthly active platform consumers?", pdf_text_all)

{'score': 0.9715433120727539,
 'start': 12492,
 'end': 12503,
 'answer': '156 million'}

In [8]:
# Question Theme: Expansion
answers_questions("Where did UberX expand to?", pdf_text_all)

{'score': 0.9630413055419922,
 'start': 15572,
 'end': 15594,
 'answer': 'Hungary and Luxembourg'}

In [9]:
# Displaying model output
def extract_answer_from_output(pdf_file, question):
    pdf_text_all = retrieve_text_from_pdf(pdf_file)
    answer = answers_questions(question, pdf_text_all)
    return answer["answer"]

In [10]:
# Setup for app
pdf_input = gr.File(file_types=[".pdf"], label="Upload a PDF document and ask a question about the content.")
question = gr.Textbox(label="Type a question about the uploaded document here.")
gr.Interface(fn=extract_answer_from_output, inputs=[pdf_input, question], outputs="text").launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


