In [1]:
import os
from dotenv import load_dotenv

# Read the .env file into the process environment; without this call the
# variables defined there are never visible to os.getenv.
load_dotenv()

# Accept either spelling of the key, then publish it under the upper-case name.
# Fail with a clear message instead of the opaque TypeError that
# `os.environ[...] = None` would raise when the key is missing.
_openai_api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OpenAI API key not found: define 'openai_api_key' (or 'OPENAI_API_KEY') "
        "in the .env file or in the environment."
    )
os.environ['OPENAI_API_KEY'] = _openai_api_key

In [2]:
import fitz
from typing import Dict, List

def extract_pdf_content(pdf_path: str, max_pages: int = None) -> Dict[int, Dict]:
    """Extract the text and tables of each page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        max_pages: Upper bound on the number of leading pages to process.
            ``None`` (the default) processes every page; ``0`` or a negative
            value processes no pages.

    Returns:
        A dict keyed by zero-based page index. Each value is a dict with
        ``'text'`` (the result of ``page.get_text()``) and ``'tables'`` (a list
        holding ``table.extract()`` for every table found on the page; empty
        when the page has no tables).
    """
    doc = fitz.open(pdf_path)
    try:
        if max_pages is None:
            pages_to_process = len(doc)
        else:
            # Clamp to [0, len(doc)] so that 0 means "no pages", not "all pages".
            pages_to_process = max(0, min(len(doc), max_pages))

        content = {}
        for page_num in range(pages_to_process):
            page = doc[page_num]
            finder = page.find_tables()

            content[page_num] = {
                'text': page.get_text(),
                # `tables` is the finder's list of detected Table objects.
                'tables': [table.extract() for table in finder.tables],
            }
        return content
    finally:
        # Release the document handle even when a page fails to parse.
        doc.close()

# Usage: extract only the first 10 pages of the report
content = extract_pdf_content("../raw-docs/uber-10k-report.pdf", max_pages=10)

# Walk the pages in order: always print a page header, then dump every row of
# every table found on that page (pages without tables show the header only).
for page_index, page_data in content.items():
    print(f"\n=== Page {page_index + 1} ===")

    page_tables = page_data['tables']
    if not page_tables:
        continue

    print(f"\nFound {len(page_tables)} tables:")
    for table_number, table_rows in enumerate(page_tables, start=1):
        print(f"\nTable {table_number}:")
        for row in table_rows:
            print(row)


=== Page 1 ===

=== Page 2 ===

Found 3 tables:

Table 1:
['Adjusted EBITDA reconciliation:', '', '', None]
['Net income (loss) attributable to Uber Technologies, Inc.', '$ (9,141)', '$', '1,887']
['Add (deduct):', '', '', None]
['Net income attributable to non-controlling interests, net of tax', '3', '269', None]
['Provision for (benefit from) income taxes', '(181)', '213', None]
['Income from equity method investments', '(107)', '(48)', None]
['Interest expense', '565', '633', None]
['Other (income) expense, net', '7,029', '(1,844)', None]
['Depreciation and amortization', '947', '823', None]
['Stock-based compensation expense', '1,793', '1,935', None]
['Legal, tax, and regulatory reserve changes and settlements', '732', '9', None]
['Goodwill and asset impairments/loss on sale of assets, net', '25', '84', None]
['Acquisition, financing and divestitures related expenses', '46', '36', None]
['Accelerated lease costs related to cease-use of ROU assets', '6', '—', None]
['COVID-19 respo

In [3]:
from pydantic import BaseModel, Field
from typing import List

# One line of the Adjusted EBITDA reconciliation: a label plus one amount per year.
# Documented with comments on purpose: a class docstring would be added to the
# schema generated from this model and would therefore alter the LLM prompt.
class AdjustedEBITDAItem(BaseModel):
    # Row label as it appears in the table.
    item: str = Field(
        ...,
        description="Name of the item in the Adjusted EBITDA reconciliation",
    )
    # Amount reported in the 2022 column.
    value_2022: float = Field(
        ...,
        description="Value for the year 2022 in millions of dollars",
    )
    # Amount reported in the 2023 column.
    value_2023: float = Field(
        ...,
        description="Value for the year 2023 in millions of dollars",
    )

# The whole reconciliation: every line item plus the final Adjusted EBITDA totals.
# Documented with comments on purpose: a class docstring would be added to the
# schema generated from this model and would therefore alter the LLM prompt.
class AdjustedEBITDAReconciliation(BaseModel):
    # Individual reconciliation rows, in table order.
    items: List[AdjustedEBITDAItem] = Field(
        ...,
        description="List of items in the Adjusted EBITDA reconciliation",
    )
    # Bottom-line total for the 2022 column.
    adjusted_ebitda_2022: float = Field(
        ...,
        description="Final Adjusted EBITDA value for 2022 in millions of dollars",
    )
    # Bottom-line total for the 2023 column.
    adjusted_ebitda_2023: float = Field(
        ...,
        description="Final Adjusted EBITDA value for 2023 in millions of dollars",
    )

In [4]:
# LLM approach for parsing the table

from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
import fitz
from typing import Dict, List

# NOTE(review): this redefines extract_pdf_content, which an earlier cell already
# defines; the later definition wins once this cell has run.
def extract_pdf_content(pdf_path: str, max_pages: int = None) -> Dict[int, Dict]:
    """Extract the text and tables of each page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.
        max_pages: Upper bound on the number of leading pages to process.
            ``None`` (the default) processes every page; ``0`` or a negative
            value processes no pages.

    Returns:
        A dict keyed by zero-based page index. Each value is a dict with
        ``'text'`` (the result of ``page.get_text()``) and ``'tables'`` (a list
        holding ``table.extract()`` for every table found on the page; empty
        when the page has no tables).
    """
    doc = fitz.open(pdf_path)
    try:
        if max_pages is None:
            pages_to_process = len(doc)
        else:
            # Clamp to [0, len(doc)] so that 0 means "no pages", not "all pages".
            pages_to_process = max(0, min(len(doc), max_pages))

        content = {}
        for page_num in range(pages_to_process):
            page = doc[page_num]
            finder = page.find_tables()

            content[page_num] = {
                'text': page.get_text(),
                # `tables` is the finder's list of detected Table objects.
                'tables': [table.extract() for table in finder.tables],
            }
        return content
    finally:
        # Release the document handle even when a page fails to parse.
        doc.close()

def compile_template_and_get_llm_response(preamble, extracted_text, pydantic_object):
    """Build a system + human chat prompt, send it to ChatOpenAI, and return the reply text.

    Args:
        preamble: Text used verbatim as the system message.
        extracted_text: Source text placed in the human message, between the
            format instructions and a fixed "no explanation" postamble.
        pydantic_object: Model class handed to ``PydanticOutputParser``; it is
            used only to generate the format instructions.

    Returns:
        The ``content`` of the model's response. The reply is printed and
        returned as-is; it is not parsed or validated here.
    """
    output_parser = PydanticOutputParser(pydantic_object=pydantic_object)

    # Two-message prompt: the system message carries the task description, the
    # human message carries schema instructions, the data, and the postamble.
    system_prompt = SystemMessagePromptTemplate.from_template("{preamble}")
    human_prompt = HumanMessagePromptTemplate.from_template(
        "{format_instructions}\n\n{extracted_text}\n\n{postamble}"
    )
    prompt_template = ChatPromptTemplate.from_messages([system_prompt, human_prompt])

    prompt_values = {
        "preamble": preamble,
        "format_instructions": output_parser.get_format_instructions(),
        "extracted_text": extracted_text,
        "postamble": "Do not include any explanation in the reply. Only include the extracted information in the reply.",
    }
    messages = prompt_template.format_prompt(**prompt_values).to_messages()

    llm_reply = ChatOpenAI().invoke(messages, temperature=0.0)
    reply_text = llm_reply.content
    print(f"Response from LLM:\n{reply_text}")
    return reply_text

def extract_adjusted_ebitda_from_pdf(pdf_content: dict) -> str:
    """Find the EBITDA table in extracted PDF content and ask the LLM to structure it.

    Args:
        pdf_content: Mapping of page index to page data, where each page data
            dict has a ``'tables'`` list and each table is a list of rows.

    Returns:
        Whatever ``compile_template_and_get_llm_response`` returns for the
        tab-separated text of the table (the LLM reply as a string).

    Raises:
        ValueError: If no table has a row whose text contains ``'EBITDA'``.
    """
    def find_ebitda_table(content):
        """Return the first table with a row mentioning 'EBITDA', else None."""
        for page_data in content.values():
            for table in page_data['tables']:
                if any('EBITDA' in str(row) for row in table):
                    return table
        return None

    def cell_to_text(cell):
        """Render one table cell; empty cells (None) become an empty string."""
        return "" if cell is None else str(cell)

    ebitda_table = find_ebitda_table(pdf_content)
    if not ebitda_table:
        raise ValueError("EBITDA table not found in the document")

    # Convert table to text format: one row per line, cells separated by tabs.
    # None cells are emitted as empty fields so the literal word "None" never
    # reaches the prompt as if it were table data.
    table_text = "\n".join(
        "\t".join(cell_to_text(cell) for cell in row) for row in ebitda_table
    )

    # Use LangChain to extract structured data
    preamble = ("You're seeing the Adjusted EBITDA reconciliation table from Uber's 10-K report. "
                "Your job is to accurately extract each item, its value for 2022 and 2023, "
                "and the final Adjusted EBITDA value. Treat negative values as negative numbers.")

    return compile_template_and_get_llm_response(preamble, table_text, AdjustedEBITDAReconciliation)

# Usage
# Step 1: read the text and tables of every page of the report with PyMuPDF
content = extract_pdf_content(pdf_path="../raw-docs/uber-10k-report.pdf")

# Step 2: hand the EBITDA table to the LLM (via LangChain) for structuring
ebitda_data = extract_adjusted_ebitda_from_pdf(pdf_content=content)

# Step 3: show the reply -- it is still a plain string and must be parsed into
# JSON before it can be used as data
print(ebitda_data)

Response from LLM:
{
    "items": [
        {
            "item": "Net income (loss) attributable to Uber Technologies, Inc.",
            "value_2022": -9141,
            "value_2023": 1887
        },
        {
            "item": "Net income attributable to non-controlling interests, net of tax",
            "value_2022": 3,
            "value_2023": 269
        },
        {
            "item": "Provision for (benefit from) income taxes",
            "value_2022": -181,
            "value_2023": 213
        },
        {
            "item": "Income from equity method investments",
            "value_2022": -107,
            "value_2023": -48
        },
        {
            "item": "Interest expense",
            "value_2022": 565,
            "value_2023": 633
        },
        {
            "item": "Other (income) expense, net",
            "value_2022": 7029,
            "value_2023": -1844
        },
        {
            "item": "Depreciation and amortization",
            "valu

In [5]:
import json
import ast
import os
from datetime import date

# First get the data using our combined PyMuPDF + LangChain approach
content = extract_pdf_content("../raw-docs/uber-10k-report.pdf")
response = extract_adjusted_ebitda_from_pdf(content)


def _parse_llm_response(raw_response: str) -> dict:
    """Parse the LLM reply into a dict; return {} (after reporting) on failure.

    JSON is tried first because the prompt asks for JSON; Python-literal
    syntax is accepted as a fallback. A reply wrapped in a Markdown code
    fence is unwrapped before parsing. Anything that does not parse to a
    dict is treated as a failure.
    """
    text = raw_response.strip()
    if text.startswith("```"):
        # Drop the surrounding backticks and an optional "json" language tag.
        text = text.strip("`").strip()
        if text[:4].lower() == "json":
            text = text[4:]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # If json.loads fails, try ast.literal_eval (single quotes, True/None, ...)
        try:
            parsed = ast.literal_eval(text.strip())
        except (SyntaxError, ValueError):
            parsed = None

    if not isinstance(parsed, dict):
        print("Error: Unable to parse the response string.")
        return {}
    return parsed


# Parse the string into a Python dictionary
parsed_data = _parse_llm_response(response)

# Add metadata
combined_data = {
    "document_type": "Uber 10-K Report",
    "extraction_date": str(date.today()),
    "extraction_method": "PyMuPDF + LangChain",
    "extracted_data": parsed_data
}

# Write to a JSON file, making sure its directory exists first
output_file = '../saved-docs/pymupdf_Uber_10k_extracted_data.json'
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=2)

print(f"Data written to {output_file}")

# Verify the saved data (optional)
with open(output_file, 'r', encoding='utf-8') as f:
    saved_data = json.load(f)

print("\nVerification of saved data:")
print(f"Document type: {saved_data['document_type']}")
print(f"Extraction date: {saved_data['extraction_date']}")
# extracted_data is {} when parsing failed, so do not assume 'items' exists.
print(f"Number of items: {len(saved_data['extracted_data'].get('items', []))}")

Response from LLM:
{
    "items": [
        {
            "item": "Net income (loss) attributable to Uber Technologies, Inc.",
            "value_2022": -9141,
            "value_2023": 1887
        },
        {
            "item": "Net income attributable to non-controlling interests, net of tax",
            "value_2022": 3,
            "value_2023": 269
        },
        {
            "item": "Provision for (benefit from) income taxes",
            "value_2022": -181,
            "value_2023": 213
        },
        {
            "item": "Income from equity method investments",
            "value_2022": -107,
            "value_2023": -48
        },
        {
            "item": "Interest expense",
            "value_2022": 565,
            "value_2023": 633
        },
        {
            "item": "Other (income) expense, net",
            "value_2022": 7029,
            "value_2023": -1844
        },
        {
            "item": "Depreciation and amortization",
            "valu

In [6]:
import json
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.schema import Document
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# 1. Read back the JSON payload saved by the extraction step
saved_json_path = os.path.join('../saved-docs/pymupdf_Uber_10k_extracted_data.json')
with open(saved_json_path, 'r') as f:
    data = json.load(f)

# 2. Wrap the extracted data in a single Document: the data is serialized to
#    JSON text for the page content, the provenance fields become metadata
serialized_payload = json.dumps(data['extracted_data'])
document_metadata = {
    "document_type": data['document_type'],
    "extraction_date": data['extraction_date'],
    "extraction_method": data.get('extraction_method', 'PyMuPDF + LangChain'),
}
documents = [Document(page_content=serialized_payload, metadata=document_metadata)]

# 3. Embed the documents and index them in a FAISS vector store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)

# 4. Expose the store as a retriever
retriever = vectorstore.as_retriever()

# 5. Assemble the QA chain: the chat model plus the hub prompt form the
#    document-combining step, which is then paired with the retriever
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
combine_docs_chain = create_stuff_documents_chain(llm, retrieval_qa_chat_prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# 6. Example queries
def ask_question(question: str) -> str:
    """Send one question through the retrieval chain and return its answer.

    Args:
        question: The question text, passed to the chain under the ``"input"`` key.

    Returns:
        The value stored under ``'answer'`` in the chain's response.
    """
    chain_input = {"input": question}
    return retrieval_chain.invoke(chain_input)['answer']

# Sample questions used to exercise the retrieval chain end to end
questions = [
    "What was Uber's Adjusted EBITDA in 2023?",
    "What was Uber's Adjusted EBITDA in 2022?",
    "What was the year-over-year change in Adjusted EBITDA?",
    "What were the major components affecting EBITDA in 2023?",
]

print("\nExample Questions and Answers:")
for question in questions:
    # Show the question before querying so it appears while the chain runs.
    print(f"\nQ: {question}")
    answer = ask_question(question)
    print(f"A: {answer}")




Example Questions and Answers:

Q: What was Uber's Adjusted EBITDA in 2023?
A: Uber's Adjusted EBITDA in 2023 was 4052.

Q: What was Uber's Adjusted EBITDA in 2022?
A: Uber's Adjusted EBITDA in 2022 was $1,713 million.

Q: What was the year-over-year change in Adjusted EBITDA?
A: The year-over-year change in Adjusted EBITDA was an increase of $2,339 million ($4,052 million in 2023 - $1,713 million in 2022).

Q: What were the major components affecting EBITDA in 2023?
A: The major components affecting EBITDA in 2023 were:
1. Net income (loss) attributable to Uber Technologies, Inc.
2. Net income attributable to non-controlling interests, net of tax
3. Provision for (benefit from) income taxes
4. Income from equity method investments
5. Interest expense
6. Other (income) expense, net
7. Depreciation and amortization
8. Stock-based compensation expense
9. Legal, tax, and regulatory reserve changes and settlements
10. Goodwill and asset impairments/loss on sale of assets, net
11. Acquisit