In [57]:
from unstract.llmwhisperer.client import LLMWhispererClient
import os
from dotenv import load_dotenv

# Load the key/value pairs from the local .env file into the process
# environment. Without this call os.getenv() below only sees variables that
# were already exported in the shell that launched the kernel.
load_dotenv()

# Map the OpenAI key from the .env naming onto the variable name the OpenAI
# client reads. Fail with a clear message instead of the opaque
# "str expected, not NoneType" TypeError that os.environ raises for None.
if os.getenv("openai_api_key") is None:
    raise RuntimeError(
        "openai_api_key is not set; add it to the .env file or export it "
        "before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = os.getenv("openai_api_key")

# Credentials and endpoint for the Unstract LLMWhisperer service.
UNSTRACT_API_KEY = os.getenv("unstract_api_key")
BASE_URL = os.getenv("BASE_URL")

client = LLMWhispererClient(base_url=BASE_URL, api_key=UNSTRACT_API_KEY)

2024-10-21 11:18:25,014 - unstract.llmwhisperer.client - DEBUG - logging_level set to DEBUG
2024-10-21 11:18:25,016 - unstract.llmwhisperer.client - DEBUG - base_url set to https://llmwhisperer-api.unstract.com/v1
2024-10-21 11:18:25,017 - unstract.llmwhisperer.client - DEBUG - api_key set to fc21xxxxxxxxxxxxxxxxxxxxxxxxxxxx


In [34]:
# The except clause below needs this name in scope; without the import any
# client failure would surface as a NameError instead of being handled.
from unstract.llmwhisperer.client import LLMWhispererClientException

try:
    # Send the PDF to LLMWhisperer and request text-mode extraction.
    result = client.whisper(
        file_path="../raw-docs/uber-10k-report.pdf",
        processing_mode="text",
        force_text_processing=True
    )
except LLMWhispererClientException as e:
    # Best-effort: report the client error and keep the notebook running.
    # Note that extracted_text is NOT assigned on this path.
    print(e)
else:
    extracted_text = result["extracted_text"]
    print(extracted_text)

2024-10-21 10:55:03,916 - unstract.llmwhisperer.client - DEBUG - whisper called
2024-10-21 10:55:03,917 - unstract.llmwhisperer.client - DEBUG - api_url: https://llmwhisperer-api.unstract.com/v1/whisper
2024-10-21 10:55:03,919 - unstract.llmwhisperer.client - DEBUG - params: {'url': '', 'processing_mode': 'text', 'output_mode': 'line-printer', 'page_seperator': '<<<', 'force_text_processing': True, 'pages_to_extract': '', 'timeout': 200, 'store_metadata_for_highlighting': False, 'median_filter_size': 0, 'gaussian_blur_radius': 0, 'ocr_provider': 'advanced', 'line_splitter_tolerance': 0.4, 'horizontal_stretch_factor': 1.0}





    Adjusted EBITDA 


    We  define  Adjusted  EBITDA  as  net  income  (loss),  excluding  (i)  income  (loss)  from  discontinued  operations,  net  of  income  taxes,  (ii)  net  income  (loss) 
attributable to non-controlling interests, net of tax, (iii) provision for (benefit from) income taxes, (iv) income (loss) from equity method investments, (v) interest 
expense,  (vi)  other  income  (expense),  net,  (vii)  depreciation  and  amortization,  (viii)  stock-based  compensation  expense,  (ix)  certain  legal,  tax,  and  regulatory 
reserve  changes  and  settlements,  (x)  goodwill  and  asset  impairments/loss  on  sale  of  assets,  (xi)  acquisition,  financing  and  divestitures  related  expenses,  (xii) 
restructuring and related charges and (xiii) other items not indicative of our ongoing operating performance. 


    We have included Adjusted EBITDA in this Annual Report on Form 10-K because it is a key measure used by our management team to evaluate our operatin

In [36]:
from pydantic import BaseModel, Field
from typing import List

# Schema for a single line item of the Adjusted EBITDA reconciliation:
# the item's name plus one numeric value for each of 2022 and 2023.
# Used as the element type of AdjustedEBITDAReconciliation.items.
# NOTE(review): the Field descriptions presumably end up in the parser's
# format instructions sent to the LLM, so treat them as prompt text — confirm
# before rewording.
class AdjustedEBITDAItem(BaseModel):
    # Label of the reconciliation row.
    item: str = Field(description="Name of the item in the Adjusted EBITDA reconciliation")
    # Amount reported for 2022 (millions of dollars, per the description).
    value_2022: float = Field(description="Value for the year 2022 in millions of dollars")
    # Amount reported for 2023 (millions of dollars, per the description).
    value_2023: float = Field(description="Value for the year 2023 in millions of dollars")

# Top-level schema for the whole Adjusted EBITDA reconciliation: the list of
# line items plus the final Adjusted EBITDA totals for 2022 and 2023.
# This is the model handed to the extraction helper as the target output shape.
class AdjustedEBITDAReconciliation(BaseModel):
    # Every row of the reconciliation, in whatever order the extractor returns them.
    items: List[AdjustedEBITDAItem] = Field(description="List of items in the Adjusted EBITDA reconciliation")
    # Final Adjusted EBITDA total for 2022 (millions of dollars, per the description).
    adjusted_ebitda_2022: float = Field(description="Final Adjusted EBITDA value for 2022 in millions of dollars")
    # Final Adjusted EBITDA total for 2023 (millions of dollars, per the description).
    adjusted_ebitda_2023: float = Field(description="Final Adjusted EBITDA value for 2023 in millions of dollars")

In [52]:
import json
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser


def compile_template_and_get_llm_response(preamble, extracted_text, pydantic_object):
    """Ask the chat model to extract structured data and return it as JSON text.

    A system message carrying ``preamble`` and a human message carrying the
    parser's format instructions, the document text and a fixed postamble are
    sent to ``ChatOpenAI``. The reply is parsed as JSON and re-serialised with
    a 4-space indent.

    Args:
        preamble: Task description used as the system message.
        extracted_text: Document text the model should extract from.
        pydantic_object: Pydantic model class describing the expected output;
            used only to generate the format instructions.

    Returns:
        A pretty-printed JSON string when the reply is valid JSON (optionally
        wrapped in a Markdown code fence); otherwise the raw reply content,
        after printing an error message.
    """
    postamble = "Do not include any explanation in the reply. Only include the extracted information in the reply."
    system_template = "{preamble}"
    system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
    human_template = "{format_instructions}\n\n{extracted_text}\n\n{postamble}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    parser = PydanticOutputParser(pydantic_object=pydantic_object)
    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
    request = chat_prompt.format_prompt(preamble=preamble,
                                        format_instructions=parser.get_format_instructions(),
                                        extracted_text=extracted_text,
                                        postamble=postamble).to_messages()
    chat = ChatOpenAI()
    response = chat.invoke(request, temperature=0.0)

    raw_content = response.content

    # Chat models often wrap JSON in a Markdown fence (``` or ```json) even
    # when told not to. Strip one enclosing fence so such replies still parse.
    candidate = raw_content.strip()
    if candidate.startswith("```"):
        first_newline = candidate.find("\n")
        # Drop the opening fence line, including an optional language tag.
        candidate = candidate[first_newline + 1:] if first_newline != -1 else ""
        trimmed = candidate.rstrip()
        if trimmed.endswith("```"):
            candidate = trimmed[:-3]

    try:
        # Round-trip through the JSON parser to validate and normalise layout.
        return json.dumps(json.loads(candidate), indent=4)
    except json.JSONDecodeError:
        print("Error: Unable to parse JSON response")
        # Hand back the untouched reply so the caller can inspect it.
        return raw_content

In [53]:
def extract_adjusted_ebitda_from_text(extracted_text):
    """Extract the Adjusted EBITDA reconciliation from 10-K text via the LLM.

    Builds the task preamble for the reconciliation table and forwards it,
    together with ``extracted_text`` and the ``AdjustedEBITDAReconciliation``
    schema, to ``compile_template_and_get_llm_response``. Returns whatever
    that helper returns.
    """
    instruction_sentences = [
        "You're seeing the Adjusted EBITDA reconciliation table from Uber's 10-K report.",
        "Your job is to accurately extract each item, its value for 2022 and 2023, "
        "and the final Adjusted EBITDA value.",
        "Treat negative values as negative numbers.",
    ]
    # Sentences are joined by a single space to form the system preamble.
    preamble = " ".join(instruction_sentences)
    llm_reply = compile_template_and_get_llm_response(
        preamble, extracted_text, AdjustedEBITDAReconciliation
    )
    return llm_reply

In [54]:
import json
import ast
from datetime import date


# Run the LLM extraction over the text produced by LLMWhisperer.
response = extract_adjusted_ebitda_from_text(extracted_text)

# The extraction helper returns JSON text, so use the JSON parser first.
# ast.literal_eval is kept only as a fallback for replies that look like a
# Python literal (e.g. single-quoted keys) rather than strict JSON.
try:
    parsed_data = json.loads(response)
except json.JSONDecodeError:
    try:
        parsed_data = ast.literal_eval(response)
    except (SyntaxError, ValueError):
        print("Error: Unable to parse the response string.")
        # Best-effort: still write the envelope below, with empty data.
        parsed_data = {}

# Wrap the extracted data with provenance metadata.
combined_data = {
    "document_type": "Uber 10-K Report",
    "extraction_date": str(date.today()),  # ISO date on which this cell ran
    "extracted_data": parsed_data
}

# Write to a JSON file, creating the target directory on first run so a
# fresh checkout does not fail with FileNotFoundError.
output_dir = '../saved-docs'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'uber_10k_extracted_data.json')

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=2)

print(f"Data written to {output_file}")

Data written to ../saved-docs/uber_10k_extracted_data.json


In [55]:
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document

# Read back the extraction envelope written by the previous step.
with open("../saved-docs/uber_10k_extracted_data.json", 'r') as json_file:
    data = json.load(json_file)

# A single document: the extracted payload serialised as JSON text, tagged
# with the envelope's provenance fields as metadata.
documents = [
    Document(
        page_content=json.dumps(data['extracted_data']),
        metadata={
            field: data[field]
            for field in ("document_type", "extraction_date")
        },
    )
]

# Embed the document and index it in an in-memory FAISS store.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents=documents, embedding=embeddings)

# Expose the store as a retriever for the QA chain.
retriever = vectorstore.as_retriever()

In [56]:
from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Deterministic chat model used to answer questions over the retrieved context.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

# Retrieval-QA chat prompt pulled from the LangChain hub.
retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")

# Chain that stuffs the retrieved documents into the prompt and calls the LLM.
combine_docs_chain = create_stuff_documents_chain(llm=llm, prompt=retrieval_qa_chat_prompt)

# Full pipeline: retrieve relevant documents, then answer from them.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

response = retrieval_chain.invoke(
    {"input": "What was Uber's Adjusted EBITDA in 2022?"}
)
response["answer"]

"Uber's Adjusted EBITDA in 2022 was $1,713 million."