<a href="https://colab.research.google.com/github/aswinaus/RAG/blob/main/Lyft10K_of_rag_pipeline_pymupdf_langsmith_observability_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

In [None]:
!pip install pdfminer.six langsmith langchain langchain_openai chromadb pypdf nest_asyncio

In [3]:
import nest_asyncio
nest_asyncio.apply()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [6]:
from pdfminer.high_level import extract_text

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

In [7]:
# Download Data
data_dir = '/content/drive/MyDrive' # Input a data dir path from your mounted Google Drive

In [8]:
# step 1: upload a PDF to the root of the file browser
# then change the variable below to be the name of your file
file_name = 'uber_10k_2023'

In [9]:
pdf_text = extract_text_from_pdf(f"{data_dir}/RAG/data/10k/uber_10k_2023.pdf")

In [None]:
pdf_text.split('\n')[0:10000]

In [None]:
! pip install langchain-community


In [12]:
!pip install chromadb openai

In [13]:
import chromadb
from langchain.embeddings import OpenAIEmbeddings

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50
)
loader = PyPDFLoader(f"{data_dir}/RAG/data/10k/uber_10k_2023.pdf")
# load_and_split uses RecursiveCharacterTextSplitter by default, but here I customize the chunk size & overlap
pages = loader.load_and_split(text_splitter)

In [15]:
from google.colab import userdata
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [16]:
# create vector store with Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata # import filter_complex_metadata

openai_api_key = os.environ["OPENAI_API_KEY"]
# filtered_chunked_markdown = filter_complex_metadata(chunked_markdown)

#index = Chroma.from_documents(documents=pages, embedding=OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]))

vectordb = Chroma.from_documents(documents=pages, embedding=OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]),persist_directory=f"{data_dir}/RAG/VectorDB/chroma_db_RAG_FOR_Evals")
vectordb.persist()
retriever = vectordb.as_retriever()

  vectordb.persist()


In [17]:
# multi-query
from langchain.prompts import ChatPromptTemplate

template = """You are an AI language model Accounting assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0, openai_api_key=openai_api_key)
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

In [18]:
questiononRevenue = "Please summarize Financial and Operational Highlights for Uber?"

In [19]:
generate_queries.invoke(questiononRevenue)

['1. Can you provide a summary of the Financial and Operational Highlights of Uber?',
 "2. What are the key Financial and Operational Highlights that summarize Uber's performance?",
 "3. I'm interested in learning about the Financial and Operational Highlights of Uber. Can you provide a summary?",
 '4. Could you summarize the key Financial and Operational Highlights for Uber?',
 "5. What are the main points to know about Uber's Financial and Operational Highlights?"]

In [21]:
question = "Does Uber have the liquidity to meet its working capital and capital expenditures needs.Please explain?"

In [22]:
generate_queries.invoke(question)

['1. Can Uber meet its working capital and capital expenditures needs based on its liquidity?',
 "2. How does Uber's liquidity position impact its ability to cover working capital and capital expenditures?",
 "3. Is Uber's liquidity sufficient to support its working capital and capital expenditures requirements?",
 "4. What is the relationship between Uber's liquidity and its ability to meet working capital and capital expenditures needs?",
 "5. How does Uber's liquidity position affect its capacity to fulfill working capital and capital expenditures obligations?"]

In [23]:
# Retrieve docs given a list of queries
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

#retriever = MultiQueryRetriever.from_llm(
#    retriever=index.as_retriever(), llm=llm
#)

In [24]:
# Rank documents
from langchain.load import dumps, loads

def rank_documents(results: list[list], k=60):
    fused_scores = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            previous_score = fused_scores[doc_str]
            fused_scores[doc_str] += 1 / (rank + k)

    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    return reranked_results

retrieval_chain = generate_queries | retriever.map() | rank_documents
docs = retrieval_chain.invoke(questiononRevenue)

  (loads(doc), score)


In [25]:
docs

[(Document(metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'total_pages': 191, 'keywords': '0001543151-24-000012; ; 10-K', 'page_label': '84', 'page': 83, 'moddate': '2024-02-15T16:33:07-05:00', 'subject': 'Form 10-K filed on 2024-02-15 for the period ending 2023-12-31', 'creator': 'EDGAR Filing HTML Converter', 'source': '/content/drive/MyDrive/RAG/data/10k/uber_10k_2023.pdf', 'title': '0001543151-24-000012', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'creationdate': '2024-02-15T16:32:45-05:00'}, page_content='UBER TECHNOLOGIES, INC.\nNOTES TO CONSOLIDATED FINANCIAL STATEMENTS\nNote 1 – Description of Business and Summary of Significant Accounting Policies\nDescription of Business'),
  0.016666666666666666),
 (Document(metadata={'creator': 'EDGAR Filing HTML Converter', 'page_label': '52', 'creationdate': '2024-02-15T16:32:45-05:00', 'subject': 'Form 10-K filed on 2024-02-15 for the period ending 2023-12-31', 'title': '0001543151-24-000012', 'source

In [26]:
#Creating a RAG Pipeline
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough


# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

# Chain
final_rag_chain = (
    {"context": retrieval_chain, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

"Based on the provided context, Uber's liquidity can be assessed by looking at its balance sheet. The total assets for Uber as of December 31, 2023, were $38,699 million, while the total liabilities, redeemable non-controlling interests, and equity were $32,109 million. This indicates that Uber has a total equity of $6,590 million ($38,699 - $32,109), which can be used to meet its working capital and capital expenditures needs.\n\nAdditionally, Uber had cash and cash equivalents of $4,680 million, short-term investments of $727 million, and other current assets totaling $1,681 million, which can further contribute to its liquidity position.\n\nIn conclusion, based on the information provided in the balance sheet, Uber appears to have the liquidity to meet its working capital and capital expenditures needs."

In [32]:
os.environ["LANGSMITH_TRACING"]="true"
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"]=userdata.get('LANGSMITH_OBSERVABILITY')
os.environ["LANGSMITH_PROJECT"]="pr-aching-print-50"
os.environ["OPENAI_API_KEY"]=userdata.get('OPENAI_API_KEY')

In [33]:
import os
print(f"LANGSMITH_TRACING: {os.environ.get('LANGSMITH_TRACING')}")
print(f"LANGSMITH_ENDPOINT: {os.environ.get('LANGSMITH_ENDPOINT')}")
print(f"LANGSMITH_API_KEY: {os.environ.get('LANGSMITH_API_KEY')}")
print(f"LANGSMITH_PROJECT: {os.environ.get('LANGSMITH_PROJECT')}")

LANGSMITH_TRACING: true
LANGSMITH_ENDPOINT: https://api.smith.langchain.com
LANGSMITH_API_KEY: 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
LANGSMITH_PROJECT: pr-aching-print-50


In [34]:
LANGSMITH_TRACING=True
LANGSMITH_ENDPOINT="https://api.smith.langchain.com"
LANGSMITH_API_KEY=userdata.get('LANGSMITH_OBSERVABILITY')
LANGSMITH_PROJECT="pr-sparkling-mustache-32"
OPENAI_API_KEY=userdata.get('OPENAI_API_KEY')

In [35]:
from langsmith import Client, traceable
import openai
from langchain.llms import OpenAI

client = Client() # replace with your api key if you don't have environment variables set

@traceable(run_type="chain")  # or client=client, if you don't have environment variables set
def run_llm(prompt: str):
    llm = OpenAI()  # or any other llm you want to test with
    return llm(prompt)

run_llm("what is the meaning of life in the context of an IT Engineer and how will that change in the AI world in future?")

'\n\nThe meaning of life is a philosophical question that has been debated by many throughout history. It refers to the purpose or significance of human existence. Some believe that the meaning of life is to find happiness and fulfillment, while others think it is to fulfill a specific destiny or follow a set of moral principles. Some religious beliefs suggest that the meaning of life is to serve a higher power or to achieve enlightenment. Ultimately, the meaning of life may differ for each individual and can be shaped by personal beliefs, experiences, and values.'

In [42]:
import openai
from langsmith.wrappers import wrap_openai
from langsmith import traceable
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.load import dumps, loads
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

# Auto-trace LLM calls in-context
client = wrap_openai(openai.Client())

@traceable # Auto-trace this function
def pipeline(user_input: str):
    # --- final_rag_chain logic starts here ---

    # generate_queries from final_rag_chain
    template = """You are an AI language model Accounting assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines. Original question: {question}"""
    prompt_perspectives = ChatPromptTemplate.from_template(template)

    openai_api_key = os.environ["OPENAI_API_KEY"]  # Assuming you have OPENAI_API_KEY set

    generate_queries = (
        prompt_perspectives
        | ChatOpenAI(temperature=0, openai_api_key=openai_api_key)
        | StrOutputParser()
        | (lambda x: x.split("\n"))
    )

    # retriever from final_rag_chain (assuming you have 'vectordb' defined)
    retriever = vectordb.as_retriever()

    # rank_documents from final_rag_chain
    def rank_documents(results: list[list], k=60):
        fused_scores = {}
        for docs in results:
            for rank, doc in enumerate(docs):
                doc_str = dumps(doc)
                if doc_str not in fused_scores:
                    fused_scores[doc_str] = 0
                previous_score = fused_scores[doc_str]
                fused_scores[doc_str] += 1 / (rank + k)

        reranked_results = [
            (loads(doc), score)
            for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
        ]
        return reranked_results

    retrieval_chain = generate_queries | retriever.map() | rank_documents

    # RAG prompt from final_rag_chain
    template = """Answer the following question based on this context:

    {context}

    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(template)

    llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

    # final_rag_chain assembled
    final_rag_chain = (
        {"context": retrieval_chain, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    # --- final_rag_chain logic ends here ---

    # Invoke final_rag_chain with user_input
    response = final_rag_chain.invoke({"question": user_input})

    # Log the response and potentially other metadata
    # ... (your logging logic) ...

    return response

pipeline("Does Uber have the liquidity to meet its working capital and capital expenditures needs.Please explain?")

"Based on the provided context, Uber's liquidity can be assessed by looking at its balance sheet. The document mentions the company's assets, including cash and cash equivalents, short-term investments, and other current assets, as well as its liabilities and equity. By analyzing these figures, one can determine if Uber has the necessary liquidity to meet its working capital and capital expenditures needs.\n\nSpecifically, the document shows that Uber had cash and cash equivalents of $4,680 million and short-term investments of $727 million as of December 31, 2023. These figures indicate that Uber has a significant amount of liquid assets that could potentially cover its short-term financial obligations and capital expenditures.\n\nAdditionally, the total assets of Uber were $38,699 million, while the total liabilities were $32,109 million as of December 31, 2023. This suggests that Uber has a healthy equity position, which could further support its liquidity position.\n\nIn conclusion

In [39]:
dataset_name = "Uber_10K_2023_v3"

In [40]:
# create dataset
from langsmith import Client
import os

dataset_inputs = [
    '1. Can Uber cover its operational and investment expenses with its available funds?',
 '2. Is Uber financially equipped to handle its day-to-day expenses and long-term investments?',
 '3. Does Uber possess enough financial resources to support its working capital and capital expenditure requirements?',
 '4. How well is Uber positioned to meet its financial obligations for both short-term and long-term needs?',
 '5. Is Uber adequately funded to address its working capital and capital expenditure demands effectively?'
]

dataset_outputs = [
    {"must_mention": ["Uber's total assets were $38,699 million", "Additionally, Uber had restricted cash and cash equivalents of $805 million", "The company's financial position suggests that it has the resources to cover its short-term obligations and fund its operational requirements.", "1"]},
    {"must_mention": ["$12,682 million ($38,699 million - $26,017 million)", 'global company and as of December 31, 2023, we and our subsidiaries had approximately 30,400 employees globally and operations inapproximately 70 countries and more than 10,000 cities around the world', '$12,682 million ']},
    {"must_mention": ["$5,407 million in liquid assets", 'Freight Gross Bookings declining 25% year-over-year.']}, # reading from a table
    {"must_mention": ["The company's financial position suggests that it has the resources to cover its short-term obligations and fund its operational requirements."]}, # reading from a table
    {"must_mention": ["Uber had cash and cash equivalents of $4,680 million and short-term investments of $727 million"]}, # reading from a table
]

# ensure you have this set up
from google.colab import userdata
os.environ["LANGCHAIN_API_KEY"] = userdata.get('LANGCHAIN_API_KEY')
langchain_api_key = os.environ["LANGCHAIN_API_KEY"]

client = Client(api_key=langchain_api_key)

dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Uber 10K 2023 questions",
)
client.create_examples(
    inputs=[{"question": q} for q in dataset_inputs],
    outputs=dataset_outputs,
    dataset_id=dataset.id,
)

{'example_ids': ['f5730d2b-453a-4a48-99a3-19d0229a8856',
  '766cd441-ebff-4959-a0bf-dadc8ab29fe1',
  '7dcce152-93e5-401e-bbe8-887414fc23ee',
  'b996ccbc-8af2-407d-8f7c-998529099558',
  'aac460dc-5f2c-418e-b854-21410c08bf63'],
 'count': 5}

In [41]:
 #run evals in langsmith
from langsmith.schemas import Run, Example

from langsmith.evaluation import evaluate, LangChainStringEvaluator


def must_mention(run: Run, example: Example) -> dict:
    prediction = run.outputs.get("output") or ""
    required = example.outputs.get("must_mention") or []
    score = any(phrase in prediction for phrase in required)
    return {"key":"must_mention", "score": score}

evaluators = [
  must_mention,
]
runner = final_rag_chain
def query_wrapper(query_dict):
    query_string = query_dict['question']
    response = runner.invoke(query_string)
    return {"output": response}

experiment_results = evaluate(
    query_wrapper,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="uber10Kv1",
    client=client,
)

View the evaluation results for experiment: 'uber10Kv1-617863c0' at:
https://smith.langchain.com/o/04de93c6-630e-469d-a203-a695192872ea/datasets/167a3114-4096-4d78-bb2f-ec637be7dcc0/compare?selectedSessions=0beca073-4f3b-4464-9478-1e0c5f148542




0it [00:00, ?it/s]

In [None]:
!pip install -U langchain langchain-openai

In [None]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI()
llm.invoke("Hello, world!")