<a href="https://colab.research.google.com/github/aswinaus/RAG/blob/main/Lyft10K_of_rag_pipeline_pymupdf_langsmith_article.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

In [None]:
!pip install pdfminer.six langsmith langchain langchain_openai chromadb pypdf nest_asyncio

In [None]:
import nest_asyncio
# Apply nest_asyncio's patch so asyncio event loops can be nested inside the notebook kernel.
nest_asyncio.apply()

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the PDF and the vector store paths below live under it.
drive.mount('/content/drive')

In [None]:
from pdfminer.high_level import extract_text

def extract_text_from_pdf(pdf_path):
    """Return the text of the PDF at ``pdf_path`` as produced by pdfminer's ``extract_text``."""
    pdf_contents = extract_text(pdf_path)
    return pdf_contents

In [None]:
# Root folder on the mounted Google Drive; the PDF path and the Chroma persist directory
# used later in this notebook are built relative to it.
data_dir = '/content/drive/MyDrive' # Input a data dir path from your mounted Google Drive

In [None]:
# Base name (without the .pdf extension) of the 10-K filing to analyse.
# The PDF itself is stored on the mounted Drive under {data_dir}/RAG/data/10k/.
# NOTE(review): confirm the loading cells read this variable rather than a hard-coded name.
file_name = 'uber_10k_2023'

In [None]:
# Extract the raw text of the filing selected by `file_name` rather than a hard-coded
# name, so that changing `file_name` actually changes which PDF is read.
pdf_text = extract_text_from_pdf(f"{data_dir}/RAG/data/10k/{file_name}.pdf")

In [None]:
# Eyeball the extraction: show the text as a list of lines (at most the first 10,000).
pdf_text.split('\n')[:10000]

In [None]:
! pip install langchain-community


In [None]:
!pip install chromadb openai

In [None]:
import chromadb
from langchain.embeddings import OpenAIEmbeddings

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

# Splitter built via the tiktoken-based constructor: chunk_size 300 with an overlap of 50.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50
)
# Build the path from `file_name` so the loader reads the PDF the notebook is configured
# for instead of a hard-coded file.
loader = PyPDFLoader(f"{data_dir}/RAG/data/10k/{file_name}.pdf")
# load_and_split uses RecursiveCharacterTextSplitter by default, but here I customize the chunk size & overlap
pages = loader.load_and_split(text_splitter)

In [None]:
from google.colab import userdata
# Read the OpenAI key from Colab's user secrets and expose it through the environment.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [None]:
# create vector store with Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores.utils import filter_complex_metadata # import filter_complex_metadata

# Key is kept in a variable because the chat models created later take it explicitly.
openai_api_key = os.environ["OPENAI_API_KEY"]

# Embed every chunk with OpenAI embeddings and store the vectors in a Chroma collection
# whose files are written to the Drive folder given as persist_directory.
vectordb = Chroma.from_documents(
    documents=pages,
    embedding=OpenAIEmbeddings(openai_api_key=openai_api_key),
    persist_directory=f"{data_dir}/RAG/VectorDB/chroma_db_RAG_FOR_Evals",
)
vectordb.persist()

# Retriever view over the store, using its default settings.
retriever = vectordb.as_retriever()

In [None]:
# multi-query
from langchain.prompts import ChatPromptTemplate

# Prompt that asks the model to rewrite a single user question into five alternative
# phrasings, separated by newlines, so retrieval can be run once per phrasing.
template = """You are an AI language model Accounting assistant. Your task is to generate five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# Chain: question -> multi-query prompt -> chat model -> plain string -> list of queries.
# The final step trims each line and drops blank ones, so a model answer that separates
# the questions with empty lines does not send empty queries to the retriever.
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0, openai_api_key=openai_api_key)
    | StrOutputParser()
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

In [None]:
# Sample question about financial and operational highlights, used for the retrieval demo.
questiononRevenue = "Please summarize Financial and Operational Highlights for Uber?"

In [None]:
# Inspect the alternative phrasings generated for the highlights question.
generate_queries.invoke(questiononRevenue)

In [None]:
# Liquidity question; this is the one answered by the full RAG chain further down.
question = "Does Uber have the liquidity to meet its working capital and capital expenditures needs.Please explain?"

In [None]:
# Inspect the alternative phrasings generated for the liquidity question.
generate_queries.invoke(question)

In [None]:
# Retrieve docs given a list of queries
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_openai import ChatOpenAI

# Chat model with temperature 0.
llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

# Disabled: LangChain's built-in MultiQueryRetriever. It refers to `index`, whose creation
# is also commented out in this notebook, so it cannot be re-enabled as written.
#retriever = MultiQueryRetriever.from_llm(
#    retriever=index.as_retriever(), llm=llm
#)

In [None]:
# Rank documents
from langchain.load import dumps, loads

def rank_documents(results: list[list], k=60):
    """Fuse several ranked document lists into a single ranking (reciprocal rank fusion).

    Every document contributes ``1 / (rank + k)`` for each list it appears in, where
    ``rank`` is its zero-based position in that list. Documents are identified by their
    serialized form (``dumps``), so a document returned for several queries accumulates
    one combined score.

    Args:
        results: One ranked list of documents per query.
        k: Constant added to the rank before taking the reciprocal.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by score, highest first.
        Documents with equal scores keep the order in which they were first seen.
    """
    fused_scores = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            # Serialize so the document can be used as a dictionary key.
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_key), score) for doc_key, score in ranked]

# Retrieval pipeline: generate the query variants, run the retriever once per variant
# (retriever.map()), then fuse the per-query result lists with rank_documents.
retrieval_chain = generate_queries | retriever.map() | rank_documents
# Try the pipeline on the highlights question.
docs = retrieval_chain.invoke(questiononRevenue)

In [None]:
# Display the fused (document, score) pairs retrieved for the highlights question.
docs

In [None]:
#Creating a RAG Pipeline
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough


# RAG
# Answer-generation prompt: the retrieved context and the user's question are filled in.
# NOTE: this rebinds `template`, which earlier held the multi-query prompt text.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


# Chat model (temperature 0) that writes the final answer.
llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

# Chain
# The incoming question fans out to two branches: `retrieval_chain` turns it into the fused
# retrieval results used as context, while RunnablePassthrough forwards the input unchanged
# into the prompt's {question} slot.
final_rag_chain = (
    {"context": retrieval_chain, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Invoke with the bare question string, like every other call site in this notebook.
# Passing {"question": question} would make RunnablePassthrough hand the whole dict to the
# prompt, so the model would be shown the dict's repr instead of the question text.
final_rag_chain.invoke(question)

In [None]:
# Name of the LangSmith dataset that holds the evaluation examples.
dataset_name = "Uber_10K_2023_v2"

In [None]:
# create dataset
from langsmith import Client
import os

# Evaluation questions: five phrasings of the liquidity question (numbering prefixes are
# part of the stored text).
dataset_inputs = [
    '1. Can Uber cover its operational and investment expenses with its available funds?',
 '2. Is Uber financially equipped to handle its day-to-day expenses and long-term investments?',
 '3. Does Uber possess enough financial resources to support its working capital and capital expenditure requirements?',
 '4. How well is Uber positioned to meet its financial obligations for both short-term and long-term needs?',
 '5. Is Uber adequately funded to address its working capital and capital expenditure demands effectively?'
]

# Reference phrases for each question, aligned by position with dataset_inputs; they are
# stored under the "must_mention" key that the must_mention evaluator reads.
dataset_outputs = [
    {"must_mention": ["Uber's total assets were $38,699 million", "Additionally, Uber had restricted cash and cash equivalents of $805 million", "The company's financial position suggests that it has the resources to cover its short-term obligations and fund its operational requirements.", "1"]},
    {"must_mention": ["$12,682 million ($38,699 million - $26,017 million)", 'global company and as of December 31, 2023, we and our subsidiaries had approximately 30,400 employees globally and operations inapproximately 70 countries and more than 10,000 cities around the world', '$12,682 million ']},
    {"must_mention": ["$5,407 million in liquid assets", 'Freight Gross Bookings declining 25% year-over-year.']}, # reading from a table
    {"must_mention": ["The company's financial position suggests that it has the resources to cover its short-term obligations and fund its operational requirements."]}, # reading from a table
    {"must_mention": ["Uber had cash and cash equivalents of $4,680 million and short-term investments of $727 million"]}, # reading from a table
]

# ensure you have this set up
from google.colab import userdata
# Read the LangSmith key from Colab's user secrets and expose it through the environment.
os.environ["LANGCHAIN_API_KEY"] = userdata.get('LANGCHAIN_API_KEY')
langchain_api_key = os.environ["LANGCHAIN_API_KEY"]

client = Client(api_key=langchain_api_key)

# create_dataset fails when a dataset with this name already exists, which made this cell
# impossible to re-run. Reuse the existing dataset in that case, and upload the examples
# only when the dataset is first created so they are not duplicated.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Uber 10K 2023 questions",
    )
    client.create_examples(
        inputs=[{"question": q} for q in dataset_inputs],
        outputs=dataset_outputs,
        dataset_id=dataset.id,
    )

In [None]:
 #run evals in langsmith
from langsmith.schemas import Run, Example

from langsmith.evaluation import evaluate, LangChainStringEvaluator


def must_mention(run: Run, example: Example) -> dict:
    """Score a run by whether its output contains any of the example's reference phrases.

    Reads the run's ``"output"`` value and the example's ``"must_mention"`` list (either
    may be missing or empty) and looks for plain substring matches.

    NOTE(review): despite the name, one matching phrase is enough for a passing score,
    and an empty phrase list scores False. Confirm this is the intended rule.

    Returns:
        ``{"key": "must_mention", "score": bool}``.
    """
    answer_text = run.outputs.get("output") or ""
    phrases = example.outputs.get("must_mention") or []
    matched = False
    for phrase in phrases:
        if phrase in answer_text:
            matched = True
            break
    return {"key": "must_mention", "score": matched}

# Evaluators applied to every run of the experiment.
evaluators = [
  must_mention,
]
# System under evaluation: the full RAG chain.
runner = final_rag_chain
def query_wrapper(query_dict):
    """Run the RAG chain on one dataset example and wrap the answer for the evaluators.

    Args:
        query_dict: Example inputs; only the ``'question'`` entry is read.

    Returns:
        ``{"output": <chain response>}``.
    """
    return {"output": runner.invoke(query_dict['question'])}

# Run query_wrapper over the examples of the named dataset and score each run with the
# evaluators; "uber10Kv1" is passed as the experiment name prefix.
experiment_results = evaluate(
    query_wrapper,
    data=dataset_name,
    evaluators=evaluators,
    experiment_prefix="uber10Kv1",
    client=client,
)

In [None]:
!pip install -U langchain langchain-openai

In [None]:
from langchain_openai import ChatOpenAI

# Quick check that a default-configured chat model responds.
# NOTE: this rebinds `llm`, replacing the temperature=0 model created earlier.
llm = ChatOpenAI()
llm.invoke("Hello, world!")