In [1]:
# !pip install -r data/requirements.txt

In [2]:
import os
import hashlib

# used to load the pdfs 
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# used to create the retrieval tool
from langchain.agents import tool

# used to create the memory
from langchain.memory import ConversationBufferMemory

# used to create the prompt template
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
from langchain.agents.openai_functions_agent.base import create_openai_functions_agent
from langchain.schema import SystemMessage
from langchain.prompts import MessagesPlaceholder

# used to create the agent executor
from langchain_openai import OpenAI
from langchain.agents import AgentExecutor
from langchain_openai import ChatOpenAI


from langchain.tools import BaseTool
from pydantic import BaseModel, Field
from typing import Optional, Type, List, Union
from langchain.prompts import PromptTemplate
from langchain.chains import LLMMathChain, LLMChain
from langchain.callbacks.manager import (AsyncCallbackManagerForToolRun, CallbackManagerForToolRun)
from langchain.memory import ConversationBufferWindowMemory
from langchain.agents import AgentType, initialize_agent, Tool, AgentExecutor, AgentOutputParser
from langchain.schema import AgentAction, AgentFinish, OutputParserException

from langchain import hub
from langchain.agents import AgentExecutor, create_openai_tools_agent


from langchain.agents import load_tools
from langchain_community.utilities import GoogleSearchAPIWrapper

In [3]:
# Credentials are never written into the notebook itself: the OpenAI key is read from a
# local key file and the Google Search credentials must already be in the environment.
# NOTE(review): the SerpAPI / Google keys that used to be hardcoded in this cell were
# exposed to anyone with access to the notebook and should be revoked and re-issued.
with open("API_key.txt") as key_file:  # context manager closes the file handle
    openai_api_key = key_file.read().strip()
os.environ["OPENAI_API_KEY"] = openai_api_key

# The Google search tool created further down needs these two variables; fail early
# with an actionable message instead of failing later inside the wrapper.
missing_credentials = [
    name for name in ("GOOGLE_API_KEY", "GOOGLE_CSE_ID") if not os.environ.get(name)
]
if missing_credentials:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_credentials)
        + ". Export them before starting the notebook kernel."
    )
# SERPAPI_API_KEY is no longer set here; export it in the environment if a
# SerpAPI-backed tool is added to this notebook.

# Name of the prompt variable under which the conversation memory is injected.
memory_key = "history"

In [4]:
# Shared chat model used by the agent and by the math tool.
# temperature = 0 requests the least random sampling from the model.
llm = ChatOpenAI(temperature = 0, openai_api_key=openai_api_key, model="gpt-3.5-turbo")

# Hashing Function 

In [5]:
# Gives every saved index a fixed-length, filesystem-safe folder name derived from the document names.
def hash_string(input_string: str) -> str:
    """
    Return the SHA-256 hex digest of a string.

    Parameters:
        input_string (str): The text to hash; it is UTF-8 encoded before hashing.

    Returns:
        str: The hexadecimal digest (64 lowercase hex characters).
    """
    return hashlib.sha256(input_string.encode('utf-8')).hexdigest()

# VectorBase Logic (File Upload Script)

In [6]:
# Reads a PDF from disk through PyPDFLoader.
def load_document(doc_path: str = ""):
    """
    Load a PDF with PyPDFLoader and return the loader's result.

    parameters:
    doc_path: path to the uploaded PDF document.

    returns:
    Whatever `PyPDFLoader(doc_path).load()` produces (the loaded document pages).
    """
    return PyPDFLoader(doc_path).load()

In [7]:
# Module-level OpenAI embeddings object. establish_retriever uses it both to embed new
# document chunks and to load previously saved FAISS indices.
embeddings = OpenAIEmbeddings()

In [8]:
# Tells the caller whether an entry called file_name already exists inside folder_path.
def check_file_exists(folder_path, file_name):
    """
    Report whether `file_name` exists inside `folder_path`.

    parameters:
    folder_path: folder that stores the saved db info.
    file_name: name of the entry to look for (the hashed value), NOT a full path.

    returns:
    True when the joined path exists (file or directory), False otherwise.
    """
    return os.path.exists(os.path.join(folder_path, file_name))

### PLEASE CHANGE THE SAVED_DB_ROOT VARIABLE HERE ACCORDING TO YOUR PATHS!

In [9]:
def establish_retriever(doc_paths: Optional[List[str]] = None, chunk_size: int = 1000, overlap: int = 50):
    """
    Build (or load from disk) a FAISS vector store covering the given PDF documents.

    Every document is loaded, split into chunks, embedded with the module-level
    `embeddings` object and indexed with FAISS. When more than one path is given, the
    per-document stores are merged into the first one (db1.merge_from(db2) collapses
    db2 into db1), so only the single merged index is saved. The index is written
    under `saved_db_root` in a folder named by a SHA-256 hash; a later call that
    produces the same hash loads that folder instead of re-embedding.

    Note: despite the function name, the FAISS vector store itself is returned. Call
    `.as_retriever()` on the result to obtain a retriever.

    parameters:
    doc_paths: List of paths to the user uploaded documents.
    chunk_size: Maximum chunk size the documents are broken into (passed to the splitter).
    overlap: Overlap between consecutive chunks (passed to the splitter as chunk_overlap).

    returns:
    The FAISS vector store, either freshly built and saved or loaded from disk.

    raises:
    ValueError: if no document path is given, or a document yields no text chunks.
    """

    if not doc_paths:
        raise ValueError("Must give at least one document")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    # Extract the bare document names (no directory, nothing after the first dot).
    # NOTE(review): two files with the same name in different folders map to the same
    # cache entry; this derivation is kept unchanged so existing saved indices stay valid.
    doc_names = [doc.split('/')[-1].split('.')[0] for doc in doc_paths]

    # CHANGE THIS TO YOUR ROOT (the path to the folder where db info for ALL USERS is stored)
    saved_db_root = 'data/indices'

    # Combine all names for hashing
    saved_db_name = '_'.join(doc_names)

    # The chunking settings change the content of the index, so they must be part of
    # the cache key; otherwise a call with different settings would load a stale index.
    # The default settings keep the original key so previously saved indices are still found.
    if (chunk_size, overlap) != (1000, 50):
        saved_db_name = f"{saved_db_name}|chunk_size={chunk_size}|overlap={overlap}"

    # Folder name of the index for this combination of documents and settings
    saved_db_value = hash_string(saved_db_name)

    # Full path (root included) where the index is stored
    saved_db_path = os.path.join(saved_db_root, saved_db_value)

    # If an index was already saved for this key, load and return it.
    if check_file_exists(saved_db_root, saved_db_value):
        # SECURITY: allow_dangerous_deserialization=True unpickles data from disk, which
        # can execute arbitrary code. Only point saved_db_root at folders this
        # notebook wrote itself; never load an index obtained from an untrusted source.
        return FAISS.load_local(saved_db_path, embeddings, allow_dangerous_deserialization=True)

    # New set of documents: build one store per document and merge them all into the
    # store of the first document (db_base).
    db_base = None

    for doc_path in doc_paths:
        texts = text_splitter.split_documents(load_document(doc_path))
        if not texts:
            # An empty chunk list (e.g. a PDF without extractable text) cannot be
            # indexed; report it clearly instead of failing inside FAISS.
            raise ValueError(f"No text could be extracted from document: {doc_path}")

        db = FAISS.from_documents(texts, embeddings)

        if db_base is None:
            db_base = db
        else:
            # Merge current db object into the base db object.
            db_base.merge_from(db)

    # Persist the merged index so the next call with the same key can reuse it
    db_base.save_local(saved_db_path)

    return db_base

In [10]:
# PDFs to index; passing several paths makes establish_retriever merge them into one index.
doc_paths = ["data/google_earning_call.pdf", "data/tesla_earning_call.pdf", "data/G04_RA1.pdf"]

# establish_retriever returns the FAISS vector store (built now or loaded from disk).
db = establish_retriever(doc_paths) 
retriever = db.as_retriever() # Wrap the store in a retriever object; this is what the retrieval tool queries

In [11]:
# Quick manual check: query the retriever directly, without going through the agent.
retriever.invoke('does the proposed Stupid Backoff work well?')

[Document(page_content='tributed systems to generate translations. However,\nthe authors propose an alternative strategy where\nthey only have to contact one system. This opti-\nmization is achieved by maintaining pre-calculated\nprobabilities for various phrases rather than rely-\ning on raw data. Their approach demonstrates an\nefficiency advantage.\n4.2 Stupid Backoff\nStupid backoff does not generate probabilities but\nrather scores. This could be a potential limitation\nbecause many applications using language models\nrely on normalized probabilities to make decisions.\nWe suggest the authors discuss this as a limitation\nand propose a workaround for users who want to\nuse probabilities rather than scores.\n4.3 Generation of n-grams\nThe text mentions that sharding based solely on\nthe first word of n-grams can lead to imbalanced\nshards, impacting runtime. The authors need to\nmention this as a potential limitation and elaborate\non how they fixed it (using the first two words fo

# Creating Tools

In [12]:
# This is the retrieval tool that searches the documents and fetches chunks most similar to user query using FAISS.
# NOTE: the one-line docstring below is the description the `@tool` decorator exposes
# to the model, so it is runtime text and must not be reworded casually.
@tool
def retriever_tool(query:str = None):
    "Searches and returns relevant documents to user queries regarding the uploaded document"
    # `invoke` replaces the deprecated `get_relevant_documents` and matches how the
    # retriever is queried directly elsewhere in this notebook.
    return retriever.invoke(query)

# List of tools handed to the agent; the other tools are appended in later cells.
tools = [retriever_tool]

In [13]:
# Arithmetic tool: hands the expression to an LLMMathChain built on the shared `llm`.
# The one-line docstring is kept verbatim because `@tool` uses it as the tool description.
@tool 
def math_tool(query):
    "Takes in a arithmetic equation as query and solves the equation in a step-by-step manner."
    return LLMMathChain.from_llm(llm, verbose=True).run(query)

tools.append(math_tool)

In [14]:
# Google Custom Search wrapper; it takes its credentials from the environment.
search = GoogleSearchAPIWrapper()

# Expose the search wrapper to the agent as a tool named "google_search".
# The description is what the model sees when deciding whether to call it.
search_tool = Tool(
    name="google_search",
    description="Search Google for recent results.",
    func=search.run,
)
tools.append(search_tool)

# Agent Logic (Generation Script)

In [15]:
# Conversation memory is what enables the back-and-forth conversation: earlier turns are
# stored and fed back to the model under the `memory_key` prompt variable.
# return_messages=True keeps the history as message objects rather than a single string.
memory = ConversationBufferMemory(memory_key=memory_key, return_messages=True)

In [16]:
# Our custom system prompt.
# Every sentence and numbered step sits on its own line. A trailing backslash inside a
# triple-quoted string removes the line break, which previously glued sentences and list
# items together (e.g. "tool.You", "answer:1."). The only remaining continuation keeps a
# space before it so the two halves of step 3 stay separated.
sys_prompt = """
You are an agent who has a second brain, namely its retrieval tool.
You make use of this tool along with your search and math tools to answer user queries.

Whenever given a query, follow this protocol to answer:
1. Always call your retrieval tool to fetch the data relevant to answering the user query.
2. Identify whether you need to use the search tool or not. ONLY USE THE SEARCH TOOL IF NO DATA IS RETURNED BY RETRIEVAL TOOL.
3. Augment the retrieved results and answer the user query.

Follow these instructions to format your final response:
1. Be concise.
2. Follow the protocol mentioned above AT ALL TIMES.
3. Before using the search tool ask the user explicitly "NO RESULTS FOUND. WANT ME TO SEARCH THE WEB?" \
if the reply is a yes only then use the search tool otherwise do not use it.
"""

In [17]:
# Wrap the system prompt and the conversation history into the agent's prompt template.
# The placeholder is named after memory_key so the memory's stored messages land in it.
history_placeholder = MessagesPlaceholder(variable_name=memory_key)
system_message = SystemMessage(content=sys_prompt)
prompt = OpenAIFunctionsAgent.create_prompt(
    system_message=system_message,
    extra_prompt_messages=[history_placeholder],
)

In [18]:
# Now we define the AGENT: an OpenAI-functions agent combining the chat model, the
# registered tools (retrieval, math, search) and the prompt built above.
agent = create_openai_functions_agent(llm=llm, tools=tools, prompt=prompt)

In [19]:
# This is where everything is wrapped together to form our RAG agent: the executor runs
# the agent with its tools and the conversation memory. RAG is just one capability,
# enabled by the retrieval tool; adding more tools gives the agent more abilities.
# verbose=True prints the intermediate tool calls seen in the outputs below.
agent_executor = AgentExecutor(agent=agent, tools=tools, memory=memory, verbose=True)

In [20]:
# This function passes user queries to the agent and returns the inferred response
def infer(query):
    """
    Send one user query through the agent executor and return its answer.

    parameters:
    query: the user's question or reply, as plain text.

    returns:
    The value stored under the 'output' key of the executor's result.
    """
    # Use `.invoke` rather than calling the executor object directly: the direct-call
    # form is deprecated and emits a deprecation warning on use.
    result = agent_executor.invoke({'input': query})

    return result['output']

# Example Inferences

In [21]:
# Example of the retriever tool being called.
print(infer("Can you tell me what was the effect of staff cuts on Google in terms of severance costs?"))

  warn_deprecated(




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `retriever_tool` with `{'query': 'effect of staff cuts on Google severance costs'}`


[0m[36;1m[1;3m[Document(page_content='Severance and office space exit charges are included within our consolidated statements of income as follows (in \nmillions):\nQuarter Ended December 31, 2023 Year Ended December 31, 2023\nSeverance \nand Related Office Space TotalSeverance \nand Related Office Space Total\nCost of revenues $ 2 $ 235 $ 237 $ 479 $ 481 $ 960 \nResearch and development  1  602  603 848  870  1,718 \nSales and marketing  3  198  201 497  257  754 \nGeneral and administrative  1  161  162 264  237  501 \nTotal charges $ 7 $ 1,196 $ 1,203 $ 2,088 $ 1,845 $ 3,933 \nFor segment reporting, the substantial majority of these charges are included within Alphabet-level activities in our \nsegment result s.\nChange in Useful Lives of Our Server and Network Equipment\nIn January 2023, we completed an assessment of the us

In [22]:
print(infer("What was the operating income in Q3 for tesla?"))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `retriever_tool` with `{'query': 'Tesla operating income Q3'}`


[0m[36;1m[1;3m[Document(page_content='S U M M  A R Y H I G H L I G H T S\n(1) Excludes SBC (stock -based compensation) and release of valuation allowance on deferred tax assets; (2) Free \ncash flow = operating cash flow less capex; (3) Includes cash, cash equivalents and investments; (4) Tesla estimate \nbased on preliminary data; (5) Calculated by dividing Cost of Automotive Sales Revenue by respective quarter’s \nnew deliveries (ex -operating leases); (6)Active driver supervision required; does not make the vehicle autonomous.Profitability $8.9B GAAP operating income in 2023; $2.1B in Q4\n$15.0B GAAP net income in 2023; $7.9B in Q4\n$10.9B non -GAAP net income1 in 2023; $2.5B in Q4\nOne-time non- cash tax benefit of $5.9B recorded in Q4 for the\nrelease of valuation allowance on certain deferred tax assetsIn 2023, we delivered over 1.2 million M

In [23]:
print(infer("Was the stupid backoff method any good?"))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `retriever_tool` with `{'query': 'stupid backoff method'}`


[0m[36;1m[1;3m[Document(page_content='showcases the authors’ ingenuity in addressing\nefficiency and overhead concerns which is a major\nstrength of the paper.\nSecondly, the authors’ ability to clearly artic-\nulate the challenges they faced when training a\nmodel at this scale (up to a trillion tokens) sets the\nstage perfectly for their proposed Stupid Backoff\nmethod. For example, the number of additional\nsteps required for the distributed training of the\nKneser-Ney algorithm (Table 1, page 5) already\nmakes it a daunting option compared to the much\nsimpler Stupid Backoff method.\nThirdly, the authors’ ability to go beyond just\nstating facts or equations adds more meaning to\ntheir work. We would like to cite two distinct ex-\namples of this:\n•When presenting the stupid backoff model for-\nmally (Brants et al., 2007) (equation 5, page\n2), the

In [24]:
print(infer('why is the sky blue?'))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mNO RESULTS FOUND. WANT ME TO SEARCH THE WEB?[0m

[1m> Finished chain.[0m
NO RESULTS FOUND. WANT ME TO SEARCH THE WEB?


In [25]:
print(infer('yes'))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `google_search` with `why is the sky blue`


[0m[38;5;200m[1;3mWhy Is the Sky Blue? The Short Answer: Sunlight reaches Earth's atmosphere and is scattered in all directions by all the gases and particles in the air. Blue ... Nov 4, 2023 ... The sky's blueness isn't from reflecting the water. Instead, its color has to do with scattered light. Gases and particles in Earth's atmosphere scatter sunlight in all directions. Blue light is scattered more than other colors because it travels as shorter, ... Why is the sky blue? ... As white light passes through our atmosphere, tiny air molecules cause it to 'scatter'. The scattering caused by these tiny air molecules ... Why is the sky blue? A clear cloudless day-time sky is blue because molecules in the air scatter blue light from the Sun more than they scatter red light. When ... Jan 31, 2020 ... The sky is blue due to a phenomenon called Raleigh scattering. This scatt