# RAG_toy

**Over a tiny 'FinanceBench' database**:

- [MICROSOFT_2023_10K](https://github.com/castillosebastian/genai0/blob/main/data/financebench/MICROSOFT_2023_10K.pdf)
- [JOHNSON&JOHNSON_2022Q4_EARNINGS](https://github.com/castillosebastian/genai0/blob/main/data/financebench/JOHNSON&JOHNSON_2022Q4_EARNINGS.pdf)
- [Pfizer_2023Q2_10Q](https://github.com/castillosebastian/genai0/blob/main/data/financebench/Pfizer_2023Q2_10Q.pdf)
- [BESTBUY_2017_10K](https://github.com/castillosebastian/genai0/blob/main/data/financebench/BESTBUY_2017_10K.pdf)
- [BESTBUY_2019_10K](https://github.com/castillosebastian/genai0/blob/main/data/financebench/BESTBUY_2019_10K.pdf)

# 1. Set credentials for Azure AI Search and Azure OpenAI (keep them secret)

In [1]:
! pip install openai azure-search-documents langchain --quiet
# keys (keep private)----------------------------------------------
import os
from azure.core.credentials import AzureKeyCredential
os.environ["AZURE_OPENAI_API_KEY"]  = 'b82effcf491e45a088b1cd578713311c'
os.environ["OPENAI_API_VERSION"]    = '2023-05-15'
os.environ["AZURE_OPENAI_ENDPOINT"] = 'https://usesharedaopenai001.openai.azure.com/'
os.environ["OPENAI_API_TYPE"]       = 'azure'
# Variables-------------------------------------------------------
index                               = "azure-cognitive-search-vector-demo"
azure_search_endpoint               = 'https://genai0.search.windows.net'
MODEL                               = "gtp35turbo-latest"
key                                 = 'lvhCA67EeE3JRyxyem5L0wGJSfOxscm2jft887ECdJAzSeDzoCNZ'
model                               = "text-embedding-ada-002"
credential                          = AzureKeyCredential(key)
COMPLETION_TOKENS                   = 1000
top_search_vector_k                 = 5

# 2. Some helper functions

In [2]:
import os
import sys
import pandas as pd
import openai
import json
import openai
from openai import AzureOpenAI
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.chat_models import AzureChatOpenAI
from langchain.docstore.document import Document
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryCaptionResult,
    QueryAnswerResult,
    SemanticErrorMode,
    SemanticErrorReason,
    SemanticSearchResultsType,
    QueryType,
    VectorizedQuery,
    VectorQuery,
    VectorFilterMode,
)
from langchain.vectorstores.azuresearch import AzureSearch
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from IPython.display import display, HTML, Markdown
# Some helper functions---------------------------------------------
def OpenAIembeddings():
    """Create the LangChain Azure OpenAI embeddings wrapper.

    Returns:
        An AzureOpenAIEmbeddings instance bound to the
        "text-embedding-ada-002" deployment with chunk_size=1000.
    """
    return AzureOpenAIEmbeddings(
        chunk_size=1000,
        azure_deployment="text-embedding-ada-002",
    )

# Shared Azure OpenAI client used by generate_embeddings. No arguments are passed, so it is
# presumably configured from the environment variables set in the credentials cell — TODO confirm.
client = AzureOpenAI()

def generate_embeddings(text, model=model):
    """Embed a single piece of text with the shared Azure OpenAI client.

    Args:
        text: the string to embed; it is sent as a one-element input list.
        model: embedding model name; defaults to the module-level `model`
            as it was when this function was defined.

    Returns:
        The `embedding` attribute of the first item in the response data.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

# Retriever------------------------------------------------------------
# Client for querying the Azure AI Search index, built from the endpoint, index name and
# key credential defined in the credentials cell.
# SearchIndexingBufferedSender is imported but not used in the cells shown here.
from azure.search.documents import SearchClient, SearchIndexingBufferedSender
search_client = SearchClient(endpoint=azure_search_endpoint, index_name=index, credential=credential)

# 3. Ask your question

In [3]:
# The question to ask. It is embedded for the vector search and, when final_answer
# is True, also passed to the QA chain. Only one question at a time is supported for now.
QUESTION = "What is the revenue of JOHNSON&JOHNSON"
# True: generate and display the LLM answer.
# False: skip the LLM and only print the retrieved documents.
final_answer = True

# 4. Retrieve-Augment-Generate

In [4]:
# Embed the question and run a vector-only search (no keyword text) against the index.
# The neighbour count comes from the config cell instead of a repeated literal.
vector_query = VectorizedQuery(
    vector=generate_embeddings(QUESTION),
    k_nearest_neighbors=top_search_vector_k,
    fields="contentVector",
)
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    include_total_count=True,
    select=["id", "company_name", "source", "doc_type", "page_content"],
)
# Format Azure AI Search results as an ordered dictionary keyed by document id,
# in the order the results are iterated.
from collections import OrderedDict
ordered_results = OrderedDict()
for result in results:
    ordered_results[result['id']] = {
        "score": result['@search.score'],
        "company_name": result['company_name'],
        "source": result['source'],
        "doc_type": result['doc_type'],
        "page_content": result['page_content'],
    }
# From the ordered dictionary, build the "Documents" array, needed by langchain "qa_with_sources" chain.
# The loop variables are deliberately NOT called `key`/`value`: `key` is the
# module-level Azure AI Search API key and must not be overwritten by a document id.
top_docs = []
for doc_id, fields in ordered_results.items():
    references = f'Company: {fields["company_name"]}, SEC report: {fields["doc_type"]} \n (reference doc:"{fields["source"]}", reference_score:{fields["score"]} - id:{doc_id}).'
    top_docs.append(Document(page_content=fields["page_content"], metadata={"source": references}))

In [5]:
# Build the Azure OpenAI chat model and run the "stuff" QA-with-sources chain
# over the retrieved documents. Skipped when only the retrieval output is wanted.
if final_answer:
    from langchain.chains.qa_with_sources import load_qa_with_sources_chain
    from langchain.chat_models import AzureChatOpenAI

    llm = AzureChatOpenAI(model_name=MODEL)
    chain = load_qa_with_sources_chain(llm, chain_type='stuff')

    # Inputs for the chain: the retrieved documents plus the user question.
    chain_inputs = {
        "input_documents": top_docs,
        "question": QUESTION,
        "language": "English",
    }
    response = chain(chain_inputs)

# 5. Answer

In [6]:
# Show either the raw retrieval results or the generated answer.
if not final_answer:
    # No LLM answer requested: list every retrieved document, then the raw list.
    for i, doc in enumerate(top_docs):
        print(
            f"Retrieved document {i}\n\nContent: {doc.page_content}",
            f"Retrieved document {i}\n\nMetadata: {doc.metadata}",
            '-' * 100,
            sep="\n",
        )
    print(top_docs)
else:
    # Header rendered as HTML, answer text rendered as Markdown.
    display(HTML("<br/><br/><b>RAG_toybot final answer:</b>"))
    display(Markdown(response['output_text']))

The revenue of Johnson & Johnson for the full year 2022 was $94,943 million.
SOURCES: Company: JOHNSON&JOHNSON, SEC report: EARNINGS (reference doc:"../../data/financebench/JOHNSON&JOHNSON_2022Q4_EARNINGS.pdf", reference_score:0.86887956 - id:454)