In [1]:
import os
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
import pickle
import asyncio
import nest_asyncio
nest_asyncio.apply()

from src.utils import (
    pdf_data_loader,
    get_nodes,
    get_index,
    get_sentence_window_query_engine,
load_data_to_sql_db,
csv_excel_data_loader,
text_to_query_engine
    )
from sqlalchemy import create_engine
import sqlite3
load_dotenv(find_dotenv())

True

In [2]:
# File Paths
# File Paths
current_path = Path.cwd()
pdf_dirs = current_path.parent / "data" / "pdfs"
# pdf_dirs  = Path("/Users/hamidadesokan/Dropbox/2_Skill_Development/DLML/genai_applications/RAG/data/pdfs")
docs_path = current_path.parent / "data" / "llamadocs"
# docs_path = Path("/Users/hamidadesokan/Dropbox/2_Skill_Development/DLML/genai_applications/RAG/data/llamadocs")
chroma_path = current_path.parent / "data" / "chromadb"
# chroma_path = Path("/Users/hamidadesokan/Dropbox/2_Skill_Development/DLML/genai_applications/RAG/data/chromadb")
sqlitepath = current_path.parent / "data" / "sqlite" / "sqlite.db"
# sqlitepath = Path("/Users/hamidadesokan/Dropbox/2_Skill_Development/DLML/genai_applications/RAG/data/sqlite/sqlite.db")
# Tesla Data
nodes_path = current_path.parent / "data" / "nodes" / "tesla_esg_nodes.pkl"
# nodes_path = Path("/Users/hamidadesokan/Dropbox/2_Skill_Development/DLML/genai_applications/RAG/data/nodes/tesla_esg_nodes.pkl")


# Sales Data
sales_data_path = current_path.parent / "data" / "csv" / "kaggle_sample_superstore.csv"
# sales_data_path = Path("/Users/hamidadesokan/Dropbox/2_Skill_Development/DLML/genai_applications/RAG/data/csv/kaggle_sample_superstore.csv")
sales_nodes_path = current_path.parent / "data" / "nodes" / "sales_nodes.pkl"
# sales_nodes_path = Path("/Users/hamidadesokan/Dropbox/2_Skill_Development/DLML/genai_applications/RAG/data/nodes/sales_nodes.pkl")


parsing_instruction = """
The pdf documents are annual sustainability reports of companies. 
Always quantify your answer using the numbers cited in the relevant text in the document.
Parse every number embedded in the text in a format that is the easiest to use for calculations or retrieval.
Parse the numbers that indicates sustainiability metrics for that year as a list of sustainability metrics with relevanat text and numbers. 
"""


## Load data and save the Document object

In [3]:
# esg_docs = asyncio.run(pdf_data_loader(pdf_dirs, docs_path, "10K_esg",parsing_instruction, 3))
with open(Path.joinpath(docs_path, "10K_esg"), "rb") as f:
    esg_docs = pickle.load(f)

In [4]:
# create nodes
# esg_nodes = get_nodes(esg_docs, nodes_path, is_markdown=True,
#                                         num_workers=6)
# 
with open(Path.joinpath(nodes_path), "rb") as f:
    esg_nodes = pickle.load(f)

In [5]:
# Create index
esg_index = get_index(str(chroma_path), "10K_esg", esg_nodes)

loading index 10K_esg


In [6]:
# Create query engine
esg_query_engine = get_sentence_window_query_engine(esg_index, similarity_top_k=6,
                                                    rerank_top_n=3)

In [8]:
# esg_query = "How much was Tesla's Research and Development amount and as a percentage of revenue in 2020?"
# esg_query = "How much did McKinsey invested in strengthening risk management since 2018?"
# esg_query = "How much was Tesla's Selling, General, and Administrative expense in 2022?"
esg_query = "How much was Tesla's Selling General and Administrative expense in 2022 as a percentage of revenue?"
# esg_query = "How much did Tesla's Selling, General, and Administrative changed from 2022 vs 2021 in dollar amount? "
response = esg_query_engine.query(esg_query)
print(response)


Tesla's Selling, General and Administrative (SG&A) expenses as a percentage of revenue decreased from 8% in 2021 to 5% in 2022.


In [9]:
df = load_data_to_sql_db(str(sales_data_path), str(sqlitepath), "Sales")
conn = sqlite3.connect(str(sqlitepath))
engine = create_engine("sqlite:///" + str(sqlitepath))
sales_sql_query_engine = text_to_query_engine(["Sales"], engine)

In [10]:
# sales_query ="What is the total sales in Los Angeles"
sales_query ="Which Segment sold the most?" 
response = sales_sql_query_engine.query(sales_query)
print(response)

Based on the SQL query provided, the segment that sold the most is the "Consumer" segment, with a total sales of 1,161,401.34.

The SQL query first groups the sales data by the "Segment" column, calculates the total sales for each segment using the SUM(Sales) function, orders the results by the total sales in descending order, and then takes the first (LIMIT 1) result, which represents the segment with the highest total sales.

Therefore, the response to the given query "Which Segment sold the most?" would be:

"The Consumer segment sold the most, with a total sales of 1,161,401.34."


In [10]:
response.metadata

{'6fe76441-0fb2-42cf-8d91-7c1a7ce6be4f': {},
 'sql_query': 'SELECT Segment, SUM(Sales) AS total_sales\nFROM Sales\nGROUP BY Segment\nORDER BY total_sales DESC\nLIMIT 1;',
 'result': [('Consumer', 1161401.3449999888)],
 'col_keys': ['Segment', 'total_sales']}

In [17]:
from llama_index.core.agent import FunctionCallingAgentWorker, AgentRunner
from llama_index.llms.anthropic import Anthropic
from llama_index.llms.openai import OpenAI
from llama_index.core.selectors import LLMSingleSelector, PydanticSingleSelector
from llama_index.core.query_engine import RouterQueryEngine, SubQuestionQueryEngine
from llama_index.core.tools import QueryEngineTool, ToolMetadata
# model_name = "claude-3-haiku-20240307"
llm = Anthropic(model="claude-3-opus-20240229")
# llm = OpenAI(temperature=0.1, model= "gpt-3.5-turbo")
esg_tool = QueryEngineTool(
    query_engine=esg_query_engine,
    metadata=ToolMetadata(
        name = "Tesla10K_Esg_Report_Query_Engine",
         description= "Tool to query McKinsey and Deloitte annual sustainability report. And Teslas 2022 10K financial statement"))
sql_tool = QueryEngineTool(
    query_engine=sales_sql_query_engine,
    metadata=ToolMetadata(
        name ="SQl engine to query the sqlite sales database",
        description="Useful for translating a natural language query into a SQL query over"
        " a table containing: product information"))
query_engine_tools=[
        esg_tool,
        sql_tool]

## ROUTER
# query_engine = RouterQueryEngine(
#     selector=PydanticSingleSelector.from_defaults(llm=llm),
#     query_engine_tools=query_engine_tools,
# )

## SUB QUERY
sq_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools)

## ANTHROPIC AGENT
# agent_worker = FunctionCallingAgentWorker.from_tools(
#     query_engine_tools, llm=llm, verbose=True
# )
# agent = AgentRunner(agent_worker)

In [18]:
# sales_query ="What is the total sales in Los Angeles"
sales_query ="Which product Segment sold the most using the sales database?"
# sales_query ="How much did Tesla's Selling, General, and Administrative changed from 2022 vs 2021 in dollar amount? "
response = sq_engine.query(sales_query)
print(response)

Generated 3 sub questions.
[1;3;38;2;237;90;200m[SQl engine to query the sqlite sales database] Q: What are the different product segments in the sales database?
[0m[1;3;38;2;90;149;237m[SQl engine to query the sqlite sales database] Q: What is the total sales volume for each product segment?
[0m[1;3;38;2;11;159;203m[SQl engine to query the sqlite sales database] Q: Which product segment had the highest total sales volume?
[0m[1;3;38;2;237;90;200m[SQl engine to query the sqlite sales database] A: Based on the SQL query and its response, the different product segments in the sales database are:

1. Consumer
2. Corporate
3. Home Office

The SQL query selects the distinct values of the "Segment" column from the "Sales" table, which returns the three unique product segments listed above.
[0m[1;3;38;2;11;159;203m[SQl engine to query the sqlite sales database] A: Based on the SQL query provided, the product segment that had the highest total sales volume is the "Consumer" segment. T

In [22]:
# esg_query = "How is McKinsey responding to the war in Ukraine in its sustainability effort?"
esg_query ="Compare Deloitte's sustainbaility goals with that of McKinsey" 
# esg_query = "How much was Tesla's Selling, General, and Administrative expense in 2022?"
# esg_query = "How much did Tesla's Selling, General, and Administrative changed from 2022 vs 2021 in dollar amount? "
# esg_query = "How much was Tesla's Selling General and Administrative expense in 2022 as a percentage of revenue?"
response = sq_engine.query(esg_query)
print(response)

Generated 3 sub questions.
[1;3;38;2;237;90;200m[Tesla10K_Esg_Report_Query_Engine] Q: What are Deloitte's sustainability goals as reported in their annual sustainability report?
[0m[1;3;38;2;90;149;237m[Tesla10K_Esg_Report_Query_Engine] Q: What are McKinsey's sustainability goals as reported in their annual sustainability report?
[0m[1;3;38;2;11;159;203m[Tesla10K_Esg_Report_Query_Engine] Q: How do Deloitte's and McKinsey's sustainability goals compare?
[0m[1;3;38;2;90;149;237m[Tesla10K_Esg_Report_Query_Engine] A: Based on the information provided in the context, McKinsey's key sustainability goals as reported in their annual sustainability report are:

1. Environmental Sustainability: Become the largest private sector catalyst for decarbonization.

2. Social Inclusive Growth: Build inclusive economies, institutions, and workforces that reflect their communities.

3. Governance Responsible Practices: Lead with integrity and set the standard for accountability and compliance in th