In [1]:
from llama_index import (VectorStoreIndex, 
                         SimpleDirectoryReader, 
                         StorageContext, 
                         load_index_from_storage, 
                         ServiceContext)
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.node_parser import SimpleNodeParser
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from llama_index.prompts import Prompt
from llama_index.llms import OpenAI

from pprint import pprint
import os
import openai
import sys
import json
sys.path.append("../src/")
from pipelines.paths import ARTICLES_JSONS_PATH

%load_ext autoreload
%autoreload 2

index_path = "../../data/temp/article_llama_index"
openai.api_key = os.environ["OPENAI_API_KEY"]
print(openai.api_key)


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


In [2]:
import logging
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))


# Prompts

In [9]:
chat_text_qa_msgs = [
    SystemMessagePromptTemplate.from_template(
        "Always answer the question, even if the context isn't helpful."
    ),
    HumanMessagePromptTemplate.from_template(
        "Passages from news articles below. Use them as context.\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n"
        "Given the context information and not prior knowledge, "
        "answer the question. For any claim you make, link it to the article you got the information from "
        "in this format [Link text Here](https://link-url-here.org): {query_str}\n"
    ),
]
chat_text_qa_msgs_lc = ChatPromptTemplate.from_messages(chat_text_qa_msgs)
text_qa_template = Prompt.from_langchain_prompt(chat_text_qa_msgs_lc)

# Refine Prompt
chat_refine_msgs = [
    SystemMessagePromptTemplate.from_template(
        "Always answer the question, even if the context isn't helpful."
    ),
    HumanMessagePromptTemplate.from_template(
        "We have the opportunity to refine the original answer "
        "(only if needed) with some more context below.\n"
        "------------\n"
        "{context_msg}\n"
        "------------\n"
        "Given the new context, refine the original answer to better "
        "answer the question, always referencing the articles for your claims: {query_str}. "
        "If the context isn't useful, output the original answer again.\n"
        "Original Answer: {existing_answer}"
    ),
]


chat_refine_msgs_lc = ChatPromptTemplate.from_messages(chat_refine_msgs)
refine_template = Prompt.from_langchain_prompt(chat_refine_msgs_lc)



# Build article index and save to disk - RUN ONCE

In [4]:
# Loading from a directory
documents = SimpleDirectoryReader(ARTICLES_JSONS_PATH).load_data()
for document in documents:
    #load document.text as json
    json_text = json.loads(document.text)
    document.text = json_text["text"]
    document.metadata = {"title": json_text["title"], "url": json_text["url"], "date": json_text["date"]}
    
parser = SimpleNodeParser.from_defaults(
    chunk_size=512,
    include_prev_next_rel=False,
    #metadata_extractor=metadata_extractor,
)

nodes = parser.get_nodes_from_documents(documents)
# Construct a simple vector index
index = VectorStoreIndex(nodes)
index.storage_context.persist(persist_dir=index_path)


DEBUG:llama_index.readers.file.base:> [SimpleDirectoryReader] Total files added: 54
> [SimpleDirectoryReader] Total files added: 54
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Summary
Summary Companies Audi considering U.S....
> Adding chunk: Summary
Summary Companies Audi considering U.S....
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: with the Volkswagen Group.
Industry publication...
> Adding chunk: with the Volkswagen Group.
Industry publication...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: is due to lay out in March how it will rejig it...
> Adding chunk: is due to lay out in March how it will rejig it...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: BERLIN, Feb 24 (Reuters) - Volkswagen Group (VO...
> Adding chunk: BERLIN, Feb 24 (Reuters) - Volkswagen Group (VO...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: by Jan Schwartz, Victoria Waldersee, Writing by...
> Adding chunk: by Jan Schwartz, Victoria Waldersee, Writing

# Querying

In [10]:
## Setup query engine ##
chatgpt = OpenAI(model="gpt-3.5-turbo", temperature=0.0)
service_context = ServiceContext.from_defaults(
    llm = chatgpt,
)
storage_context = StorageContext.from_defaults(persist_dir=index_path)
# load index
index = load_index_from_storage(storage_context, service_context = service_context)
query_engine = index.as_query_engine(
    similarity_top_k=4,
    text_qa_template=text_qa_template,
    refine_template=refine_template,
)

DEBUG:llama_index.storage.kvstore.simple_kvstore:Loading llama_index.storage.kvstore.simple_kvstore from ../../data/temp/article_llama_index/docstore.json.
Loading llama_index.storage.kvstore.simple_kvstore from ../../data/temp/article_llama_index/docstore.json.
DEBUG:fsspec.local:open file: /Users/andrecharneca/Desktop/projects/news-articles-info-extraction/code/notebooks/../../data/temp/article_llama_index/docstore.json
open file: /Users/andrecharneca/Desktop/projects/news-articles-info-extraction/code/notebooks/../../data/temp/article_llama_index/docstore.json
DEBUG:llama_index.storage.kvstore.simple_kvstore:Loading llama_index.storage.kvstore.simple_kvstore from ../../data/temp/article_llama_index/index_store.json.
Loading llama_index.storage.kvstore.simple_kvstore from ../../data/temp/article_llama_index/index_store.json.
DEBUG:fsspec.local:open file: /Users/andrecharneca/Desktop/projects/news-articles-info-extraction/code/notebooks/../../data/temp/article_llama_index/index_store.

DEBUG:llama_index.graph_stores.simple:Loading llama_index.graph_stores.simple from ../../data/temp/article_llama_index/graph_store.json.
Loading llama_index.graph_stores.simple from ../../data/temp/article_llama_index/graph_store.json.
DEBUG:fsspec.local:open file: /Users/andrecharneca/Desktop/projects/news-articles-info-extraction/code/notebooks/../../data/temp/article_llama_index/graph_store.json
open file: /Users/andrecharneca/Desktop/projects/news-articles-info-extraction/code/notebooks/../../data/temp/article_llama_index/graph_store.json
INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.


In [11]:
response = await query_engine.aquery(
    "What are the competitors of Microsoft?",
)

DEBUG:openai:message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
message='Request to OpenAI API' method=post path=https://api.openai.com/v1/embeddings
DEBUG:openai:api_version=None data='{"input": ["What are the competitors of Microsoft?"], "model": "text-embedding-ada-002", "encoding_format": "base64"}' message='Post details'
api_version=None data='{"input": ["What are the competitors of Microsoft?"], "model": "text-embedding-ada-002", "encoding_format": "base64"}' message='Post details'
DEBUG:urllib3.util.retry:Converted retries value: 2 -> Retry(total=2, connect=None, read=None, redirect=None, status=None)
Converted retries value: 2 -> Retry(total=2, connect=None, read=None, redirect=None, status=None)


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.openai.com:443
Starting new HTTPS connection (1): api.openai.com:443
DEBUG:urllib3.connectionpool:https://api.openai.com:443 "POST /v1/embeddings HTTP/1.1" 200 None
https://api.openai.com:443 "POST /v1/embeddings HTTP/1.1" 200 None
DEBUG:openai:message='OpenAI API response' path=https://api.openai.com/v1/embeddings processing_ms=35 request_id=490b350b44d0bad36ac03e08b928dcd9 response_code=200
message='OpenAI API response' path=https://api.openai.com/v1/embeddings processing_ms=35 request_id=490b350b44d0bad36ac03e08b928dcd9 response_code=200
DEBUG:llama_index.indices.utils:> Top 4 nodes:
> [Node 7acd1be9-a739-44c8-9a6d-9f223f158502] [Similarity score:             0.787217] concerns about the deal.

The U.K.’s Competition and Markets Authority said this month that the t...
> [Node 153b1eab-cacb-4edd-823c-b0c9878dba68] [Similarity score:             0.787096] Smith said.

The Nvidia agreement addresses that as the GeForce

In [15]:
response.source_nodes

[NodeWithScore(node=TextNode(id_='7acd1be9-a739-44c8-9a6d-9f223f158502', embedding=None, metadata={'title': 'Nvidia supports Microsoft, Activision merger after Xbox deal to add games to cloud service', 'url': 'https://www.cnbc.com/2023/02/21/microsoft-will-bring-xbox-games-to-nvidias-cloud-gaming-service.html', 'date': '2023-02-21 00:00:00'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7329af16-8c63-46ac-9ee9-c5e5c213ab8d', node_type=None, metadata={'title': 'Nvidia supports Microsoft, Activision merger after Xbox deal to add games to cloud service', 'url': 'https://www.cnbc.com/2023/02/21/microsoft-will-bring-xbox-games-to-nvidias-cloud-gaming-service.html', 'date': '2023-02-21 00:00:00'}, hash='8f1a8d3686b3932804945bb829ed3dc5b4ca6e6673039cdd969b1e6b3000f466')}, hash='bd571a7c68dcc0f5b43ae9cf80b2fe8e28316024db20158a4ba32410b3707ed8', text='concerns about the deal.\n\nThe U.K.’s Competition and

In [12]:
string = """Given a user question, and a list of tools, output a list of relevant sub-questions that when composed can help answer the full user question:\\n\\n# Example 1\\n<Tools>\\n```json\\n{\\n    \\"uber_10k\\": \\"Provides information about Uber financials for year 2021\\",\\n    \\"lyft_10k\\": \\"Provides information about Lyft financials for year 2021\\"\\n}\\n```\\n\\n<User Question>\\nCompare and contrast the revenue growth and EBITDA of Uber and Lyft for year 2021\\n\\n\\n<Output>\\n```json\\n[\\n    {\\n        \\"sub_question\\": \\"What is the revenue growth of Uber\\",\\n        \\"tool_name\\": \\"uber_10k\\"\\n    },\\n    {\\n        \\"sub_question\\": \\"What is the EBITDA of Uber\\",\\n        \\"tool_name\\": \\"uber_10k\\"\\n    },\\n    {\\n        \\"sub_question\\": \\"What is the revenue growth of Lyft\\",\\n        \\"tool_name\\": \\"lyft_10k\\"\\n    },\\n    {\\n        \\"sub_question\\": \\"What is the EBITDA of Lyft\\",\\n        \\"tool_name\\": \\"lyft_10k\\"\\n    }\\n]\\n```\\n\\n# Example 2\\n<Tools>\\n```json\\n{\\n    \\"news_articles\\": \\"News articles from the web about companies\\"\\n}\\n```\\n\\n<User Question>\\nGive me a list of the top companies working on AI technology.\\n\\n<Output>\\n""".replace("\\n", "\n")
print(string)

Given a user question, and a list of tools, output a list of relevant sub-questions that when composed can help answer the full user question:

# Example 1
<Tools>
```json
{
    \"uber_10k\": \"Provides information about Uber financials for year 2021\",
    \"lyft_10k\": \"Provides information about Lyft financials for year 2021\"
}
```

<User Question>
Compare and contrast the revenue growth and EBITDA of Uber and Lyft for year 2021


<Output>
```json
[
    {
        \"sub_question\": \"What is the revenue growth of Uber\",
        \"tool_name\": \"uber_10k\"
    },
    {
        \"sub_question\": \"What is the EBITDA of Uber\",
        \"tool_name\": \"uber_10k\"
    },
    {
        \"sub_question\": \"What is the revenue growth of Lyft\",
        \"tool_name\": \"lyft_10k\"
    },
    {
        \"sub_question\": \"What is the EBITDA of Lyft\",
        \"tool_name\": \"lyft_10k\"
    }
]
```

# Example 2
<Tools>
```json
{
    \"news_articles\": \"News articles from the web about com