In [41]:
# NOTE(review): nest_asyncio is presumably applied so that async llama_index
# code (e.g. the use_async=True query engines built later in this notebook)
# can run inside the notebook's already-running event loop — confirm.
import nest_asyncio
nest_asyncio.apply()

import llama_index
# Register llama_index's "simple" global handler.
# NOTE(review): the "** Prompt: **" dumps in the pipeline output further down
# appear to come from this handler — confirm before relying on it.
llama_index.set_global_handler("simple")

import os

# Only fall back to the placeholder when no key is configured. A plain
# assignment would overwrite a real key already exported in the environment
# with the dummy value. Never replace the placeholder with a real key in the
# notebook itself; export OPENAI_API_KEY before starting the kernel instead.
os.environ.setdefault("OPENAI_API_KEY", "sk-...")

import logging
import sys

# Route log records to stdout at INFO level so they show up in the cell
# output; use logging.DEBUG instead for more verbose output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

from llama_index import (
    KnowledgeGraphIndex,
    ServiceContext,
    SimpleDirectoryReader,
    SimpleKeywordTableIndex
)
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
from llama_index.llms import OpenAI

from IPython.display import Markdown, display
from llama_index.llms.palm import PaLM
from llama_index.embeddings import GooglePaLMEmbedding


from llama_index.callbacks import (
    CallbackManager,
    LlamaDebugHandler
)


from llama_index.retrievers import (
    KeywordTableSimpleRetriever
)

from llama_index import Document, SummaryIndex
from llama_index.query_engine import PandasQueryEngine, RetrieverQueryEngine
from llama_index.retrievers import RecursiveRetriever
from llama_index.schema import IndexNode
from llama_hub.file.pymu_pdf.base import PyMuPDFReader
from pathlib import Path
from typing import List
from llama_index.readers import WikipediaReader

from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    SQLDatabase,
)

from llama_index.node_parser import SentenceSplitter
from llama_index.schema import IndexNode
from llama_index.response.notebook_utils import display_source_node


from llama_index.node_parser import SentenceSplitter
from llama_index.schema import IndexNode
from llama_index.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)


from llama_index import Document
from llama_index.embeddings import OpenAIEmbedding
from llama_index.text_splitter import SentenceSplitter
from llama_index.extractors import TitleExtractor
from llama_index.ingestion import IngestionPipeline, IngestionCache
from llama_index.schema import MetadataMode


In [2]:
# Debug handler that prints a trace when an operation ends (see the
# "Trace: index_construction" blocks in the outputs of the indexing cells).
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
# Callback manager carrying only the debug handler; it is passed to the
# ServiceContext below.
callback_manager = CallbackManager([llama_debug])

In [32]:
import getpass

# Never commit API keys to a notebook. Read the PaLM key from the environment
# and fall back to an interactive prompt when it is not set.
# SECURITY: a literal key used to be hardcoded on this line; treat that key as
# compromised and revoke it.
palm_api_key = os.environ.get("PALM_API_KEY") or getpass.getpass("PaLM API key: ")

# LLM used by the metadata extractors and for answer synthesis.
llm = PaLM(api_key=palm_api_key)

# Embedding model used when building the vector indexes.
model_name = "models/embedding-gecko-001"
embed_model = GooglePaLMEmbedding(model_name=model_name, api_key=palm_api_key)

# Bundle the LLM, embedding model, chunk size and debug callbacks so that every
# index built below shares the same configuration.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=512,
    callback_manager=callback_manager,
)

## Create an ingestion pipeline

In [33]:
from llama_index.node_parser import SentenceSplitter
from llama_index.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    EntityExtractor,
)

# Ordered steps applied by the ingestion pipeline: split the text first, then
# run each metadata extractor (all of them are handed the same LLM).
transformations = [
    # Chunk documents into nodes.
    SentenceSplitter(),
    # Document title.
    TitleExtractor(llm=llm, nodes=5),
    # Questions each excerpt can answer.
    QuestionsAnsweredExtractor(llm=llm, questions=3),
    # Summaries of the previous and the current section.
    SummaryExtractor(llm=llm, summaries=["prev", "self"]),
    # Keywords per excerpt.
    KeywordExtractor(llm=llm, keywords=10),
    # Named entities above the prediction threshold.
    EntityExtractor(llm=llm, prediction_threshold=0.5),
]

#### Load documents

In [24]:
from llama_index import download_loader

# NOTE(review): this rebinds the WikipediaReader name that the first cell
# already imported from llama_index.readers — confirm which one is intended.
WikipediaReader = download_loader("WikipediaReader")

loader = WikipediaReader()

# Fetch a single Wikipedia page; auto_suggest=False is passed through to the
# loader (presumably to disable title auto-correction — TODO confirm).
documents = loader.load_data(pages=["2023 in science"], auto_suggest=False)
# Keep only the first 2000 characters of the page text (modifies the loaded
# document in place) so the LLM-based extractors have little text to process.
documents[0].text = documents[0].text[:2000]

#### Run pipeline

In [35]:
# Build the pipeline from the transformations defined above.
pipeline = IngestionPipeline(transformations=transformations)

# Run every transformation over the loaded documents; the result is the list
# of nodes, each carrying the extracted metadata (inspected in the next cells).
nodes = pipeline.run(documents=documents)





  0%|                                                                                            | 0/1 [00:00<?, ?it/s][A[A[A[A



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.19s/it][A[A[A[A


** Prompt: **
Context: The following scientific events occurred or were scheduled to occur in 2023.


== Events ==


=== January ===


=== February ===


=== March ===


=== April ===
3 April
Five employees at the National Hurricane Center publish a tropical cyclone report (TCR) on Hurricane Ian, which officially upgrades the hurricane from a Category 4 to a Category 5 on the Saffir–Simpson scale. The TCR also stated that Hurricane Ian caused, with 90% confidence, $112.9 billion worth of damage to the United States, which made Ian the third-costliest United States hurricane on record as well as the costliest hurricane to strike Florida on record.
An unexplained rise of emissions of five chlorofluorocarbons (CFCs), successfully banned by the Montreal Protocol of 1989, is reported. Their climate impact in 2020 is roughly equivalent to that of the CO2e from Denmark in 2018.
A study affirms and explains why a moderate decrease in body temperature extends lifespan.
5 April
The NOAA reports 





  0%|                                                                                            | 0/1 [00:00<?, ?it/s][A[A[A[A



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.91s/it][A[A[A[A


** Prompt: **
Here is the context:
[Excerpt from document]
document_title: 2023 in science
Excerpt:
-----
The following scientific events occurred or were scheduled to occur in 2023.


== Events ==


=== January ===


=== February ===


=== March ===


=== April ===
3 April
Five employees at the National Hurricane Center publish a tropical cyclone report (TCR) on Hurricane Ian, which officially upgrades the hurricane from a Category 4 to a Category 5 on the Saffir–Simpson scale. The TCR also stated that Hurricane Ian caused, with 90% confidence, $112.9 billion worth of damage to the United States, which made Ian the third-costliest United States hurricane on record as well as the costliest hurricane to strike Florida on record.
An unexplained rise of emissions of five chlorofluorocarbons (CFCs), successfully banned by the Montreal Protocol of 1989, is reported. Their climate impact in 2020 is roughly equivalent to that of the CO2e from Denmark in 2018.
A study affirms and explains why 





  0%|                                                                                            | 0/1 [00:00<?, ?it/s][A[A[A[A



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.97s/it][A[A[A[A


** Prompt: **
Here is the content of the section:
[Excerpt from document]
document_title: 2023 in science
questions_this_excerpt_can_answer: 1. What was the costliest hurricane to strike Florida on record?
2. What is the largest source of added sugars?
3. What is the role of elites' unsustainable consumption in urban water crises?
Excerpt:
-----
The following scientific events occurred or were scheduled to occur in 2023.


== Events ==


=== January ===


=== February ===


=== March ===


=== April ===
3 April
Five employees at the National Hurricane Center publish a tropical cyclone report (TCR) on Hurricane Ian, which officially upgrades the hurricane from a Category 4 to a Category 5 on the Saffir–Simpson scale. The TCR also stated that Hurricane Ian caused, with 90% confidence, $112.9 billion worth of damage to the United States, which made Ian the third-costliest United States hurricane on record as well as the costliest hurricane to strike Florida on record.
An unexplained rise 





  0%|                                                                                            | 0/1 [00:00<?, ?it/s][A[A[A[A



100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.70s/it][A[A[A[A

** Prompt: **
The following scientific events occurred or were scheduled to occur in 2023.


== Events ==


=== January ===


=== February ===


=== March ===


=== April ===
3 April
Five employees at the National Hurricane Center publish a tropical cyclone report (TCR) on Hurricane Ian, which officially upgrades the hurricane from a Category 4 to a Category 5 on the Saffir–Simpson scale. The TCR also stated that Hurricane Ian caused, with 90% confidence, $112.9 billion worth of damage to the United States, which made Ian the third-costliest United States hurricane on record as well as the costliest hurricane to strike Florida on record.
An unexplained rise of emissions of five chlorofluorocarbons (CFCs), successfully banned by the Montreal Protocol of 1989, is reported. Their climate impact in 2020 is roughly equivalent to that of the CO2e from Denmark in 2018.
A study affirms and explains why a moderate decrease in body temperature extends lifespan.
5 April
The NOAA reports that gree




Extracting entities:   0%|          | 0/1 [00:00<?, ?it/s]

In [38]:
# Inspect the metadata attached to the first node by the pipeline.
nodes[0].metadata

{'document_title': '2023 in science',
 'questions_this_excerpt_can_answer': "1. What was the costliest hurricane to strike Florida on record?\n2. What is the largest source of added sugars?\n3. What is the role of elites' unsustainable consumption in urban water crises?",
 'section_summary': "Key topics: Hurricane Ian, sugar-sweetened beverages, neurons, elites' unsustainable consumption, wheat blast fungus\nEntities: Hurricane Ian, sugar-sweetened beverages, neurons, elites, Cape Town, wheat blast fungus",
 'excerpt_keywords': 'April, chlorofluorocarbon, CO2, emissions, hurricane, hurricane ian, NOAA, study, temperature, water',
 'entities': ['Cape Town',
  'Hurricane Ian',
  'Florida',
  'National Hurricane Center']}

In [45]:
## Node content as rendered with MetadataMode.LLM, i.e. the text (metadata
## plus excerpt) that is handed to the LLM during answer synthesis.
print(
    "LLM sees:\n",
    nodes[0].get_content(metadata_mode=MetadataMode.LLM),
)

LLM sees:
 [Excerpt from document]
document_title: 2023 in science
questions_this_excerpt_can_answer: 1. What was the costliest hurricane to strike Florida on record?
2. What is the largest source of added sugars?
3. What is the role of elites' unsustainable consumption in urban water crises?
section_summary: Key topics: Hurricane Ian, sugar-sweetened beverages, neurons, elites' unsustainable consumption, wheat blast fungus
Entities: Hurricane Ian, sugar-sweetened beverages, neurons, elites, Cape Town, wheat blast fungus
excerpt_keywords: April, chlorofluorocarbon, CO2, emissions, hurricane, hurricane ian, NOAA, study, temperature, water
entities: ['Cape Town', 'Hurricane Ian', 'Florida', 'National Hurricane Center']
Excerpt:
-----
The following scientific events occurred or were scheduled to occur in 2023.


== Events ==


=== January ===


=== February ===


=== March ===


=== April ===
3 April
Five employees at the National Hurricane Center publish a tropical cyclone report (TCR) o

In [44]:
## Node content as rendered with MetadataMode.EMBED, i.e. the text (metadata
## plus excerpt) that is handed to the embedding model when the node is
## embedded. The label says "Embed model" so this output is not confused with
## the MetadataMode.LLM view printed by the previous cell.
print(
    "Embed model sees:\n",
    nodes[0].get_content(metadata_mode=MetadataMode.EMBED),
)

LLM sees:
 [Excerpt from document]
document_title: 2023 in science
questions_this_excerpt_can_answer: 1. What was the costliest hurricane to strike Florida on record?
2. What is the largest source of added sugars?
3. What is the role of elites' unsustainable consumption in urban water crises?
section_summary: Key topics: Hurricane Ian, sugar-sweetened beverages, neurons, elites' unsustainable consumption, wheat blast fungus
Entities: Hurricane Ian, sugar-sweetened beverages, neurons, elites, Cape Town, wheat blast fungus
excerpt_keywords: April, chlorofluorocarbon, CO2, emissions, hurricane, hurricane ian, NOAA, study, temperature, water
entities: ['Cape Town', 'Hurricane Ian', 'Florida', 'National Hurricane Center']
Excerpt:
-----
The following scientific events occurred or were scheduled to occur in 2023.


== Events ==


=== January ===


=== February ===


=== March ===


=== April ===
3 April
Five employees at the National Hurricane Center publish a tropical cyclone report (TCR) o

##### Building an index and a query engine on top of the extracted metadata should make document retrieval more effective

In [46]:
# Number of nodes produced by the pipeline.
len(nodes)

1

In [48]:
from copy import deepcopy

# Baseline for comparison: work on a deep copy so the enriched `nodes` stay
# untouched, and drop every metadata entry except the basic file-level keys.
nodes_no_metadata = deepcopy(nodes)
for node in nodes_no_metadata:
    node.metadata = {
        key: value
        for key, value in node.metadata.items()
        if key in ["page_label", "file_name"]
    }

# Show what the LLM receives for the first stripped node.
print("LLM sees:\n", nodes_no_metadata[0].get_content(metadata_mode=MetadataMode.LLM))

LLM sees:
 The following scientific events occurred or were scheduled to occur in 2023.


== Events ==


=== January ===


=== February ===


=== March ===


=== April ===
3 April
Five employees at the National Hurricane Center publish a tropical cyclone report (TCR) on Hurricane Ian, which officially upgrades the hurricane from a Category 4 to a Category 5 on the Saffir–Simpson scale. The TCR also stated that Hurricane Ian caused, with 90% confidence, $112.9 billion worth of damage to the United States, which made Ian the third-costliest United States hurricane on record as well as the costliest hurricane to strike Florida on record.
An unexplained rise of emissions of five chlorofluorocarbons (CFCs), successfully banned by the Montreal Protocol of 1989, is reported. Their climate impact in 2020 is roughly equivalent to that of the CO2e from Denmark in 2018.
A study affirms and explains why a moderate decrease in body temperature extends lifespan.
5 April
The NOAA reports that greenho

In [49]:
from llama_index import VectorStoreIndex
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.tools import QueryEngineTool, ToolMetadata

In [52]:
# Vector index over the metadata-stripped nodes, using the shared
# service context (PaLM LLM + PaLM embeddings).
index_no_metadata = VectorStoreIndex(
    nodes=nodes_no_metadata,
    service_context=service_context)
# Query engine over that index, asking for up to 10 nodes per query.
# NOTE(review): the recorded run above produced a single node, so top_k=10 has
# no effect on this data.
engine_no_metadata = index_no_metadata.as_query_engine(
    similarity_top_k=10,
)

**********
Trace: index_construction
    |_embedding ->  1.74743 seconds
**********


## Our own question generator module

In [54]:
from llama_index.question_gen.llm_generators import LLMQuestionGenerator
from llama_index.question_gen.prompts import DEFAULT_SUB_QUESTION_PROMPT_TMPL


# Sub-question generator whose prompt is the library's default sub-question
# template with one extra instruction prepended: every generated sub-question
# should start with the fixed "By first identifying and quoting the most
# relevant sources, " prefix.
question_gen = LLMQuestionGenerator.from_defaults(
    service_context=service_context,
    prompt_template_str="""
        Follow the example, but instead of giving a question, always prefix the question 
        with: 'By first identifying and quoting the most relevant sources, '. 
        """
    + DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)

In [60]:
# Sub-question engine over the metadata-free index (the baseline).
# The tool name and description are what identify this tool to the question
# generator, so they must describe the data accurately: the indexed content is
# the Wikipedia article "2023 in science" loaded earlier in this notebook.
final_engine_no_metadata = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[
        QueryEngineTool(
            query_engine=engine_no_metadata,
            metadata=ToolMetadata(
                name="science_digest",
                description=(
                    "Scientific events of 2023, taken from the Wikipedia "
                    "article '2023 in science'"
                ),
            ),
        )
    ],
    question_gen=question_gen,
    use_async=True,
)

In [62]:
# response_no_metadata = final_engine_no_metadata.query(
#     """
#     What was the hurricane and how did it affect lives?
#     """
# )
# print(response_no_metadata.response)

In [67]:
## Querying over the extracted metadata

In [68]:
# Vector index over the metadata-enriched nodes produced by the ingestion
# pipeline, using the shared service context.
index = VectorStoreIndex(
    nodes=nodes,
    service_context=service_context)

# Query engine over that index, asking for up to 10 nodes per query.
engine = index.as_query_engine(
    similarity_top_k=10,
)

**********
Trace: index_construction
    |_embedding ->  0.498416 seconds
**********


In [69]:
# Sub-question engine over the metadata-enriched index.
# The tool name and description are what identify this tool to the question
# generator, so they must describe the data accurately: the indexed content is
# the Wikipedia article "2023 in science" loaded earlier in this notebook.
final_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=[
        QueryEngineTool(
            query_engine=engine,
            metadata=ToolMetadata(
                name="science_digest",
                description=(
                    "Scientific events of 2023, taken from the Wikipedia "
                    "article '2023 in science'"
                ),
            ),
        )
    ],
    question_gen=question_gen,
    use_async=True,
)

In [71]:
# response = final_engine.query(
#     """
#     What was the hurricane and how did it affect lives?
#     """
# )
# print(response.response)