In [21]:
import llama_index
llama_index.set_global_handler("simple")

In [29]:
import os

os.environ["OPENAI_API_KEY"] = "sk-..."

import logging
import sys

logging.basicConfig(
    stream=sys.stdout, level=logging.INFO
)  # logging.DEBUG for more verbose output

from llama_index import (
    KnowledgeGraphIndex,
    ServiceContext,
    SimpleDirectoryReader,
    SimpleKeywordTableIndex
)
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
from llama_index.llms import OpenAI

from IPython.display import Markdown, display
from llama_index.llms.palm import PaLM
from llama_index.embeddings import GooglePaLMEmbedding


from llama_index.callbacks import (
    CallbackManager,
    LlamaDebugHandler
)


from llama_index.retrievers import (
    KeywordTableSimpleRetriever
)


from llama_index.readers import WikipediaReader

from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    SQLDatabase,
)


from llama_index.node_parser import get_leaf_nodes, get_root_nodes


In [31]:
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

palm_api_key  = "AIzaSyApBCzqW_RF4qbkX9kMoNwjooIqrm8oZEQ"
model = PaLM(api_key=palm_api_key)

model_name = "models/embedding-gecko-001"
embed_model = GooglePaLMEmbedding(model_name=model_name, api_key=palm_api_key)

service_context = ServiceContext.from_defaults(
                                    llm = model,
                                    embed_model = embed_model,
                                    chunk_size=512,
                                    callback_manager=callback_manager)

In [32]:
from llama_index.node_parser import (
    HierarchicalNodeParser,
    SentenceSplitter,
)

In [33]:
node_parser = HierarchicalNodeParser.from_defaults()


In [34]:
node_parser.node_parser_map

{'chunk_size_2048': SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x0000019BF1F3E3D0>, id_func=<function default_id_func at 0x0000019BE92AD4C0>, chunk_size=2048, chunk_overlap=20, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?'),
 'chunk_size_512': SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x0000019BF1F3E3D0>, id_func=<function default_id_func at 0x0000019BE92AD4C0>, chunk_size=512, chunk_overlap=20, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?'),
 'chunk_size_128': SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x0000019BF1F3E3D0>, id_func=<function default_id_func at 0x0000019BE92AD4C0>, chunk_size=128, chunk_overlap=2

In [35]:
cities = ["Toronto", "Berlin", "Tokyo"]
wiki_docs = WikipediaReader().load_data(pages=cities)

In [38]:
nodes = node_parser.get_nodes_from_documents([wiki_docs[0]])

In [40]:
len(nodes)

238

In [42]:
leaf_nodes = get_leaf_nodes(nodes)
len(leaf_nodes)

184

In [57]:
len(leaf_nodes[5].text)

553

In [47]:
root_nodes = get_root_nodes(nodes)
len(root_nodes)

11

In [97]:
counter = 0
for i in leaf_nodes:
    if len(i.text)<512:
        print(len(i.text))
        counter += 1
print(counter)

427
337
373
499
466
423
270
361
480
292
504
312
387
102
480
466
303
440
482
296
405
466
211
259
476
412
263
401
183
510
178
426
251
491
363
457
428
428
466
357
484
370
344
488
150
423
441
263
362
496
211
330
429
240
340
359
481
358
436
359
266
138
325
450
225
421
202
474
467
351
297
440
441
433
21
240
199
374
422
387
470
475
363
314
479
445
383
461
469
201
467
333
410
359
437
456
436
387
396
415
325
298
209
432
484
377
501
107


In [94]:
for i in root_nodes:
    if len(i.text)>2048:
        print(len(i.text))

6636
5335
8352
8673
8142
8152
9698
8987
8034
7808
2131


In [52]:
len(root_nodes[1].text)

5335

In [59]:
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage import StorageContext


In [60]:
docstore = SimpleDocumentStore()

# insert nodes into docstore
docstore.add_documents(nodes)

# define storage context (will include vector store by default too)
storage_context = StorageContext.from_defaults(docstore=docstore)


In [63]:
len(docstore.docs)

238

In [64]:
## Load index into vector index
from llama_index import VectorStoreIndex

base_index = VectorStoreIndex(
    leaf_nodes,
    storage_context=storage_context,
    service_context=service_context,
)

**********
Trace: index_construction
    |_embedding ->  2.449107 seconds
    |_embedding ->  0.900787 seconds
    |_embedding ->  1.256966 seconds
    |_embedding ->  0.899054 seconds
    |_embedding ->  1.253397 seconds
    |_embedding ->  0.905029 seconds
    |_embedding ->  1.21318 seconds
    |_embedding ->  0.86224 seconds
    |_embedding ->  1.206991 seconds
    |_embedding ->  0.94571 seconds
    |_embedding ->  1.147054 seconds
    |_embedding ->  0.963931 seconds
    |_embedding ->  1.185967 seconds
    |_embedding ->  1.584169 seconds
    |_embedding ->  0.910062 seconds
    |_embedding ->  1.124424 seconds
    |_embedding ->  0.915089 seconds
    |_embedding ->  1.180759 seconds
    |_embedding ->  0.649494 seconds
**********


In [66]:
from llama_index.retrievers.auto_merging_retriever import AutoMergingRetriever


In [67]:
base_retriever = base_index.as_retriever(similarity_top_k=6)
retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)

In [83]:
# query_str = "What were some lessons learned from red-teaming?"
# query_str = "Can you tell me about the key concepts for safety finetuning"
query_str = (
    "Info about arts and culture of the tornto"
    " demographic of toronto"
)

nodes = retriever.retrieve(query_str)

In [84]:
base_nodes = base_retriever.retrieve(query_str)

**********
Trace: query
    |_retrieve ->  0.853332 seconds
      |_embedding ->  0.831201 seconds
**********


In [85]:
len(nodes)

6

In [86]:
from llama_index.response.notebook_utils import display_source_node

for node in nodes:
    display_source_node(node, source_length=10000)

**Node ID:** 95d9850e-988a-4c3e-bf1e-48e89991ce52<br>**Similarity:** 0.7562218590336226<br>**Text:** == Demographics ==

In the 2021 Census of Population conducted by Statistics Canada, Toronto had a population of 2,794,356 living in 1,160,892 of its 1,253,238 total private dwellings, a change of 2.3% from its 2016 population of 2,731,571.<br>

**Node ID:** 4c679530-77b6-4be5-ab37-4c0bdfa2f349<br>**Similarity:** 0.7530244922594963<br>**Text:** == Culture and contemporary life ==

Toronto's theatre and performing arts scene has more than fifty ballet and dance companies, six opera companies, two symphony orchestras and a host of theatres. The city is home to the National Ballet of Canada, the Canadian Opera Company, the Toronto Symphony Orchestra, the Canadian Electronic Ensemble, and the Canadian Stage Company.<br>

**Node ID:** b7c41d07-9b39-49c2-9765-647cca4a4446<br>**Similarity:** 0.744775821257333<br>**Text:** Lawrence Market neighbourhood. The Eaton Centre is Toronto's most popular tourist attraction with over 52 million visitors annually.
Greektown on the Danforth is home to the annual "Taste of the Danforth" festival which attracts over one million people in 2+1⁄2 days. Toronto is also home to Casa Loma, the former estate of Sir Henry Pellatt, a prominent Toronto financier, industrialist and military man. Other notable neighbourhoods and attractions in Toronto include The Beaches, the Toronto Islands, Kensington Market, Fort York, and the Hockey Hall of Fame.


== Education ==<br>

**Node ID:** 303792a2-5ced-4c36-9ba9-c5412fa901a3<br>**Similarity:** 0.7404163434325577<br>**Text:** Its varied cultural institutions, which include numerous museums and galleries, festivals and public events, entertainment districts, national historic sites, and sports activities, attract over 43 million tourists each year. Toronto is known for its many skyscrapers and high-rise buildings, in particular the tallest free-standing structure on land outside of Asia, the CN Tower.The city is home to the Toronto Stock Exchange, the headquarters of Canada's five largest banks, and the headquarters of many large Canadian and multinational corporations.<br>

**Node ID:** 62a8c8d6-a9e8-450d-a1b6-9a00e987ada4<br>**Similarity:** 0.7357389197110095<br>**Text:** Visible minorities are projected to increase to 63 per cent of the city's population by 2031.This diversity is reflected in Toronto's ethnic neighbourhoods, which include Chinatown, Corso Italia, Greektown, Kensington Market, Koreatown, Little India, Little Italy, Little Jamaica, Little Portugal and Roncesvalles (Polish community).


=== Religion ===
According to the 2021 census, religious groups in Toronto included:
Christianity (1,274,450 persons or 46.2%)
No religion and secular perspectives (845,615 persons or 30.6%)
Islam (264,<br>

**Node ID:** 49e69bfd-f475-42d2-af53-e50964dc9f73<br>**Similarity:** 0.731356053958129<br>**Text:** ==== Old Toronto ====
The pre-amalgamation City of Toronto covers the downtown core and also older neighbourhoods to the east, west, and north of it. It is the most densely populated part of the city. The Financial District contains the First Canadian Place, Toronto-Dominion Centre, Scotia Plaza, Royal Bank Plaza, Commerce Court and Brookfield Place. This area includes, among others, the neighbourhoods of St. James Town, Garden District, St. Lawrence, Corktown, and Church and Wellesley.<br>

In [88]:
leaf_nodes[0].relationships

{<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='3081ef16-ed73-4553-ae1e-939d1d1542c7', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='b82480b6aa4effe82b70fed0955014d5e51098a73e264863c93cae425b523438'),
 <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='cc5074a5-300d-4ea2-bff9-998a3ecc34ef', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='9544b163cf00946ce6d6028869db570dabbbcb0e4c839bf39d4e9e98bf810139'),
 <NodeRelationship.PARENT: '4'>: RelatedNodeInfo(node_id='3081ef16-ed73-4553-ae1e-939d1d1542c7', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='b82480b6aa4effe82b70fed0955014d5e51098a73e264863c93cae425b523438')}