
# LlamaIndex - Advanced RAG - video

Useful links:
- https://www.llamaindex.ai/


In [None]:
%%capture
!pip install llama-index >> null
!pip install openai >> null
!pip install pypdf >> null   # for reading PDF files
!pip install docx2txt > null # for reading MS doc files

In [None]:
import os
import openai

import logging
import sys
from pprint import pprint

# Send INFO-level log records to stdout through exactly one root handler.
# force=True replaces any handlers the host runtime already installed (otherwise
# basicConfig silently does nothing), and no second stdout handler is attached,
# so each record is printed once instead of twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    load_index_from_storage,
    StorageContext,
    ServiceContext,
    Document
)

from llama_index.llms import OpenAI, Anthropic
from llama_index.node_parser import SentenceWindowNodeParser, HierarchicalNodeParser, get_leaf_nodes
from llama_index.text_splitter import SentenceSplitter
from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.schema import MetadataMode
from llama_index.postprocessor import MetadataReplacementPostProcessor

# from IPython.display import Markdown, display
# from transformers import AutoTokenizer, T5ForConditionalGeneration

# Step 0:  Authentication with Org ID and API Key

In [None]:
# Never paste the key into the notebook: read it from the OPENAI_API_KEY
# environment variable, and fall back to a hidden interactive prompt.
from getpass import getpass

openai_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
#org_ID = "xxxxxxxxxxxx" #<--- Your Organization ID

In [None]:
# Hand the key to the openai package through its module-level `api_key` attribute.
openai.api_key = openai_key

# Step 1:  Fetch Data and Store into local directory

In [None]:
# create local directory and retrieve file from external source
# Both sources are Project Gutenberg plain-text ebooks (IDs 72306 and 11367);
# they are saved as teahistory.txt and chinahistory.txt inside ./my_data.
!mkdir -p 'my_data'
!wget 'https://www.gutenberg.org/cache/epub/72306/pg72306.txt' -O './my_data/teahistory.txt'
!wget 'https://www.gutenberg.org/cache/epub/11367/pg11367.txt' -O './my_data/chinahistory.txt'

--2023-12-16 10:48:18--  https://www.gutenberg.org/cache/epub/72306/pg72306.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493827 (482K) [text/plain]
Saving to: ‘./my_data/teahistory.txt’


2023-12-16 10:48:19 (1.38 MB/s) - ‘./my_data/teahistory.txt’ saved [493827/493827]

--2023-12-16 10:48:19--  https://www.gutenberg.org/cache/epub/11367/pg11367.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977274 (954K) [text/plain]
Saving to: ‘./my_data/chinahistory.txt’


2023-12-16 10:48:19 (2.46 MB/s) - ‘./my_data/chinahistory.txt’ saved [977274/977274]



# Step 2:  Load files into "Document" Objects

In [None]:
 # Read the files under ./my_data/ into a list of Document objects.
 documents = SimpleDirectoryReader("./my_data/").load_data()

# Step 2B (Optional):  Inspect the documents object

In [None]:
# Inspect the documents
# Each Document holds the full text of a book, so show only the metadata and a
# short preview of the text rather than pretty-printing the whole list.
print(f"length of doc: {len(documents)}")
print("----")
for doc in documents:
    pprint(doc.metadata)
    print(repr(doc.text[:300]))
    print("----")


length of doc: 2
----
[Document(id_='d479241a-fc72-4523-b296-340c48178b63', embedding=None, metadata={'file_path': 'my_data/chinahistory.txt', 'file_name': 'chinahistory.txt', 'file_type': 'text/plain', 'file_size': 977274, 'creation_date': '2023-12-16', 'last_modified_date': '2023-12-05', 'last_accessed_date': '2023-12-16'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, hash='57e5f35ff2c3d7a3ee93c25f7064f3f58d64abf3a605d5652177f004bab7810f', text='\ufeffThe Project Gutenberg eBook of A History of China\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License in

In [None]:
# A cell only displays its last expression, so gather the metadata of every
# loaded document into one list instead of writing one expression per line.
[doc.metadata for doc in documents]

{'file_path': 'my_data/teahistory.txt',
 'file_name': 'teahistory.txt',
 'file_type': 'text/plain',
 'file_size': 493827,
 'creation_date': '2023-12-16',
 'last_modified_date': '2023-12-04',
 'last_accessed_date': '2023-12-16'}

# Step 3:  Node Parsing & Indexing (Base & Sentence Window Method)

In [None]:
# LLM shared by both pipelines built below.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")

# Baseline parser: SentenceSplitter with its default settings.
base_node_parser = SentenceSplitter()

# Sentence-window parser: the surrounding window is stored under the "window"
# metadata key and the sentence itself under "original_text".
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)


[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Parse the same documents with both parsers so the two node sets can be compared.
base_nodes = base_node_parser.get_nodes_from_documents(documents)  # baseline chunks
nodes = sentence_node_parser.get_nodes_from_documents(documents)  # sentence-window nodes

In [None]:
# Print node #100 from each parser under a banner naming the parser.
for banner, sample_node in (("SENTENCE NODES", nodes[100]), ("BASE NODES", base_nodes[100])):
    print("---------")
    print(banner)
    print("---------")
    print(sample_node)

---------
SENTENCE NODES
---------
Node ID: dc609cf2-6abe-4029-ac06-6a5c69cd7c94
Text: We have no desire to show that China's history is the most
glorious or her civilization the oldest in the world.
---------
BASE NODES
---------
Node ID: d84f172e-d135-4452-b23d-b6a2665fc33b
Text: This one fact alone demonstrates that the Hsia rejected Chinese
culture and were nationalistic Hun. Thus there were now two realms in
North China, one undergoing progressive sinification, the other
falling back to the old traditions of the Huns.  3 _Rise of the Toba
to a great Power_  The present province of Szechwan, in the west, had
belonged t...


In [None]:
# View the same base node as a plain dict to see all of its fields
# (id, embedding, metadata, excluded metadata keys, relationships, ...).
dict(base_nodes[100])

{'id_': 'd84f172e-d135-4452-b23d-b6a2665fc33b',
 'embedding': None,
 'metadata': {'file_path': 'my_data/chinahistory.txt',
  'file_name': 'chinahistory.txt',
  'file_type': 'text/plain',
  'file_size': 977274,
  'creation_date': '2023-12-16',
  'last_modified_date': '2023-12-05',
  'last_accessed_date': '2023-12-16'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d479241a-fc72-4523-b296-340c48178b63', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'my_data/chinahistory.txt', 'file_name': 'chinahistory.txt', 'file_type': 'text/plain', 'file_size': 977274, 'creation_date': '2023-12-16', 'last_modified_date': '2023-12-05', 'last_accessed_date': '2023-12-16'}, has

In [None]:
# One service context per pipeline: same LLM, a fresh OpenAI embedding model
# (batches of 50) for each, and the parser that produced that pipeline's nodes.
ctx_sentence = ServiceContext.from_defaults(
    llm=llm,
    embed_model=OpenAIEmbedding(embed_batch_size=50),
    node_parser=sentence_node_parser,
)
ctx_base = ServiceContext.from_defaults(
    llm=llm,
    embed_model=OpenAIEmbedding(embed_batch_size=50),
    node_parser=base_node_parser,
)

# Build a vector index over each pre-parsed node set.
sentence_index = VectorStoreIndex(nodes, service_context=ctx_sentence)
base_index = VectorStoreIndex(base_nodes, service_context=ctx_base)

# Step 4:  Save to Persistent Storage

In [None]:
# Write each index's storage context to its own directory.
for built_index, target_dir in (
    (sentence_index, "./sentence_index"),
    (base_index, "./base_index"),
):
    built_index.storage_context.persist(persist_dir=target_dir)


In [None]:
# Download to own computer for backup

!zip -r ./indexes.zip ./*_index

from google.colab import files
files.download("./indexes.zip")

  adding: base_index/ (stored 0%)
  adding: base_index/index_store.json (deflated 68%)
  adding: base_index/graph_store.json (stored 0%)
  adding: base_index/docstore.json (deflated 76%)
  adding: base_index/image__vector_store.json (deflated 19%)
  adding: base_index/default__vector_store.json (deflated 62%)
  adding: sentence_index/ (stored 0%)
  adding: sentence_index/index_store.json (deflated 68%)
  adding: sentence_index/graph_store.json (stored 0%)
  adding: sentence_index/docstore.json (deflated 94%)
  adding: sentence_index/image__vector_store.json (deflated 19%)
  adding: sentence_index/default__vector_store.json (deflated 63%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Step 5:  Retrieve from Storage

In [None]:
# Recreate a storage context from each directory persisted in Step 4.
SC_retrieved_base = StorageContext.from_defaults(
    persist_dir="./base_index",
)
SC_retrieved_sentence = StorageContext.from_defaults(
    persist_dir="./sentence_index",
)

In [None]:
# load index
# Pass the service contexts the indexes were built with, so the reloaded
# indexes use the configured LLM and embedding model rather than falling back
# to the library's global defaults.
retrieved_sentence_index = load_index_from_storage(
    SC_retrieved_sentence, service_context=ctx_sentence
)
retrieved_base_index = load_index_from_storage(
    SC_retrieved_base, service_context=ctx_base
)

# Step 6: Create query engine

In [None]:
from llama_index.postprocessor import MetadataReplacementPostProcessor

# Baseline engine: plain top-5 similarity retrieval over the reloaded base index.
base_query_engine = retrieved_base_index.as_query_engine(verbose=True, similarity_top_k=5)

# Sentence-window engine: same top-5 retrieval, plus a post-processor that is
# pointed at the "window" metadata key written by the sentence-window parser.
sentence_query_engine = retrieved_sentence_index.as_query_engine(
    verbose=True,
    similarity_top_k=5,
    node_postprocessors=[MetadataReplacementPostProcessor(target_metadata_key="window")],
)

# Step 7:  Inference

In [None]:
# Multi-step test question: the year has to be derived ("10 years after the first
# American ships sailed for China") before the event in that year can be looked up.
question = "Something happened in the United States 10 years after the first American ships sailed for China which could have made it more expensive to purchase tea. what happened that year? Try to break down your answer into steps."

In [None]:
# Ask the baseline engine and show its answer.
base_response = base_query_engine.query(question)
print(base_response)

1. American ships sailed for China in 1784 and brought back a significant amount of Tea.
2. In the following years, additional ships brought even more Tea to the United States.
3. The earliest official record of Tea importation into the United States was made in 1790.
4. Over time, the importation, value, and consumption of Tea in the United States increased.
5. In 1794, the rates of duty on imported Tea were significantly increased, potentially making it more expensive to purchase Tea in the United States.
6. This increase in duty could have had an impact on the cost of purchasing Tea in the United States.


In [None]:
# Ask the sentence-window engine the same question and show its answer.
sentence_response = sentence_query_engine.query(question)
print(sentence_response)

Step 1: The first American ships sailed for China in 1784, bringing back 880,000 pounds of Tea.

Step 2: During 1786-87, five other ships brought to the United States over 1,000,000 pounds of Tea.

Step 3: In 1794, the rates of duty levied on tea by the United States were increased by 75 percent on direct importations and 100 percent.

Step 4: Therefore, 10 years after the first American ships sailed for China, in 1794, the rates of duty on tea were increased, which could have made it more expensive to purchase tea in the United States.


In [None]:
# Compare, for the top-ranked source node, the stored context window with the
# single sentence it was built around.
top_hit_metadata = sentence_response.source_nodes[0].node.metadata
window = top_hit_metadata["window"]
sentence = top_hit_metadata["original_text"]

print(f"Window: {window}")
print("------------------")
print(f"Original Sentence: {sentence}")

Window: The East India Company enjoyed a monopoly of the trade in Tea up to
1834, when, owing to the methods of calculation adopted by the Company,
and the heavier expenses which always attend every department of a trade
monopoly, the prices were greatly enhanced.  Much dissatisfaction
prevailing with its management, this system of importing Teas was
abolished, the Company being deprived of its exclusive privileges, and
the Tea trade thrown open to all.

 In all probability Tea first reached America from England, which country
began to export in 1711, but it is claimed to have been previously
introduced by some Dutch smugglers, no definite date being given.  The
first American ship sailed for China in 1784, two more vessels being
dispatched the following year, bringing back 880,000 pounds of Tea.
 During 1786-87, five other ships brought to the United States over
1,000,000 pounds.  In 1844, the “Howqua” and “Montauk” were built
expressly for the Tea trade, being the first of the class 

In [None]:
# List the original sentence behind every retrieved source node, one per block.
for source_node in sentence_response.source_nodes:
    print(source_node.node.metadata["original_text"], "--------", sep="\n")

The
first American ship sailed for China in 1784, two more vessels being
dispatched the following year, bringing back 880,000 pounds of Tea.

--------
But in order to stimulate
American shipping these duties were reduced to 8, 13 and 26 cents
respectively, the following year, when imported from Europe in American
vessels, and to 6, 10 and 20 cents when imported direct from China in
the same manner. 
--------
--------------


In 1858 the United States Government ordered and received about 10,000
tea-plants from China in Wardian cases in which the seeds were sown just
previous to shipment, many of them germinated during the voyage, the
plants averaging 18 inches in height on their arrival in this country.

--------
The quantity of China and
Japan teas consumed in the whole United Kingdom declining to about
50,000,000 pounds in 1890, although the prices for them were exceedingly
low during that period. 
--------
Up to 1856 China tea was the only tea used in the United States, but
during t

In [None]:
from llama_index.evaluation import (
    DatasetGenerator,
    QueryResponseDataset,
)
from llama_index import ServiceContext
from llama_index.llms import OpenAI
import nest_asyncio
import random

# nest_asyncio patches asyncio to allow nested event loops; presumably needed so the
# async evaluation helpers below can run inside the notebook's own loop.
nest_asyncio.apply()

In [None]:
# Number of chunks produced by the baseline SentenceSplitter parser.
len(base_nodes)

422

In [None]:
num_nodes_eval = 30
# Draw the evaluation chunks from the first 200 base nodes only
# (len(base_nodes) is 422 for the two books downloaded above).
eval_candidate_nodes = base_nodes[:200]
# A dedicated, seeded RNG makes the sample identical on every Restart & Run All
# without touching the global random state; min() keeps this working when
# fewer candidate nodes than num_nodes_eval are available.
sample_eval_nodes = random.Random(42).sample(
    eval_candidate_nodes, min(num_nodes_eval, len(eval_candidate_nodes))
)
# NOTE: run this if the dataset isn't already saved
eval_service_context = ServiceContext.from_defaults(llm=OpenAI(model="gpt-3.5-turbo"))
# generate two questions for each sampled chunk
dataset_generator = DatasetGenerator(
    sample_eval_nodes,
    service_context=eval_service_context,
    show_progress=True,
    num_questions_per_chunk=2,
)

  dataset_generator = DatasetGenerator(


In [None]:
# Generate the question/response evaluation dataset from the sampled nodes,
# using the generator (and therefore eval_service_context) configured above.
eval_dataset = await dataset_generator.agenerate_dataset_from_nodes()

100%|██████████| 30/30 [00:03<00:00,  8.11it/s]
100%|██████████| 2/2 [00:08<00:00,  4.10s/it]
100%|██████████| 2/2 [00:02<00:00,  1.46s/it]
100%|██████████| 2/2 [00:03<00:00,  1.91s/it]
100%|██████████| 2/2 [00:09<00:00,  4.92s/it]
100%|██████████| 2/2 [00:04<00:00,  2.33s/it]
100%|██████████| 2/2 [00:04<00:00,  2.44s/it]
100%|██████████| 2/2 [00:04<00:00,  2.47s/it]
100%|██████████| 2/2 [00:03<00:00,  1.70s/it]
100%|██████████| 2/2 [00:06<00:00,  3.33s/it]
100%|██████████| 2/2 [00:02<00:00,  1.41s/it]
100%|██████████| 2/2 [00:04<00:00,  2.39s/it]
100%|██████████| 2/2 [00:04<00:00,  2.44s/it]
100%|██████████| 2/2 [00:03<00:00,  1.62s/it]
100%|██████████| 2/2 [00:08<00:00,  4.25s/it]
100%|██████████| 2/2 [00:05<00:00,  2.50s/it]
100%|██████████| 2/2 [00:06<00:00,  3.48s/it]
100%|██████████| 2/2 [00:09<00:00,  4.94s/it]
100%|██████████| 2/2 [00:03<00:00,  1.92s/it]
100%|██████████| 2/2 [00:05<00:00,  2.72s/it]
100%|██████████| 2/2 [00:06<00:00,  3.40s/it]
100%|██████████| 2/2 [00:04<00:0

In [None]:
# Nothing earlier in the notebook creates "data/", so make sure the folder
# exists before writing the dataset into it.
os.makedirs("data", exist_ok=True)
eval_dataset.save_json("data/ipcc_eval_qr_dataset.json")

In [None]:
# optional
# Reload the dataset from the JSON file written by the previous cell, e.g. to
# reuse saved questions instead of regenerating them.
eval_dataset = QueryResponseDataset.from_json("data/ipcc_eval_qr_dataset.json")

  return cls(**data)


In [None]:
import asyncio
import nest_asyncio

# Applied again here (it is also applied in an earlier cell); presumably so this
# cell can be run on its own — TODO confirm whether the repeat is needed.
nest_asyncio.apply()

from llama_index.evaluation import (
    CorrectnessEvaluator,
    SemanticSimilarityEvaluator,
    RelevancyEvaluator,
    FaithfulnessEvaluator,
    PairwiseComparisonEvaluator,
)


from collections import defaultdict
import pandas as pd

# One evaluator per metric, all sharing eval_service_context.
evaluator_f = FaithfulnessEvaluator(service_context=eval_service_context)
evaluator_r = RelevancyEvaluator(service_context=eval_service_context)
evaluator_s = SemanticSimilarityEvaluator(service_context=eval_service_context)
evaluator_c = CorrectnessEvaluator(service_context=eval_service_context)
# NOTE: the pairwise evaluator is left disabled; uncomment to enable it
# pairwise_evaluator = PairwiseComparisonEvaluator(service_context=eval_service_context)

In [None]:
from llama_index.evaluation.eval_utils import get_responses, get_results_df
from llama_index.evaluation import BatchEvalRunner

# Upper bound on how many generated questions are used in the evaluation below.
max_samples = 30

# Reference answers (second item of every question/response pair) and questions.
ref_response_strs = [pair[1] for pair in eval_dataset.qr_pairs]
eval_qs = eval_dataset.questions

# Rebuild both engines on the in-memory indexes with top-2 retrieval.
# Sentence-window engine: target key "window" matches the node parser's setting.
query_engine = sentence_index.as_query_engine(
    node_postprocessors=[MetadataReplacementPostProcessor(target_metadata_key="window")],
    similarity_top_k=2,
)
# Baseline engine (rebinds the name used in Step 6).
base_query_engine = base_index.as_query_engine(similarity_top_k=2)

In [None]:
import numpy as np

# Baseline engine: answer the first max_samples questions, then keep string copies.
base_pred_responses = get_responses(eval_qs[:max_samples], base_query_engine, show_progress=True)
base_pred_response_strs = [str(response) for response in base_pred_responses]

# Sentence-window engine: same questions, same treatment.
pred_responses = get_responses(eval_qs[:max_samples], query_engine, show_progress=True)
pred_response_strs = [str(response) for response in pred_responses]

100%|██████████| 30/30 [00:12<00:00,  2.31it/s]
100%|██████████| 30/30 [00:54<00:00,  1.80s/it]


In [None]:
evaluator_dict = {
    "correctness": evaluator_c,
    "faithfulness": evaluator_f,
    "relevancy": evaluator_r,
    "semantic_similarity": evaluator_s,
}
batch_runner = BatchEvalRunner(evaluator_dict, workers=2, show_progress=True)


eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)

100%|██████████| 120/120 [01:04<00:00,  1.85it/s]


In [None]:
base_eval_results = await batch_runner.aevaluate_responses(
    queries=eval_qs[:max_samples],
    responses=base_pred_responses[:max_samples],
    reference=ref_response_strs[:max_samples],
)

100%|██████████| 120/120 [01:13<00:00,  1.63it/s]


In [None]:
# Summarise both runs in one table: one row per retriever, one column per metric.
compared_runs = [eval_results, base_eval_results]
run_labels = ["Sentence Window Retriever", "Base Retriever"]
metric_columns = ["correctness", "relevancy", "faithfulness", "semantic_similarity"]

results_df = get_results_df(compared_runs, run_labels, metric_columns)
display(results_df)

Unnamed: 0,names,correctness,relevancy,faithfulness,semantic_similarity
0,Sentence Window Retriever,3.716667,0.9,0.833333,0.957258
1,Base Retriever,3.95,0.766667,0.333333,0.968345
