In [27]:
%pip install -qq llama-index python-dotenv beautifulsoup4 requests openai

Note: you may need to restart the kernel to use updated packages.


# Demo

In [28]:
import os 
import pprint
import requests

import dotenv 


dotenv.load_dotenv()

True

In [29]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

In [30]:
number_of_books = 10
for i in range(1, number_of_books+1):
    url = f'https://www.gutenberg.org/cache/epub/{i}/pg{i}.txt'
    response = requests.get(url)
    if response.status_code == 200:
        with open(f'./data/pg{i}.txt', 'w') as f:
            f.write(response.text)
    else:
        print(f'Failed to download {url}')

In [31]:
docs = SimpleDirectoryReader('data').load_data()
index = VectorStoreIndex.from_documents(docs)

In [32]:
query_engine = index.as_query_engine()

In [33]:
question = "What does the congress of the US consists of?"
response = query_engine.query(question)

In [34]:
# query_engine.query??

In [35]:
pprint.pprint(response)

Response(response='The Congress of the United States consists of a Senate and '
                  'House of Representatives.',
         source_nodes=[NodeWithScore(node=TextNode(id_='d4e0c0e1-f428-4d67-92ed-ebed4c6ad7e9', embedding=None, metadata={'file_path': 'c:\\Users\\User\\Desktop\\Development\\llama_index_demo\\data\\pg1.txt', 'file_name': 'pg1.txt', 'file_type': 'text/plain', 'file_size': 121266, 'creation_date': '2024-07-12', 'last_modified_date': '2024-07-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c074fca4-e931-4e29-8cd9-e68751217e99', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'c:\\Users\\User\\Desktop\\Development\\llama_index_demo\\data\\pg1.txt', 'file_name': 'pg1.t

# Token / Character / Docs Count

In [36]:
import tiktoken

data_directory = './data'
encoder = tiktoken.get_encoding('cl100k_base')

total_token = 0
total_file_size = 0
total_char_size = 0
for file_name in os.listdir(data_directory):
    file_path = os.path.join(data_directory, file_name)
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        token_count = len(encoder.encode(text))
        print(f"File: {file_name}, Token Count: {token_count}, FileSize: {os.path.getsize(file_path)}")
        total_token += token_count
        total_file_size += os.path.getsize(file_path)
        total_char_size += len(text)

print(f"Total token count: {total_token}")
print(f"Total char size: {total_char_size}")
print(f"Total file size: {total_file_size}")
print(f"Average token size: {total_file_size/total_token}")

File: pg1.txt, Token Count: 25679, FileSize: 121266
File: pg10.txt, Token Count: 1155915, FileSize: 4555961
File: pg2.txt, Token Count: 4803, FileSize: 23631
File: pg3.txt, Token Count: 5891, FileSize: 28162
File: pg4.txt, Token Count: 4629, FileSize: 22427
File: pg5.txt, Token Count: 10292, FileSize: 49573
File: pg6.txt, Token Count: 5795, FileSize: 27422
File: pg7.txt, Token Count: 4599, FileSize: 22363
File: pg8.txt, Token Count: 5119, FileSize: 24825
File: pg9.txt, Token Count: 8776, FileSize: 42269
Total token count: 1231498
Total char size: 4805027
Total file size: 4917899
Average token size: 3.9934283287508383


# Settings

In [37]:
from llama_index.core import Settings
pprint.pprint(Settings)

_Settings(_llm=OpenAI(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000002089C903E10>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x000002089824DE40>, completion_to_prompt=<function default_completion_to_prompt at 0x00000208983007C0>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='gpt-3.5-turbo', temperature=0.1, max_tokens=None, logprobs=None, top_logprobs=0, additional_kwargs={}, max_retries=3, timeout=60.0, default_headers=None, reuse_client=True, api_key='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', api_base='https://api.openai.com/v1', api_version=''),
          _embed_model=OpenAIEmbedding(model_name='text-embedding-ada-002', embed_batch_size=100, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000002089C903E10>, num_workers=None, additional_kwargs={}, api_key='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'

# Doc Store

In [38]:
dir(index.docstore)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_aget_ref_doc_id',
 '_aremove_from_ref_doc_node',
 '_async_prepare_kv_pairs',
 '_batch_size',
 '_get_kv_pairs_for_insert',
 '_get_ref_doc_id',
 '_kvstore',
 '_merge_ref_doc_kv_pairs',
 '_metadata_collection',
 '_metadata_collection_suffix',
 '_namespace',
 '_node_collection',
 '_node_collection_suffix',
 '_prepare_kv_pairs',
 '_ref_doc_collection',
 '_ref_doc_collection_suffix',
 '_remove_from_ref_doc_node',
 '_remove_legacy_info',
 'add_documents',
 'adelete_document',
 'adelete_ref_doc',
 'adocument_exists',
 'aget_all_document_hashes',
 'aget_all_ref

In [39]:
for k, v in index.docstore.docs.items():
    print('key>>>', k)
    print('len(v)>>>', len(v.text))
    print('tokens >>>', len(encoder.encode(v.text)))
    print('value>>>', v.text)
    break

key>>> 97981709-b6bd-458f-ab4c-4091a028134b
len(v)>>> 4371
tokens >>> 969
value>>> ﻿The Project Gutenberg eBook of The Declaration of Independence of the United States of America
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Declaration of Independence of the United States of America

Author: Thomas Jefferson

Release date: December 1, 1971 [eBook #1]
                Most recently updated: January 1, 2021

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK THE DECLARATION OF INDEPENDENCE OF THE UNITED STATES OF AMERICA ***



December, 1971  [Etext #1]


The Project Gu

In [40]:
v.text

'\ufeffThe Project Gutenberg eBook of The Declaration of Independence of the United States of America\r\r\n    \r\r\nThis ebook is for the use of anyone anywhere in the United States and\r\r\nmost other parts of the world at no cost and with almost no restrictions\r\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\r\nof the Project Gutenberg License included with this ebook or online\r\r\nat www.gutenberg.org. If you are not located in the United States,\r\r\nyou will have to check the laws of the country where you are located\r\r\nbefore using this eBook.\r\r\n\r\r\nTitle: The Declaration of Independence of the United States of America\r\r\n\r\r\nAuthor: Thomas Jefferson\r\r\n\r\r\nRelease date: December 1, 1971 [eBook #1]\r\r\n                Most recently updated: January 1, 2021\r\r\n\r\r\nLanguage: English\r\r\n\r\r\n\r\r\n\r\r\n*** START OF THE PROJECT GUTENBERG EBOOK THE DECLARATION OF INDEPENDENCE OF THE UNITED STATES OF AMERICA ***\r\r\n\r\r\n\r\r\n\

In [41]:
from openai import OpenAI
client = OpenAI()

def get_embedding(text, model="text-embedding-ada-002"):
   # text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

em_openai = get_embedding(v.text)

In [42]:
em_vs = index.vector_store.get(k)

In [43]:
from numpy import dot
from numpy.linalg import norm

cos_sim = dot(em_vs, em_openai)/(norm(em_vs)*norm(em_openai))
cos_sim

0.9749907379798458

# vector store

In [44]:
vs = index.vector_store

In [45]:
for k, v in vs.data.metadata_dict.items():
    print(k)
    pprint.pprint(v)
    break

97981709-b6bd-458f-ab4c-4091a028134b
{'_node_type': 'TextNode',
 'creation_date': '2024-07-12',
 'doc_id': 'c074fca4-e931-4e29-8cd9-e68751217e99',
 'document_id': 'c074fca4-e931-4e29-8cd9-e68751217e99',
 'file_name': 'pg1.txt',
 'file_path': 'c:\\Users\\User\\Desktop\\Development\\llama_index_demo\\data\\pg1.txt',
 'file_size': 121266,
 'file_type': 'text/plain',
 'last_modified_date': '2024-07-13',
 'ref_doc_id': 'c074fca4-e931-4e29-8cd9-e68751217e99'}


In [46]:
for k, v in vs.data.embedding_dict.items():
    print(k, v)
    break

97981709-b6bd-458f-ab4c-4091a028134b [0.007635970134288073, -0.009355495683848858, -0.029040886089205742, -0.04114125669002533, -0.007018214091658592, 0.016456501558423042, -0.014240223914384842, -0.00021772703621536493, -0.017857598140835762, 3.90077693737112e-05, 0.0021080116275697947, 0.023958731442689896, 0.004063176456838846, -0.0026668577920645475, 0.0018532670801505446, 0.027206724509596825, 0.014316647313535213, -0.008725003339350224, 0.013297668658196926, -0.012851865962147713, -0.010005095042288303, -0.00021454272791743279, 0.0008215513662435114, 0.027767162770032883, 0.02122022584080696, -0.008024455048143864, 0.021156540140509605, -0.03484906256198883, 0.024149790406227112, -0.02110559120774269, -0.009546554647386074, -0.001092217513360083, -0.009864985011518002, -0.019322378560900688, -0.04093746095895767, -0.02212456986308098, -0.03143548592925072, -0.007737867999821901, 0.0013214877108111978, -0.015768691897392273, 0.006712520495057106, -0.013998216949403286, -0.00989045

In [47]:
vs.get(k)

[0.007635970134288073,
 -0.009355495683848858,
 -0.029040886089205742,
 -0.04114125669002533,
 -0.007018214091658592,
 0.016456501558423042,
 -0.014240223914384842,
 -0.00021772703621536493,
 -0.017857598140835762,
 3.90077693737112e-05,
 0.0021080116275697947,
 0.023958731442689896,
 0.004063176456838846,
 -0.0026668577920645475,
 0.0018532670801505446,
 0.027206724509596825,
 0.014316647313535213,
 -0.008725003339350224,
 0.013297668658196926,
 -0.012851865962147713,
 -0.010005095042288303,
 -0.00021454272791743279,
 0.0008215513662435114,
 0.027767162770032883,
 0.02122022584080696,
 -0.008024455048143864,
 0.021156540140509605,
 -0.03484906256198883,
 0.024149790406227112,
 -0.02110559120774269,
 -0.009546554647386074,
 -0.001092217513360083,
 -0.009864985011518002,
 -0.019322378560900688,
 -0.04093746095895767,
 -0.02212456986308098,
 -0.03143548592925072,
 -0.007737867999821901,
 0.0013214877108111978,
 -0.015768691897392273,
 0.006712520495057106,
 -0.013998216949403286,
 -0.009

In [48]:
len(vs.get(k))

1536

# Query 

In [49]:
from llama_index.core.vector_stores import VectorStoreQuery
from openai import OpenAI

print(question)
em_query = get_embedding(question)
print(em_query)

What does the congress of the US consists of?
[0.005216749384999275, 0.010167470201849937, -0.010874715633690357, -0.021814316511154175, -0.01628611981868744, 0.01076441165059805, -0.005677432287484407, -0.005070758052170277, -0.012049132958054543, 0.0032085601706057787, 0.002644061343744397, 0.00940831657499075, -0.029302038252353668, -0.014404457062482834, -0.008324737660586834, 0.005625524092465639, 0.018777702003717422, -0.0029084673151373863, 0.0038119901437312365, -0.03913859650492668, 0.00546006765216589, -0.01661054417490959, -0.019841814413666725, 0.02543489634990692, -0.0015207407996058464, 0.008778932504355907, 0.03599816560745239, -0.0020098111126571894, 0.0006306004943326116, -0.00020945670257788152, 0.01776549592614174, -0.006657194811850786, -0.020698295906186104, -0.028237925842404366, -0.0008564812014810741, 0.001852464978583157, -0.002637572819367051, 0.006190023384988308, -0.00691024586558342, -0.015935741364955902, 0.03438901901245117, 0.0024591393303126097, -0.0103

In [50]:
vs.query(VectorStoreQuery(query_embedding=[em_openai]))

VectorStoreQueryResult(nodes=None, similarities=[array([0.97499074])], ids=['97981709-b6bd-458f-ab4c-4091a028134b'])

In [51]:
ds = index.docstore
print(ds.get_node(k).text)

The Project Gutenberg eBook of The Declaration of Independence of the United States of America
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

Title: The Declaration of Independence of the United States of America

Author: Thomas Jefferson

Release date: December 1, 1971 [eBook #1]
                Most recently updated: January 1, 2021

Language: English



*** START OF THE PROJECT GUTENBERG EBOOK THE DECLARATION OF INDEPENDENCE OF THE UNITED STATES OF AMERICA ***



December, 1971  [Etext #1]


The Project Gutenberg Etext of The Declaration of Independence.

All of the original Project Guten

In [52]:
from llama_index.core.schema import QueryBundle


# Step 1: Generate embeddings for the query
em_query = get_embedding(question)

# Step 2: Create a QueryBundle
query_bundle = QueryBundle(question)

# Step 3: Retrieve nodes using the retriever
retriver = index.as_retriever()
nodes = retriver.retrieve(query_bundle)

In [53]:
get_embedding(question) == query_bundle.embedding

True

In [54]:
query_engine.query(question).source_nodes == retriver.retrieve(query_bundle)

True

In [55]:
retriver.retrieve(query_bundle)

[NodeWithScore(node=TextNode(id_='d4e0c0e1-f428-4d67-92ed-ebed4c6ad7e9', embedding=None, metadata={'file_path': 'c:\\Users\\User\\Desktop\\Development\\llama_index_demo\\data\\pg1.txt', 'file_name': 'pg1.txt', 'file_type': 'text/plain', 'file_size': 121266, 'creation_date': '2024-07-12', 'last_modified_date': '2024-07-13'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c074fca4-e931-4e29-8cd9-e68751217e99', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'c:\\Users\\User\\Desktop\\Development\\llama_index_demo\\data\\pg1.txt', 'file_name': 'pg1.txt', 'file_type': 'text/plain', 'file_size': 121266, 'creation_date': '2024-07-12', 'last_modified_date': '2024-07-13'}, hash='48089749d018ec43f241ef

In [56]:
# retriver.retrieve??

In [57]:
vs.query(VectorStoreQuery(query_embedding=[em_query], similarity_top_k=2))

VectorStoreQueryResult(nodes=None, similarities=[array([0.84504648]), array([0.83960497])], ids=['d4e0c0e1-f428-4d67-92ed-ebed4c6ad7e9', '6feda703-cbd7-4b24-b9ce-c15660a40c38'])

In [58]:
dict_similarity = {}
for k, v in vs.data.embedding_dict.items():
    cos_sim = dot(v, em_query)/(norm(v)*norm(em_query))
    dict_similarity[k] = cos_sim

sorted_dict_similarity = dict(sorted(dict_similarity.items(), key=lambda x: x[1], reverse=True))
sorted_dict_similarity

{'d4e0c0e1-f428-4d67-92ed-ebed4c6ad7e9': 0.845046477613661,
 '6feda703-cbd7-4b24-b9ce-c15660a40c38': 0.8396049689371684,
 '24c580a6-dc2d-4f48-8815-86ab7cf46b69': 0.8353611197677153,
 '8e0307e7-2f60-431c-915d-c3e2070ec859': 0.8321014592088639,
 '6f74a818-f3c1-4d53-ac81-5a70d855b619': 0.823427257609029,
 'fd6de816-a7e7-435a-918a-e356d4b664a0': 0.8190166544561256,
 'f26cf6ea-794a-4e83-8629-ca2e953bc251': 0.817566591258813,
 'bc6acaa5-fe57-43bb-bb1f-d0a02ae03972': 0.8175058624962146,
 '196cd451-1b16-4f16-87c4-78cf933d037b': 0.8159881522395901,
 '0332331b-4ef8-49ea-a9c4-e5940ddb3bc1': 0.8155900401352102,
 '4c2b0c41-7915-415f-849f-0f475602e2a7': 0.8125749943608528,
 '528ff74f-35cd-4ef4-8528-d99fce34abf1': 0.8113099345341199,
 'b7572a4c-5c2d-47f5-8c81-76ebc3d97418': 0.8073470944471944,
 'd55b389a-0234-408e-9c7f-3a58391e259d': 0.8064870928018922,
 '1770791c-290e-4dd6-b5be-986b164a6022': 0.8058199563901074,
 '907af2f8-bfd6-459f-9a49-c459dd1032dc': 0.8029180036120694,
 'ac034cfe-2c28-4aeb-a011-7

In [59]:
from llama_index.core import get_response_synthesizer

response_synthesizer = get_response_synthesizer()