In [29]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from config import *
from pprint import pprint

In [30]:
client = QdrantClient(URL,port=6333)

In [31]:
vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)

In [32]:
import fitz

In [33]:
doc = fitz.open(DIR_PATH + "/HIGHWAYS.pdf")

In [34]:
from llama_index.core.node_parser import SentenceSplitter

In [35]:
text_splitter = SentenceSplitter(
    chunk_size=512
)

In [36]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, page in enumerate(doc):
    page_text = page.get_text("text")
    cur_text_chunks = text_splitter.split_text(page_text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [37]:
len(doc_idxs)

20

In [38]:
pprint(text_chunks[1])

('NATIONAL HIGHWAYS DEVELOPMENT PROJECT\n'
 'Infrastructure is critical to improved productivity across all sectors. \n'
 'The \n'
 'current \n'
 'level \n'
 'of \n'
 'annual \n'
 'Gross \n'
 'Capital \n'
 'Formation \n'
 'in \n'
 'the \n'
 'infrastructure sectors is about 4.6% of GDP and needs to be ramped up \n'
 'to 8 % during the course of the Eleventh Five Year Plan. This would \n'
 'require a very significant scaling up of investment from the public as \n'
 'well as the private sectors. While public investment in infrastructure \n'
 'would continue to increase, the role of private participation needs to \n'
 'expand significantly to address the deficit in infrastructure services.\n'
 'Contents\n'
 'Contents\n'
 '4\n'
 '6\n'
 '8\n'
 '12\n'
 '10\n'
 '14\n'
 '15\n'
 'Indian Road Infrastructure\n'
 'Policy Initiatives \n'
 'National Highways Development Project\n'
 'Financing of National Highways \n'
 'Development Projects\n'
 'Public Private Participation\n'
 'Investment Environment

In [39]:
from llama_index.core.schema import TextNode

In [40]:
nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc_idx = doc_idxs[idx]
    src_page = doc[src_doc_idx]
    nodes.append(node)

In [41]:
from llama_index.llms.azure_openai import AzureOpenAI
llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name=deployment_id_gpt4,
    api_key=key,
    azure_endpoint=endpoint,
    api_version=api_version,
)

In [42]:
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline

extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

In [43]:
pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

100%|██████████| 5/5 [00:00<00:00,  7.49it/s]
100%|██████████| 20/20 [00:09<00:00,  2.12it/s]


In [44]:
pprint(nodes[0].metadata)

{'document_title': 'Opportunities and Policy Initiatives for Profitable '
                   'Partnership and Infrastructure Investment in Indian '
                   'Highways and National Highways Development Project: A '
                   'Comprehensive Overview.',
 'questions_this_excerpt_can_answer': '1. What is the status of the Indian '
                                      'Highways and National Highways '
                                      'Development Project as of August 2006?\n'
                                      '2. Who prepared the brochure on the '
                                      'emerging opportunities for profitable '
                                      'partnership in Indian highways?\n'
                                      '3. Who designed the brochure on the '
                                      'Indian Highways and National Highways '
                                      'Development Project? \n'
                                      '\n'
       

In [45]:
from llama_index.embeddings.fastembed import FastEmbedEmbedding

embed_model = FastEmbedEmbedding()

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 28948.42it/s]


In [46]:
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

In [47]:
pprint(nodes[0])

TextNode(id_='cab8f96b-b514-4d67-9120-0e966436bfb1', embedding=[-0.04695820063352585, 0.024249175563454628, -0.018253210932016373, 0.008525742217898369, 0.011761258356273174, 0.039416685700416565, -0.02633238211274147, 0.012222268618643284, -0.0746278241276741, 0.026546934619545937, -0.010993728414177895, -0.0068054539151489735, 0.011392280459403992, 0.003379110712558031, -0.022071877494454384, 0.040086641907691956, -0.0015025608008727431, -0.11408837884664536, 0.030617743730545044, 0.030993018299341202, 0.0742076113820076, -0.03122398443520069, -0.0018415898084640503, -0.03510534390807152, 0.06025262922048569, 0.040362659841775894, 0.0352342426776886, -0.047318462282419205, 0.004384218715131283, -0.15309694409370422, -0.02285687066614628, -0.005235077813267708, -0.01735851541161537, -0.04344358667731285, -0.06125152111053467, 0.07274097204208374, -0.0025266127195209265, 0.11468368768692017, 0.0987730622291565, -0.06899642199277878, 0.02406373992562294, -0.01338239386677742, 0.00647450

In [48]:
vector_store.add(nodes)

['cab8f96b-b514-4d67-9120-0e966436bfb1',
 'f9a8287d-cf93-4ac6-8cdf-b3ae020f533d',
 'f1f2e847-5885-4dd8-8adb-44c7cd8b12e1',
 '78d34def-97dc-4dac-bdb6-080ad865c8b1',
 'd0a7886b-f489-4d89-a48b-7a8a4f4f5ab0',
 '0add89c9-cb02-47ce-84f3-39f8d8f0924b',
 'f573e3db-776f-4d61-a937-e76595ac6613',
 '23b309dd-03a7-4667-a0df-e47c707a3c7e',
 '4a4b1e4a-f565-4913-879d-7372613ac967',
 'e2ea2da3-7208-432a-862d-fae9ffe87bd9',
 '9d3b5ef7-fe08-4a83-8112-c29094ccd326',
 'bd182b8d-089a-4304-8358-25ed468e526c',
 '5c78a53f-46df-41af-b6f1-a221bab6f204',
 '0eb5b085-fa70-42db-b587-f6a2fead3d34',
 'ebcb8533-b399-417c-b3af-2030593c426e',
 '60863204-a0b2-401b-af8f-4f44b18787b4',
 '7f4aaf02-42e0-4a9e-ad98-78b3f7d1d35c',
 '12a749de-47cd-4a99-b535-f3fcb448ed75',
 '2fdc6f0b-3182-48a2-8961-f1503cce0075',
 '5000378d-bc02-4ff7-a3b4-4ef385c0caf3']

In [49]:
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

In [50]:
from llama_index.core  import VectorStoreIndex

In [51]:
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [52]:
query_engine = index.as_query_engine()

In [53]:
query = "What should government do about road safety?"

In [54]:
response = query_engine.query(query)

In [55]:
print(response.get_formatted_sources())
print("query was:", query)
print("answer was:", response)

> Source (Doc id: 888a8a07-aa16-42ae-8d8f-71249e220188): 14  Report of the Committee
of motorization suggest that strategies for
reducing traffic injurie...

> Source (Doc id: 4dd6886a-6874-4f91-9c62-1bb03fbecba3): 8  Report of the Committee
road safety. There is also no effective
mechanism for coordinating th...
query was: What should government do about road safety?
answer was: Governments should make road safety a political priority, develop a multidisciplinary approach to road safety, appoint a lead agency for road safety, give it adequate resources, and make it publicly accountable. They should also set appropriate road safety targets and establish national road safety plans to achieve them, support the creation of safety advocacy groups, create budgets for road safety, and increase investment in demonstrably effective road safety activities.


In [56]:
pprint(str(response))

('Governments should make road safety a political priority, develop a '
 'multidisciplinary approach to road safety, appoint a lead agency for road '
 'safety, give it adequate resources, and make it publicly accountable. They '
 'should also set appropriate road safety targets and establish national road '
 'safety plans to achieve them, support the creation of safety advocacy '
 'groups, create budgets for road safety, and increase investment in '
 'demonstrably effective road safety activities.')
