In [32]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from config import *
from pprint import pprint

In [33]:
client = QdrantClient(URL,port=6333)

In [34]:
vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)

In [35]:
import fitz

In [36]:
doc = fitz.open(DIR_PATH + "/Tata-Steel-Corporate-Brochure-2020-21.pdf")

In [37]:
from llama_index.core.node_parser import SentenceSplitter

In [38]:
text_splitter = SentenceSplitter(
    chunk_size=512
)

In [39]:
text_chunks = []
# maintain relationship with source doc index, to help inject doc metadata in (3)
doc_idxs = []
for doc_idx, page in enumerate(doc):
    page_text = page.get_text("text")
    cur_text_chunks = text_splitter.split_text(page_text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [40]:
len(doc_idxs)

56

In [41]:
pprint(text_chunks[1])

('A vision for tomorrow \n'
 '2\n'
 ' \n'
 'Tata group overview \n'
 '12\n'
 ' \n'
 'Tata Steel group overview \n'
 '18\n'
 ' \n'
 'Tata Steel India overview \n'
 '28\n'
 'A vision beyond operations \n'
 '40\n'
 ' \n'
 'Innovation \n'
 '42\n'
 ' \n'
 'Technology \n'
 '46\n'
 ' \n'
 'Sustainability \n'
 '50\n'
 'Directory of group companies \n'
 '64\n'
 'Disclaimer: While care has been taken to ensure that the \n'
 'information in the Corporate Brochure is accurate, neither Tata \n'
 'Steel nor its subsidiaries accept responsibility or liability for \n'
 'errors or information that is found to be misleading. All content \n'
 'published is copyright of Tata\xa0Steel and may not be reproduced \n'
 'without the written permission of the publishers.\n'
 'Contents\n'
 'GAS STORAGE, PORT TALBOT')


In [42]:
from llama_index.core.schema import TextNode

In [43]:
nodes = []
for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc_idx = doc_idxs[idx]
    src_page = doc[src_doc_idx]
    nodes.append(node)

In [44]:
from llama_index.llms.azure_openai import AzureOpenAI
llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name=deployment_id_gpt4,
    api_key=key,
    azure_endpoint=endpoint,
    api_version=api_version,
)

In [45]:
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline

extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

In [46]:
pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:00<00:00,  5.82it/s]
100%|██████████| 56/56 [00:18<00:00,  3.06it/s]


In [48]:
pprint(nodes[0].metadata)

{'document_title': 'Innovative Approaches to Sustainability and Steel '
                   'Production for Building Landmarks and Advancing '
                   'Industries.',
 'questions_this_excerpt_can_answer': '1. What are some innovative approaches '
                                      'to sustainability in steel production '
                                      'that can be used to build landmarks and '
                                      'advance industries?\n'
                                      '2. How can steel production be made '
                                      'future-ready while also being resilient '
                                      'and resolute?\n'
                                      '3. What is the content of the Corporate '
                                      'Brochure for FY 2020-21 and how does it '
                                      'relate to innovative approaches to '
                                      'sustainability and steel produc

In [50]:
from llama_index.embeddings.fastembed import FastEmbedEmbedding

embed_model = FastEmbedEmbedding()

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 17.11it/s]


In [51]:
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

In [52]:
pprint(nodes[0])

TextNode(id_='1f7db6a6-6ad9-42b8-8cdd-5e3d1c89fc38', embedding=[-0.03957550600171089, 0.04148934409022331, 0.01611850969493389, 0.0357433557510376, 0.03565003350377083, 0.03834414482116699, -0.055616386234760284, -0.015265755355358124, -0.02329101972281933, -0.016623063012957573, 8.186084596673027e-05, -0.03751275688409805, -0.0016249370528385043, -0.00993414781987667, 0.031663376837968826, 0.06717957556247711, -0.0013144455151632428, -0.025387492030858994, -0.005217358469963074, -0.002842647023499012, 0.030334535986185074, -0.03319692984223366, -0.008700383827090263, 0.004193538334220648, 0.05778948962688446, 0.007334763649851084, -0.020845139399170876, -0.003431371646001935, 0.006867974065244198, -0.18131230771541595, -0.014363307505846024, 0.010435276664793491, 0.0011663608020171523, -0.012378002516925335, -0.010504655539989471, -0.006966032087802887, -0.02077666111290455, 0.04153577983379364, 0.016413889825344086, 0.08788636326789856, 0.027744222432374954, 0.02897641621530056, 0.01

In [53]:
vector_store.add(nodes)

['1f7db6a6-6ad9-42b8-8cdd-5e3d1c89fc38',
 'fb18a1d3-7df0-456e-b01b-c726caa62ee3',
 '99d7e582-f0d2-429b-b2a6-f004afb59beb',
 'bceecd49-43d3-498f-b973-f7dbc449ed21',
 'e4b19d33-7939-4a5e-8db4-0c26a99813a8',
 '0ffb2ac5-a1c7-4441-97cf-308eb0ce6994',
 '75450904-59cf-45e4-9707-ed3be6633a71',
 'f939e1eb-64a1-403b-926e-a9b8d1e0de0b',
 '9c93a6bd-a721-46a6-bcc3-f34aac2bfba1',
 '0db3fe49-f095-4d3f-90ae-ca7f5cfcb700',
 '37edae1e-ccbb-4039-9466-8504b42e1901',
 '6cc719ce-3e64-48d2-b3a4-21a985eb36e3',
 '5fa4be0a-5ef1-44dc-9457-9f25d325c0f6',
 '79a1dfab-d7b3-4660-9dea-a2969d58ebd1',
 'e6b9613a-77f8-4219-a844-c77ae05af847',
 '0e799ecd-8189-4023-9df6-a78406f4c4f0',
 'efdb8f29-4dc8-43a2-82e8-59d3faa438b3',
 '3a194a6a-0f53-4089-bb46-065107fe6137',
 'cdd39858-e716-4f2f-9c55-077cea2f7715',
 '40cbcba8-6844-47c3-aad3-31b5e0bb173c',
 '53b4196a-c213-42fa-8007-dbf0b5db176a',
 '81f60fb7-19af-4f7b-aee6-b7a57048444c',
 '8ba9c32d-93e5-401b-863b-d831a33155be',
 'bca0538f-8954-4843-8c84-29d265eae814',
 'f1445b5a-8db0-

In [54]:
from llama_index.core import Settings

Settings.llm = llm
Settings.embed_model = embed_model

In [55]:
from llama_index.core  import VectorStoreIndex

In [56]:
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [57]:
query_engine = index.as_query_engine()

In [58]:
query = "Describe about Tata Steel Kalinganagar?"

In [59]:
response = query_engine.query(query)

In [60]:
print(response.get_formatted_sources())
print("query was:", query)
print("answer was:", response)

> Source (Doc id: f1445b5a-8db0-4108-9202-a21285ae6b34): Key digital enablers
Key operational 
highlights
• Fastest ramp-up of 
greenfield plant
• Fastest...

> Source (Doc id: 53b4196a-c213-42fa-8007-dbf0b5db176a): Inside Tata Steel India
We are India’s lowest cost producer 
of steel. Our history and journey 
a...
query was: Describe about Tata Steel Kalinganagar?
answer was: Tata Steel Kalinganagar is the second integrated steel plant in India owned by Tata Steel. It is spread over 3,000 acres and was commissioned in 2016. The plant achieved production levels at its rated capacity in less than two years and is designed to have a minimal water and carbon footprint. It has key digital enablers and operational highlights such as data-backed decision-making and real-time monitoring, advanced analytics, high-speed data extraction, artificial intelligence modelling, 3D printing, and predictive maintenance deployment. Tata Steel Kalinganagar is also included in the elite Global Lighthouse N

In [61]:
pprint(str(response))

('Tata Steel Kalinganagar is the second integrated steel plant in India owned '
 'by Tata Steel. It is spread over 3,000 acres and was commissioned in 2016. '
 'The plant achieved production levels at its rated capacity in less than two '
 'years and is designed to have a minimal water and carbon footprint. It has '
 'key digital enablers and operational highlights such as data-backed '
 'decision-making and real-time monitoring, advanced analytics, high-speed '
 'data extraction, artificial intelligence modelling, 3D printing, and '
 'predictive maintenance deployment. Tata Steel Kalinganagar is also included '
 'in the elite Global Lighthouse Network of the World Economic Forum for its '
 'leadership in applying Industry 4.0 technologies.')
