In [1]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from config import *
from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [3]:
# Client for the Qdrant server at URL, port 6333. URL is not defined in this
# notebook; presumably it comes from `from config import *` — confirm.
client = QdrantClient(URL,port=6333)

In [4]:
# LlamaIndex vector-store wrapper around the Qdrant client, bound to the
# collection named by COLLECTION_NAME (not defined in this notebook;
# presumably supplied by config — confirm).
vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)

In [5]:
import fitz

In [6]:
# Open the brochure PDF located under DIR_PATH. The handle is kept open on
# purpose: later cells iterate over and index into `doc`.
pdf_path = DIR_PATH + "/Tata-Steel-Corporate-Brochure-2020-21.pdf"
doc = fitz.open(pdf_path)

In [7]:
from llama_index.core.node_parser import SentenceSplitter

In [8]:
# Splitter used to cut each page's text into chunks.
# chunk_size=512 is the per-chunk size limit (presumably measured in tokens —
# confirm against the SentenceSplitter documentation).
text_splitter = SentenceSplitter(
    chunk_size=512
)

In [9]:
# Split every PDF page into chunks. The two lists are kept aligned:
# doc_idxs[i] is the 0-based index of the page that text_chunks[i] came from.
text_chunks = []
doc_idxs = []
for page_number, pdf_page in enumerate(doc):
    page_chunks = text_splitter.split_text(pdf_page.get_text("text"))
    text_chunks += page_chunks
    doc_idxs += [page_number] * len(page_chunks)

In [10]:
# Total number of chunks (doc_idxs holds one page index per chunk).
len(doc_idxs)

56

In [11]:
# Inspect the second chunk to sanity-check what the splitter produced.
pprint(text_chunks[1])

('A vision for tomorrow \n'
 '2\n'
 ' \n'
 'Tata group overview \n'
 '12\n'
 ' \n'
 'Tata Steel group overview \n'
 '18\n'
 ' \n'
 'Tata Steel India overview \n'
 '28\n'
 'A vision beyond operations \n'
 '40\n'
 ' \n'
 'Innovation \n'
 '42\n'
 ' \n'
 'Technology \n'
 '46\n'
 ' \n'
 'Sustainability \n'
 '50\n'
 'Directory of group companies \n'
 '64\n'
 'Disclaimer: While care has been taken to ensure that the \n'
 'information in the Corporate Brochure is accurate, neither Tata \n'
 'Steel nor its subsidiaries accept responsibility or liability for \n'
 'errors or information that is found to be misleading. All content \n'
 'published is copyright of Tata\xa0Steel and may not be reproduced \n'
 'without the written permission of the publishers.\n'
 'Contents\n'
 'GAS STORAGE, PORT TALBOT')


In [12]:
from llama_index.core.schema import TextNode

In [13]:
# Wrap each text chunk in a TextNode, preserving chunk order.
# No source-page lookup is done here: nothing from the page is attached to the
# node, so looking the page up per chunk would be wasted work. doc_idxs still
# holds the chunk -> page-index mapping if page metadata is wanted later.
nodes = [TextNode(text=text_chunk) for text_chunk in text_chunks]

In [14]:
from llama_index.llms.azure_openai import AzureOpenAI
# Azure OpenAI LLM handle; it is passed to the metadata extractors and later
# registered as Settings.llm.
# deployment_id_gpt4, key, endpoint and api_version are not defined in this
# notebook; presumably they come from `from config import *` — confirm.
# NOTE(review): model is "gpt-35-turbo" but the deployment variable is named
# deployment_id_gpt4. Confirm the deployment really serves the model declared
# here; a mismatch may make the client assume the wrong model characteristics.
llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name=deployment_id_gpt4,
    api_key=key,
    azure_endpoint=endpoint,
    api_version=api_version,
)

In [15]:
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline

# LLM-backed metadata extractors, applied in this order by the ingestion
# pipeline. Judging by their names, they yield the 'document_title' and
# 'questions_this_excerpt_can_answer' metadata entries respectively.
title_extractor = TitleExtractor(nodes=5, llm=llm)
questions_extractor = QuestionsAnsweredExtractor(questions=3, llm=llm)
extractors = [title_extractor, questions_extractor]

In [16]:
# Run the metadata extractors over all nodes asynchronously.
# Top-level `await` works here because the notebook kernel runs an event loop.
# Note that `nodes` is rebound to the pipeline's result, so re-running this
# cell alone feeds already-processed nodes back into the extractors.
pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

100%|██████████| 5/5 [00:01<00:00,  4.98it/s]
100%|██████████| 56/56 [00:20<00:00,  2.72it/s]


In [17]:
# Inspect the metadata the extractors attached to the first node.
pprint(nodes[0].metadata)

{'document_title': '"Advancing Innovation, Technology, and Sustainability in '
                   'Tata Steel Group Companies: Building a Strong Foundation '
                   'for the Future through Resilience, Resolve, and Readiness '
                   'in Steel Structures, Landmarks, and Automotive Materials."',
 'questions_this_excerpt_can_answer': '1. What are the key values and '
                                      'priorities of Tata Steel Group '
                                      'Companies as outlined in their FY '
                                      '2020-21 corporate brochure?\n'
                                      '2. How does Tata Steel Group Companies '
                                      'plan to advance innovation, technology, '
                                      'and sustainability in their steel '
                                      'structures, landmarks, and automotive '
                                      'materials?\n'
                        

In [18]:
# Embedding model used to vectorise the nodes and later registered as
# Settings.embed_model.
# NOTE(review): max_length=512 — confirm this does not exceed the maximum
# sequence length the all-mpnet-base-v2 model card recommends.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-mpnet-base-v2", max_length=512
)

In [19]:
# Compute and attach an embedding to every node. The text that gets embedded
# is the node content rendered with metadata_mode="all", i.e. with the
# extracted metadata included alongside the chunk text.
# NOTE(review): chunks are split with chunk_size=512 and the embedder is built
# with max_length=512, so chunk + metadata may presumably be truncated before
# embedding — confirm.
for node in nodes:
    text_for_embedding = node.get_content(metadata_mode="all")
    node.embedding = embed_model.get_text_embedding(text_for_embedding)

In [20]:
# Inspect the first node after embedding. The printed repr includes the full
# embedding vector, so the output of this cell is very long.
pprint(nodes[0])

TextNode(id_='7f9344f1-4d56-40d2-b5c7-de44051c367d', embedding=[0.07332110404968262, -0.01834375038743019, -0.017841346561908722, -6.164814840303734e-05, -0.011439749039709568, 0.011648415587842464, 0.03290417790412903, -0.025870459154248238, -0.054562654346227646, 0.029987089335918427, 0.03685632720589638, 0.042044639587402344, 0.026776768267154694, 0.007439950946718454, -0.017005229368805885, -0.031248535960912704, 0.0202496238052845, 0.04422365501523018, -0.01032673567533493, 0.010341097600758076, -0.06599833816289902, 0.0307069830596447, -0.002586797811090946, -0.021809523925185204, -0.011109840124845505, -0.004834817722439766, 0.009994703345000744, -0.01432375330477953, -0.008309019729495049, -0.015463868156075478, 0.0555838905274868, -0.02269810624420643, 0.006189480423927307, -0.07201331108808517, 2.0446620965230977e-06, -0.02534395456314087, -0.07320263981819153, 0.0005319679039530456, 0.04069706052541733, 0.021410314366221428, 0.00945022702217102, -0.015610123053193092, -0.062

In [21]:
# Upload the embedded nodes to the Qdrant collection; the call returns the
# list of stored node IDs, which the notebook displays as the cell output.
# NOTE(review): re-running the notebook presumably creates nodes with new IDs
# and therefore inserts duplicates into the collection — confirm.
vector_store.add(nodes)

['7f9344f1-4d56-40d2-b5c7-de44051c367d',
 '6b6d3b2a-cab5-49d2-b0cb-0a8173b12998',
 'e13758e8-c276-42d4-ad6c-a30095f84553',
 '67382971-96a4-4ce1-8536-7935d729532d',
 '553b7616-1e0d-4c12-8455-aaf7eaa47101',
 '3c197497-2198-4eea-9a19-cd8daa0bf7ae',
 'd06f2852-209a-42b4-a7e2-a326a50a6d56',
 '90eee11f-b46d-4d43-bce8-7aa182d8b5a9',
 'b118e837-9ec4-4a5d-88c5-ad43fb00f026',
 '7865dbdb-4225-457d-94eb-d1a199e5467d',
 'ac59fe0d-ae87-41db-8f53-5759795e0497',
 'bb81bc8d-b65b-4714-8b41-a745c3d57975',
 '649a4a1e-03c7-4c8f-8f3d-0e68847b7785',
 'f0ef648f-fedd-4d5c-90e1-9ca85c853221',
 'd56803a9-e19b-4968-a88a-a71204d36ebe',
 '5c45c1cc-38e7-45b7-ac6c-35c30bc19c92',
 'fbafec1b-4a95-4f8f-8c8a-9412e824d7a0',
 '3fc0cc3f-d371-4aca-bcbf-0fad7498bf5e',
 '8bf51030-9b7c-499e-8178-4923a0310896',
 'dd237a3e-1c49-444e-b446-77fea481727e',
 '5a5e1a64-707f-47d9-90fe-d4d791c65f82',
 '3f1c5b48-40a7-400f-9cc1-2f96b123c772',
 'e1efb5c1-68f1-4b6e-a241-02aac8bd2189',
 '219dbfb9-d866-452d-b3be-e69ab23c989a',
 'd4fb1ef8-9402-

In [22]:
from llama_index.core import Settings

# Register the LLM and the embedding model as LlamaIndex global settings.
# Presumably the index and query engine built below pick these up, so queries
# are embedded with the same model that embedded the nodes — confirm.
Settings.llm = llm
Settings.embed_model = embed_model

In [23]:
from llama_index.core  import VectorStoreIndex

In [24]:
# Build an index on top of the existing Qdrant-backed vector store.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [25]:
# Query engine over the index, created with default options.
query_engine = index.as_query_engine()

In [26]:
# Question to ask about the brochure; sent to the query engine unchanged.
query = "Describe about Tata Steel Kalinganagar?"

In [27]:
# Run the query; the response object carries the answer text and the sources
# it was built from (both are printed in the next cell).
response = query_engine.query(query)

In [28]:
# Show the retrieved source snippets first, then echo the question and the
# generated answer.
formatted_sources = response.get_formatted_sources()
print(formatted_sources)
print(f"query was: {query}")
print(f"answer was: {response}")

> Source (Doc id: d06f2852-209a-42b4-a7e2-a326a50a6d56): The world as we know it would not be the 
same without steel. From the buildings we 
work and liv...

> Source (Doc id: d4fb1ef8-9402-4a92-b16e-f093d4ee2ae2): Key digital enablers
Key operational 
highlights
• Fastest ramp-up of 
greenfield plant
• Fastest...
query was: Describe about Tata Steel Kalinganagar?
answer was: Tata Steel Kalinganagar (TSK) is the second integrated steel plant of Tata Steel in India, which manufactures high-end flat products. It covers an area of 3,000 acres and was commissioned in 2016. TSK achieved production levels at its rated capacity in less than two years and is designed to have a minimal water and carbon footprint. The plant has state-of-the-art equipment and modern facilities, which have enabled highly cost-competitive and productive plant operations. Tata Steel has initiated the next phase of capacity expansion in Kalinganagar in FY 2018-19, which includes investments in raw material capacity 

In [29]:
# Pretty-print the answer text so the long string is wrapped across lines.
pprint(str(response))

('Tata Steel Kalinganagar (TSK) is the second integrated steel plant of Tata '
 'Steel in India, which manufactures high-end flat products. It covers an area '
 'of 3,000 acres and was commissioned in 2016. TSK achieved production levels '
 'at its rated capacity in less than two years and is designed to have a '
 'minimal water and carbon footprint. The plant has state-of-the-art equipment '
 'and modern facilities, which have enabled highly cost-competitive and '
 'productive plant operations. Tata Steel has initiated the next phase of '
 'capacity expansion in Kalinganagar in FY 2018-19, which includes investments '
 'in raw material capacity expansion, upstream and mid-stream facilities, '
 'infrastructure, and downstream facilities. The expansion will enable Tata '
 'Steel to produce value-added products like cold rolled galvanized and '
 'annealed products, as well as meet the requirements of automotive, general '
 'engineering, and other high-end quality product market segments.