In [1]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from config import *
from pprint import pprint

In [2]:
# Connect to the Qdrant server at URL on port 6333.
# URL is presumably supplied by `from config import *` — confirm.
client = QdrantClient(URL,port=6333)

In [3]:
# LlamaIndex vector store backed by the Qdrant client, bound to the
# COLLECTION_NAME collection (name presumably comes from config).
vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)

In [4]:
import fitz

In [5]:
# Open the source PDF with PyMuPDF (fitz); the file lives directly under DIR_PATH.
doc = fitz.open(f"{DIR_PATH}/CHAP04-BIOLOGY-CLASS11.pdf")

In [6]:
from llama_index.node_parser import SentenceSplitter

In [7]:
# Splitter used to cut each page's text into chunks, configured with chunk_size=512.
text_splitter = SentenceSplitter(chunk_size=512)

In [8]:
# Extract plain text from every page, split it into chunks, and remember which
# page each chunk came from. The two result lists are parallel:
# doc_idxs[i] is the page index (0-based) that produced text_chunks[i].
chunk_page_pairs = [
    (chunk, page_no)
    for page_no, pdf_page in enumerate(doc)
    for chunk in text_splitter.split_text(pdf_page.get_text("text"))
]
text_chunks = [chunk for chunk, _ in chunk_page_pairs]
doc_idxs = [page_no for _, page_no in chunk_page_pairs]

In [9]:
# Sanity check: doc_idxs has one entry per chunk, so this is the total chunk count.
len(doc_idxs)

29

In [11]:
# Eyeball one chunk (index 1) to check what the extracted text looks like.
pprint(text_chunks[1])

('38\n'
 'BIOLOGY\n'
 'Echinoderms and Chordates, organs have\n'
 'associated to form functional systems, each\n'
 'system concerned with a specific physiological\n'
 'function. This pattern is called organ system\n'
 'level of organisation. Organ systems in different\n'
 'groups of animals exhibit various patterns of\n'
 'complexities. For example, the digestive system\n'
 'in Platyhelminthes has only a single opening\n'
 'to the outside of the body that serves as both\n'
 'mouth and anus, and is hence called\n'
 'incomplete. A complete digestive system has\n'
 'two openings, mouth and anus. Similarly, the\n'
 'circulatory system may be of two types:\n'
 '(i) open type in which the blood is pumped\n'
 'out of the heart and the cells and tissues are\n'
 'directly bathed in it and\n'
 '(ii) closed type in which the blood is circulated\n'
 'through a series of vessels of varying diameters\n'
 '(arteries, veins and capillaries).\n'
 '4.1.2\n'
 'Symmetry\n'
 'Animals can be categorised on 

In [12]:
from llama_index.schema import TextNode

In [13]:
# Wrap every text chunk in a TextNode, preserving chunk order.
# Only the chunk text is stored on the node. doc_idxs[i] still holds the
# source page index of text_chunks[i], but it is not attached to the nodes
# here, so no per-chunk page lookup is performed.
nodes = [TextNode(text=text_chunk) for text_chunk in text_chunks]

In [14]:
from llama_index.llms import AzureOpenAI
# Azure OpenAI LLM shared by the metadata extractors and the query engine's
# service context below.
# deployment_id_gpt4, key, endpoint and api_version are presumably supplied by
# `from config import *` — confirm.
# NOTE(review): model is "gpt-35-turbo" while the deployment variable is named
# deployment_id_gpt4 — confirm which model the deployment serves and make the
# two agree.
llm = AzureOpenAI(
    model="gpt-35-turbo",
    deployment_name=deployment_id_gpt4,
    api_key=key,
    azure_endpoint=endpoint,
    api_version=api_version,
)

In [15]:
from llama_index.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.ingestion import IngestionPipeline

# Metadata extractors passed to the ingestion pipeline as its transformations.
# Both are given the Azure OpenAI LLM defined above.
extractors = [
    # Yields the 'document_title' metadata entry seen in the node metadata output.
    # nodes=5: presumably the number of nodes used to derive the title — TODO confirm.
    TitleExtractor(nodes=5, llm=llm),
    # Yields the 'questions_this_excerpt_can_answer' metadata entry; configured with questions=3.
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

In [16]:
# Build a pipeline whose only transformations are the metadata extractors,
# then run it asynchronously over the nodes (top-level await in a notebook cell).
# NOTE(review): `nodes` is rebound to the pipeline output, so re-running this
# cell on its own feeds already-enriched nodes back through the extractors.
pipeline = IngestionPipeline(
    transformations=extractors,
)
# in_place=False: presumably returns new nodes instead of mutating the inputs — TODO confirm.
nodes = await pipeline.arun(nodes=nodes, in_place=False)

100%|██████████| 5/5 [00:01<00:00,  4.72it/s]
100%|██████████| 29/29 [00:13<00:00,  2.11it/s]


In [17]:
# Inspect the metadata the extractors attached to the first node.
pprint(nodes[0].metadata)

{'document_title': 'Fundamentals of Animal Classification: From Levels of '
                   'Organisation to Phylum Porifera',
 'questions_this_excerpt_can_answer': '1. What are the fundamental features '
                                      'used as the basis of animal '
                                      'classification?\n'
                                      '2. How do different animals exhibit '
                                      'different levels of organization of '
                                      'cells?\n'
                                      '3. Why is classification important in '
                                      'the study of animals and how does it '
                                      'help in assigning a systematic position '
                                      'to newly described species? \n'
                                      '\n'
                                      'Higher-level summary: This excerpt '
                                  

In [18]:
from llama_index.embeddings import FastEmbedEmbedding

# Embedding model with default settings; used both to embed the nodes and,
# through the service context, at query time.
embed_model = FastEmbedEmbedding()

100%|██████████| 76.7M/76.7M [00:01<00:00, 66.0MiB/s]


In [19]:
# Embed every node one at a time and store the vector on the node itself.
# The embedded text is the node content rendered with metadata_mode="all".
for enriched_node in nodes:
    text_to_embed = enriched_node.get_content(metadata_mode="all")
    enriched_node.embedding = embed_model.get_text_embedding(text_to_embed)

In [20]:
# Inspect the first node after embedding.
# NOTE(review): this prints the full embedding vector, which makes the output very long.
pprint(nodes[0])

TextNode(id_='7e503a5d-078d-4b32-bfe7-a33e79dc4000', embedding=[0.02657456137239933, -0.042736079543828964, 0.027832886204123497, 0.012503774836659431, 0.02205989882349968, -0.01876184530556202, 0.07691063731908798, 0.03886473923921585, 0.021060079336166382, 0.016123531386256218, 0.035125140100717545, -0.07436162233352661, 0.005483281332999468, -0.0019588121213018894, -0.04246576875448227, -0.02140616998076439, -0.03559023141860962, -0.002127237617969513, -0.04538920521736145, -0.0218068677932024, 0.03331537917256355, -0.003251616610214114, -0.0417177639901638, 0.0018559462623670697, -0.003484701504930854, 0.018791688606142998, -0.024753045290708542, -0.013792507350444794, -0.011626145802438259, -0.2150440663099289, -0.02067537046968937, 0.021727435290813446, 0.05008382722735405, 0.014523151330649853, -0.0101810647174716, 0.028320742771029472, 0.034344788640737534, -0.016332436352968216, -0.0020065982826054096, 0.0620521679520607, 0.022378506138920784, -0.024644896388053894, -0.0019384

In [21]:
# Write the embedded nodes to the Qdrant-backed vector store; the cell output is
# the list of node ids returned by add().
vector_store.add(nodes)

['7e503a5d-078d-4b32-bfe7-a33e79dc4000',
 'b806f0dc-a48d-407e-b529-80ffd52102d6',
 '7d2d5431-af2d-4dc1-a6f7-96330e4cfedc',
 '1e892134-5aa2-4ead-91ef-39b76ae85ad4',
 '661caeab-7f3b-40c9-a94f-dea14b88fccc',
 'a7528805-84bf-404e-ae66-e3cf804d3f5d',
 'bc247bdb-345c-4765-b0f3-a9e10e4ea913',
 '72b9f87e-2fae-4796-a71a-5a5df6428e0b',
 'cf4fb500-9410-4382-ba13-09449d3fe1d2',
 'a0d97c56-f38a-48e7-b888-320bc02a9f2d',
 'eab2821a-9eea-4339-a47b-a0402b90f904',
 'e24c9810-6ad4-4b8b-900d-e4abb2f09641',
 'e4e2968d-8182-4d71-83dd-f7450605dafe',
 '52974f0b-fbbf-4964-879f-c32e4ec7a812',
 '710611f8-8a5b-44ba-b533-eba0c7c21d20',
 '411ae1a8-4dc9-4366-a571-7e5a642331c3',
 'f01b4c3f-2579-469d-ac14-0f82df8b9aef',
 'fc3d3ebb-6091-4156-b230-06400e5c5b26',
 'f90f0996-5822-48c5-8959-ed08e0af8229',
 'a78378d0-b47e-44b5-92d6-ec154bd601bc',
 'b2903d39-39bb-46b9-b3e5-781c73352d7b',
 '514077ec-2be4-4c4a-8f47-fda7d7e38137',
 '94724a34-9593-4a5d-8f67-4e8fb8433a57',
 '5bb4ff8d-9ecf-44aa-8969-886b0c359f03',
 'a84cc165-709c-

In [22]:
from llama_index import VectorStoreIndex,ServiceContext
from llama_index.storage import StorageContext

In [23]:
# Bundle the LLM and the embedding model so the index and query engine use the
# same models as ingestion.
service_context = ServiceContext.from_defaults(llm=llm,embed_model=embed_model)

In [24]:
# Build the index directly on top of the vector store that the nodes were added to above.
index = VectorStoreIndex.from_vector_store(service_context=service_context,vector_store=vector_store)

In [25]:
# Query engine over the index, created with default settings.
query_engine = index.as_query_engine()

In [26]:
# Sample question used to exercise the query engine.
query = "What is Diploblastic and Triploblastic Organisation?"

In [27]:
# Run the question through the query engine and keep the response for inspection.
response = query_engine.query(query)

In [28]:
# Show the formatted source excerpts for the response, then the question and the answer.
print(response.get_formatted_sources())
print("query was:", query)
print("answer was:", response)

> Source (Doc id: 7d2d5431-af2d-4dc1-a6f7-96330e4cfedc): When
any plane passing through the central axis of
the body divides the organism into two identic...

> Source (Doc id: b806f0dc-a48d-407e-b529-80ffd52102d6): 38
BIOLOGY
Echinoderms and Chordates, organs have
associated to form functional systems, each
sys...
query was: What is Diploblastic and Triploblastic Organisation?
answer was: Diploblastic animals are those in which the cells are arranged in two embryonic layers, an external ectoderm and an internal endoderm, with an undifferentiated layer called mesoglea present in between. Coelenterates are an example of diploblastic animals. On the other hand, triploblastic animals have three embryonic layers - ectoderm, mesoderm, and endoderm.


In [29]:
# Pretty-print the answer text; pprint wraps the long string across lines.
pprint(str(response))

('Diploblastic animals are those in which the cells are arranged in two '
 'embryonic layers, an external ectoderm and an internal endoderm, with an '
 'undifferentiated layer called mesoglea present in between. Coelenterates are '
 'an example of diploblastic animals. On the other hand, triploblastic animals '
 'have three embryonic layers - ectoderm, mesoderm, and endoderm.')
