In [9]:
import os
from config.loadConfig import load_project_config
from DB.qDrant import check_qdrant_status, QdrantConfig
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from RAG.RAG_Retrieve import custom_embed_function
# Encode the query text using the same model


# Connect to Qdrant
config = load_project_config(os.path.join(os.getcwd(), "config/config.yaml"))
qdrantConfig = QdrantConfig(
    host=config["db"]["qdrant"]["connection"]["host"],
    port=config["db"]["qdrant"]["connection"]["port"],
    timeout=config["db"]["qdrant"]["connection"]["timeout"]
)
status, client = check_qdrant_status(qdrantConfig)
embedding_model = SentenceTransformer(config['embedding_model'])
small_collection_name = config['db']['collections']['collection_small']['name']
large_collection_name = config['db']['collections']['collection_large']['name']

Connected. Collections: ['wiki_small_chunks', 'wiki_large_chunks']


In [20]:
result = client.scroll(
    collection_name="wiki_small_chunks",
    limit=10,
    with_payload=True,
    with_vectors=True
)
for point in result[0]:  # result[0] is the list of points
    print(point.id, point.payload["text"])


0005fc6f-7d8b-4c0a-b76b-7a09606a00c2 books, ethnographies). Such is the case with environmental justice advocate Melissa Checker and her relationship with the people of Hyde Park.
001cfbbf-d1af-4625-935f-8711a6d8263b Zeno
0022f0fb-4022-4240-94e6-5dcc27fee683 Eugene Lawson heads the Community Bank of Madison, then gets a job with the government when his bank goes bankrupt due to new government policies. One of the looter's cabal, he is a collectivist who abhors production and money-making.
0027f4c5-352d-4b5f-9355-3c2d4d3c7272 must begin with a consonant. In Cushitic and Chadic languages, a glottal stop or glottal fricative may be inserted to prevent a word from beginning with a vowel. Typically, syllables only begin with a single consonant.
0029c247-34ba-41e4-83a0-81e92325f82e As one of the Catalan Countries, Andorra is home to a team of castellers, or Catalan human tower builders. The , based in the town of Santa Coloma d'Andorra, are recognized by the , the governing body of castells.

In [64]:
query_text = "Where is alabama?"
print(f"\n--- DEMO: Querying Small Chunks for: '{query_text}' ---")

query_vector = custom_embed_function(embedding_model=embedding_model, texts=[query_text])
print(len(query_vector[0]))

# query_vector = [0.0] * 384  # same dimension as collection

results = client.query_points(
    collection_name="wiki_small_chunks",
    query=query_vector[0],
    limit=5,
    with_payload=True
)

payloads = [point.payload for point in results.points]


--- DEMO: Querying Small Chunks for: 'Where is alabama?' ---
384


In [67]:
for i in payloads:
    print(i["parent_id"])

a1a83717-a9ab-4719-893f-112d12881d2d
903e2b0f-3a5f-4a88-a4d1-103bc2d72de9
903e2b0f-3a5f-4a88-a4d1-103bc2d72de9
a1a83717-a9ab-4719-893f-112d12881d2d
89b8f70b-c6bc-4403-827b-41f3c4c40e42


In [69]:
result = client.retrieve(
    collection_name="wiki_large_chunks",
    ids=["89b8f70b-c6bc-4403-827b-41f3c4c40e42"],
    with_payload=True
)
print(result)

[Record(id='89b8f70b-c6bc-4403-827b-41f3c4c40e42', payload={'text': 'Flora and fauna \n\nAlabama is home to a diverse array of flora and fauna in habitats that range from the Tennessee Valley, Appalachian Plateau, and Ridge-and-Valley Appalachians of the north to the Piedmont, Canebrake, and Black Belt of the central region to the Gulf Coastal Plain and beaches along the Gulf of Mexico in the south. The state is usually ranked among the top in nation for its range of overall biodiversity.\n\nAlabama is in the subtropical coniferous forest biome and once boasted huge expanses of pine forest, which still form the largest proportion of forests in the state. It currently ranks fifth in the nation for the diversity of its flora. It is home to nearly 4,000 pteridophyte and spermatophyte plant species.\n\nIndigenous animal species in the state include 62 mammal species, 93 reptile species, 73 amphibian species, roughly 307 native freshwater fish species, and 420 bird species that spend at lea

In [None]:
from qdrant_client.http import models

client.create_payload_index(
    collection_name="your_collection",
    field_name="text",
    field_schema=models.TextIndexParams(
        type="text",
        tokenizer=models.TokenizerType.WORD,
        min_token_len=2,
        max_token_len=20
    )
)