In [8]:
import pandas as pd
import time
import zlib

from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models as qmodels
import time


LOCAL_PATH = f"./qdrant_local_{int(time.time())}"  # new folder each run to avoid lock
client = QdrantClient(path=LOCAL_PATH)


# -------------------------
# Load chunked dataset
# -------------------------
CHUNKS_CSV = "PubMed_chunks.csv"
df = pd.read_csv(CHUNKS_CSV)

# Ensure text exists and is clean
if "text" not in df.columns:
    raise ValueError("Expected a 'text' column in PubMed_chunks.csv")

df["text"] = df["text"].fillna("").astype(str)
df = df[df["text"].str.strip().ne("")].reset_index(drop=True)

# -------------------------
# Create stable integer IDs (qid) for Qdrant
# -------------------------
if "chunk_id" in df.columns:
    df["qid"] = df["chunk_id"].astype(str).apply(lambda s: zlib.crc32(s.encode("utf-8")))
else:
    # Fallback: build a stable key from common fields if present
    # (PMID + chunk_index + text).
    pmid_col = "PMID" if "PMID" in df.columns else None
    chunk_idx_col = "chunk_index" if "chunk_index" in df.columns else None

    def make_key(row):
        pmid_part = str(row[pmid_col]) if pmid_col else ""
        idx_part = str(row[chunk_idx_col]) if chunk_idx_col else ""
        text_part = row["text"]
        return f"{pmid_part}::{idx_part}::{text_part}"

    df["qid"] = df.apply(lambda r: zlib.crc32(make_key(r).encode("utf-8")), axis=1)

# Ensure qid is int (Qdrant local-safe)
df["qid"] = df["qid"].astype(int)

# Ensure uniqueness; if collisions happen, you’ll catch it here
if df["qid"].duplicated().any():
    dup = df[df["qid"].duplicated(keep=False)].sort_values("qid")
    raise ValueError(
        "Detected duplicate qid values (hash collisions). "
        "Switch to a stronger ID scheme or include more fields in the hash.\n"
        f"Example duplicates:\n{dup[['qid']].head(10)}"
    )

# -------------------------
# Embed chunks
# -------------------------
EMBED_MODEL = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBED_MODEL)
VECTOR_SIZE = embedder.get_sentence_embedding_dimension()

# -------------------------
# Create / reset Qdrant collection (LOCAL)
# -------------------------
COLLECTION = "pubmed_chunks"
client = QdrantClient(url="http://localhost:6333")  # local storage domain

client.recreate_collection(
    collection_name=COLLECTION,
    vectors_config=qmodels.VectorParams(
        size=VECTOR_SIZE,
        distance=qmodels.Distance.COSINE
    )
)

# -------------------------
# Upsert in batches
# -------------------------
batch_size = 256
SLEEP_SEC = 0.0  # can set to 0.05 if I want to be extra gentle on my poor laptop lol for later iterations

total = len(df)
for start in range(0, total, batch_size):
    end = min(start + batch_size, total)
    batch = df.iloc[start:end]

    texts = batch["text"].tolist()
    vectors = embedder.encode(
        texts,
        normalize_embeddings=True,   # cosine-friendly
        convert_to_numpy=True
    )

    points = []
    for row, vec in zip(batch.to_dict(orient="records"), vectors):
        qid = int(row["qid"])

        # Payload: keep everything except helper ID column
        payload = {k: v for k, v in row.items() if k not in {"qid"}}

        # Make sure key provenance fields exist if CSV has them
        # (No harm if they’re already present.)
        payload["qid"] = qid

        points.append(
            qmodels.PointStruct(
                id=qid,
                vector=vec.tolist(),
                payload=payload
            )
        )

    client.upsert(collection_name=COLLECTION, points=points)

    if (start // batch_size) % 10 == 0 or end == total:
        print(f"Indexed {end}/{total} chunks...")

    if SLEEP_SEC:
        time.sleep(SLEEP_SEC)

print(f"✅ Done: embedded + indexed {len(df)} chunks into Qdrant collection '{COLLECTION}'.")

  client.recreate_collection(


Indexed 256/1239 chunks...
Indexed 1239/1239 chunks...
✅ Done: embedded + indexed 1239 chunks into Qdrant collection 'pubmed_chunks'.


In [9]:
# Testing if saved to the localhost.
from qdrant_client import QdrantClient

client = QdrantClient(url="http://localhost:6333")
print(client.get_collections())

collections=[CollectionDescription(name='pubmed_chunks')]
