In [1]:
import os

import polars as pl
import qdrant_client
from fastembed import SparseEmbedding, SparseTextEmbedding
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from qdrant_client.models import (
    Distance,
    NamedSparseVector,
    PointStruct,
    SearchRequest,
    SparseIndexParams,
    SparseVector,
    SparseVectorParams,
    VectorParams,
)
from tqdm import tqdm

In [2]:
rootdir = "/Users/carsten/Documents/Science"
docs = []

In [3]:
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if file.endswith(".pdf"):
            file_path = os.path.join(subdir, file)
            print(file_path)
            loader = PyMuPDFLoader(
                file_path=file_path,
                mode="page",
            )
            docs_lazy = loader.lazy_load()
            for doc in docs_lazy:
                docs.append(doc)

/Users/carsten/Documents/Science/MCMC from Scratch a Practical Introduction to Markov Chain Monte Carlo.pdf
/Users/carsten/Documents/Science/An Introduction to Sequential Monte Carlo.pdf
/Users/carsten/Documents/Science/Mastering Probabilistic Graphical Models using Python.pdf
/Users/carsten/Documents/Science/Machine Learning Algorithms in Depth.pdf
/Users/carsten/Documents/Science/StatProofBook.pdf
/Users/carsten/Documents/Science/Hamiltonian Monte Carlo Methods in Machine Learning.pdf
/Users/carsten/Documents/Science/An Introduction to MCMC for Machine Learning.pdf
/Users/carsten/Documents/Science/An Introduction to Probabilistic Programming.pdf
/Users/carsten/Documents/Science/Statistics/Think Stats 2nd ed.pdf
/Users/carsten/Documents/Science/Statistics/Introduction to Statistics at DTU.pdf
/Users/carsten/Documents/Science/Statistics/OpenIntro Statistics.pdf
/Users/carsten/Documents/Science/Statistics/Python for Probability Statistics and Machine learning 2nd ed.pdf
/Users/carsten/D

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512, chunk_overlap=0, add_start_index=True
)

In [5]:
chunks = text_splitter.split_documents(docs)
len(chunks)

457477

In [6]:
chunks[0]

Document(metadata={'producer': 'iText® 5.5.13.2 ©2000-2020 iText Group NV (AGPL-version); modified using iText® 7.1.14 ©2000-2020 iText Group NV (AGPL-version)', 'creator': '', 'creationdate': '2023-03-21T08:13:09+05:30', 'source': '/Users/carsten/Documents/Science/MCMC from Scratch a Practical Introduction to Markov Chain Monte Carlo.pdf', 'file_path': '/Users/carsten/Documents/Science/MCMC from Scratch a Practical Introduction to Markov Chain Monte Carlo.pdf', 'total_pages': 198, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2023-03-21T08:28:46+05:30', 'trapped': '', 'modDate': "D:20230321082846+05'30'", 'creationDate': "D:20230321081309+05'30'", 'page': 0, 'start_index': 0}, page_content='Masanori\xa0Hanada\nSo\xa0Matsuura\nMCMC \nfrom\xa0Scratch\nA\xa0Practical Introduction to\xa0Markov Chain \nMonte Carlo')

In [7]:
sparse_model_name = "Qdrant/bm25"

In [8]:
sparse_model = SparseTextEmbedding(model_name=sparse_model_name, batch_size=32)

In [9]:
qdrant_client = qdrant_client.QdrantClient("localhost:6333")

In [18]:
collection_name = "PDFs"

exists = qdrant_client.collection_exists(collection_name=collection_name)

if not exists:
    print("Creating collection")
    qdrant_client.create_collection(
        collection_name,
        vectors_config={
            "text-dense": VectorParams(
                size=1024,
                distance=Distance.COSINE,
            )
        },
        sparse_vectors_config={
            "text-sparse": SparseVectorParams(
                index=SparseIndexParams(
                    on_disk=False,
                )
            )
        },
    )

Creating collection


In [11]:
def make_sparse_embedding(texts: list[str]) -> list[SparseEmbedding]:
    return list(sparse_model.embed(texts, batch_size=256))

In [12]:
rows = []

for chunk in chunks:
    text = chunk.page_content
    embeddings = make_sparse_embedding([text])
    rows.append(
        {
            "text": text,
            "source": chunk.metadata["source"],
            "page_label": chunk.metadata["page"],
            "sparse_embedding_values": embeddings[0].values,
            "sparse_embedding_indices": embeddings[0].indices,
        }
    )


In [13]:
df = pl.DataFrame(rows)

In [14]:
def make_points(df) -> list[PointStruct]:
    points = []
    idx = 1
    for row in df.iter_rows(named=True):
        sparse_vector = SparseVector(
            indices=row["sparse_embedding_indices"].tolist(),
            values=row["sparse_embedding_values"].tolist(),
        )
        point = PointStruct(
            id=idx,
            payload={
                "page": row["page_label"],
                "source": row["source"],
                "text": row["text"],
            },
            vector={
                "text-sparse": sparse_vector,
            },
        )
        points.append(point)
        idx += 1
    return points

In [15]:
points: list[PointStruct] = make_points(df)

In [19]:
batch_size = 1000

for i in tqdm(range(0, len(points), batch_size)):
    batch = points[i : i + batch_size]
    qdrant_client.upsert(collection_name, batch)

100%|██████████| 458/458 [00:49<00:00,  9.31it/s]


In [22]:
snapshot_info = qdrant_client.create_snapshot(
    collection_name=collection_name, wait=True
)

snapshot_url = f"/collections/test_collection/snapshots/{snapshot_info.name}"

ResponseHandlingException: timed out

In [None]:
import os

import requests


In [None]:
# Create a directory to store snapshots
os.makedirs("snapshots", exist_ok=True)

snapshot_name = os.path.basename(snapshot_url)
local_snapshot_path = os.path.join("snapshots", snapshot_name)

response = requests.get(snapshot_url)
with open(local_snapshot_path, "wb") as f:
    response.raise_for_status()
    f.write(response.content)

In [23]:
def search(query_text: str):
    # # Compute sparse and dense vectors
    query_sparse_vectors: list[SparseEmbedding] = make_sparse_embedding([query_text])
    # query_dense_vector: list[np.ndarray] = make_dense_embedding([query_text])

    search_results = qdrant_client.search_batch(
        collection_name=collection_name,
        requests=[
            SearchRequest(
                vector=NamedSparseVector(
                    name="text-sparse",
                    vector=SparseVector(
                        indices=query_sparse_vectors[0].indices.tolist(),
                        values=query_sparse_vectors[0].values.tolist(),
                    ),
                ),
                limit=10,
                with_payload=True,
            ),
        ],
    )

    return search_results

In [24]:
query_text = "reinforcement learning"
search_results = search(query_text)

  search_results = qdrant_client.search_batch(


In [25]:
search_results

[[ScoredPoint(id=453329, version=453, score=6.888141, payload={'page': 382, 'source': '/Users/carsten/Documents/Science/ReinforcementLearning/Reinforcement Learning An introduction.pdf', 'text': 'several publications that propose theories of behavioral vigor based on reinforcement\nlearning.\nWe turn now to the subject of learning when reinforcing stimuli occur well after the\nevents they reinforce. The mechanisms used by reinforcement learning algorithms to\nenable learning with delayed reinforcement—eligibility traces and TD learning—closely\ncorrespond to psychologists’ hypotheses about how animals can learn under these condi-\ntions.\n14.4\nDelayed Reinforcement'}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id=300346, version=300, score=6.8126993, payload={'page': 47, 'source': '/Users/carsten/Documents/Science/NotCategorized/Foundations of Deep Reinforcement Learning.pdf', 'text': '1.6 Reinforcement Learning and Supervised Learning\n19\n1.6\nReinforcement Learni

In [None]:
search_results[0][0]

ScoredPoint(id=6863, version=0, score=3.4013352394104004, payload={'page': 198, 'source': '/Users/carsten/Documents/Science/LLM/Learn Generative AI with PyTorch.pdf', 'text': '# obtain average encoding for each group\n_,_,women_g_encodings=vae.encoder(women_g_batch)\nwomen_g_encoding=women_g_encodings.mean(dim=0)\n_,_,men_ng_encodings=vae.encoder(men_ng_batch)\nmen_ng_encoding=men_ng_encodings.mean(dim=0)\n_,_,women_ng_encodings=vae.encoder(women_ng_batch)\nwomen_ng_encoding=women_ng_encodings.mean(dim=0)                  #D\n# decode for each group\nwomen_g_recon=vae.decoder(women_g_encoding.unsqueeze(0))\nmen_ng_recon=vae.decoder(men_ng_encoding.unsqueeze(0))'}, vector=None, shard_key=None, order_value=None)

In [None]:
search_results[0][0].payload["source"]

'/Users/carsten/Documents/Science/LLM/Learn Generative AI with PyTorch.pdf'

In [None]:
import webbrowser

result = search_results[0][1]

url = f"file://{result.payload['source']}#page={result.payload['page']}"
url = url.replace(" ", "%20")
print(url)

file:///Users/carsten/Documents/Science/LLM/Learn%20Generative%20AI%20with%20PyTorch.pdf#page=194


In [None]:
from selenium import webdriver

driver = webdriver.Chrome()  # Optional argument, if not specified will search path.
driver.get(url)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
url2 = "file:///Users/carsten/Documents/Science/LLM/Learn%20Generative%20AI%20with%20PyTorch.pdf#page=198"
webbrowser.open(url2)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


True

In [None]:
from selenium import webdriver

driver = webdriver.Chrome()  # Optional argument, if not specified will search path.
driver.get(url2)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
