In [21]:
import qdrant_client
import requests
from fastembed import SparseEmbedding
from qdrant_client.models import (
    Distance,
    NamedSparseVector,
    SearchRequest,
    SparseIndexParams,
    SparseVector,
    SparseVectorParams,
    VectorParams,
)
from tqdm import tqdm

import create_embeddings as ce

In [8]:
# Root directory handed to ce.read_pdfs() in the next cell.
# NOTE(review): hard-coded absolute, machine-specific path — the notebook will not run
# unchanged on another machine; consider making this configurable.
rootdir = "/Users/carsten/Documents/Science/"

In [9]:
# Load the PDFs found under rootdir via the project helper module.
# Later cells access .page_content and .metadata on the entries; the exact granularity
# (per page / per chunk) is defined in create_embeddings — TODO confirm there.
docs = ce.read_pdfs(rootdir)

/Users/carsten/Documents/Science/MCMC from Scratch a Practical Introduction to Markov Chain Monte Carlo.pdf
/Users/carsten/Documents/Science/Scientific Method, Statistical Method and the Speed of Light.pdf
/Users/carsten/Documents/Science/Statistical Thinking in Empirical Enquiry.pdf
/Users/carsten/Documents/Science/An Introduction to Sequential Monte Carlo.pdf
/Users/carsten/Documents/Science/Mastering Probabilistic Graphical Models using Python.pdf
/Users/carsten/Documents/Science/Machine Learning Algorithms in Depth.pdf
/Users/carsten/Documents/Science/StatProofBook.pdf
/Users/carsten/Documents/Science/Hamiltonian Monte Carlo Methods in Machine Learning.pdf
/Users/carsten/Documents/Science/An Introduction to MCMC for Machine Learning.pdf
/Users/carsten/Documents/Science/An Introduction to Probabilistic Programming.pdf
/Users/carsten/Documents/Science/Statistics/Think Stats 2nd ed.pdf
/Users/carsten/Documents/Science/Statistics/Introduction to Statistics at DTU.pdf
/Users/carsten/Doc

In [10]:
# Sanity check: number of entries returned by ce.read_pdfs().
len(docs)

101517

In [11]:
# Build the points that are upserted into Qdrant further down.
# NOTE(review): _create_points is an underscore-prefixed (private) helper of
# create_embeddings; consider exposing a public function for notebook use.
points = ce._create_points(docs)

In [22]:
# Connect to the local Qdrant instance.
# The class is imported explicitly instead of being reached through the module name:
# this cell rebinds `qdrant_client` from the imported module to the client instance,
# so the original `qdrant_client.QdrantClient(...)` raised AttributeError as soon as
# the cell was executed a second time. The variable name itself is kept because the
# following cells call methods on it.
from qdrant_client import QdrantClient

qdrant_client = QdrantClient("localhost:6333")

In [23]:
collection_name = "PDFs"

# Named dense vector: 1024 dimensions, compared by cosine distance.
dense_vectors = {
    "text-dense": VectorParams(size=1024, distance=Distance.COSINE),
}
# Named sparse vector whose index is not stored on disk.
sparse_vectors = {
    "text-sparse": SparseVectorParams(index=SparseIndexParams(on_disk=False)),
}

# Create the collection only when it is missing, so re-running the cell is harmless.
exists = qdrant_client.collection_exists(collection_name=collection_name)

if not exists:
    print("Creating collection")
    created = qdrant_client.create_collection(
        collection_name,
        vectors_config=dense_vectors,
        sparse_vectors_config=sparse_vectors,
    )
    print(f"Collection created {created}")

Creating collection
Collection created True


In [24]:
batch_size = 1000

# Upload the points slice by slice instead of sending them all in one request;
# tqdm shows one tick per uploaded slice.
batch_starts = range(0, len(points), batch_size)
for start in tqdm(batch_starts):
    batch = points[start : start + batch_size]
    qdrant_client.upsert(collection_name=collection_name, points=batch)

100%|██████████| 459/459 [00:49<00:00,  9.18it/s]


In [25]:
def search(query_text: str, limit: int = 10) -> list:
    """Run a sparse-vector search for ``query_text`` against the collection.

    The query is embedded with ``ce.make_sparse_embedding`` and sent as a single
    request of a batch search on the ``"text-sparse"`` named vector of the
    notebook-level ``collection_name``.

    Args:
        query_text: Free-text query to embed and search for.
        limit: Maximum number of hits requested for the query. Defaults to 10,
            the value that used to be hard-coded.

    Returns:
        The raw ``search_batch`` result: a list with one entry per request (here
        exactly one), each entry being the list of scored hits with payloads.
    """
    query_sparse_vectors: list[SparseEmbedding] = ce.make_sparse_embedding([query_text])
    # A single text was embedded, so only the first embedding is of interest.
    query_embedding = query_sparse_vectors[0]

    # NOTE(review): the notebook output shows a warning raised on this call —
    # search_batch appears to be deprecated in recent qdrant-client releases.
    # Migrating would change the shape of the returned value that later cells index
    # into, so the call is intentionally left as is; confirm before upgrading.
    search_results = qdrant_client.search_batch(
        collection_name=collection_name,
        requests=[
            SearchRequest(
                vector=NamedSparseVector(
                    name="text-sparse",
                    vector=SparseVector(
                        indices=query_embedding.indices.tolist(),
                        values=query_embedding.values.tolist(),
                    ),
                ),
                limit=limit,
                with_payload=True,
            ),
        ],
    )

    return search_results

In [26]:
# Example query; the result is kept in `search_results` for the inspection cells below.
query_text = "reinforcement learning"
search_results = search(query_text)

  search_results = qdrant_client.search_batch(


In [27]:
# Display the raw result: a list holding one list of hits per request.
search_results

[[ScoredPoint(id=453935, version=453, score=6.888141, payload={'page': 382, 'source': '/Users/carsten/Documents/Science/ReinforcementLearning/Reinforcement Learning An introduction.pdf', 'text': 'several publications that propose theories of behavioral vigor based on reinforcement\nlearning.\nWe turn now to the subject of learning when reinforcing stimuli occur well after the\nevents they reinforce. The mechanisms used by reinforcement learning algorithms to\nenable learning with delayed reinforcement—eligibility traces and TD learning—closely\ncorrespond to psychologists’ hypotheses about how animals can learn under these condi-\ntions.\n14.4\nDelayed Reinforcement'}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id=300952, version=300, score=6.8126993, payload={'page': 47, 'source': '/Users/carsten/Documents/Science/NotCategorized/Foundations of Deep Reinforcement Learning.pdf', 'text': '1.6 Reinforcement Learning and Supervised Learning\n19\n1.6\nReinforcement Learni

In [None]:
# First hit of the first (and only) request.
search_results[0][0]

ScoredPoint(id=6863, version=0, score=3.4013352394104004, payload={'page': 198, 'source': '/Users/carsten/Documents/Science/LLM/Learn Generative AI with PyTorch.pdf', 'text': '# obtain average encoding for each group\n_,_,women_g_encodings=vae.encoder(women_g_batch)\nwomen_g_encoding=women_g_encodings.mean(dim=0)\n_,_,men_ng_encodings=vae.encoder(men_ng_batch)\nmen_ng_encoding=men_ng_encodings.mean(dim=0)\n_,_,women_ng_encodings=vae.encoder(women_ng_batch)\nwomen_ng_encoding=women_ng_encodings.mean(dim=0)                  #D\n# decode for each group\nwomen_g_recon=vae.decoder(women_g_encoding.unsqueeze(0))\nmen_ng_recon=vae.decoder(men_ng_encoding.unsqueeze(0))'}, vector=None, shard_key=None, order_value=None)

In [None]:
# "source" payload entry of the first hit (the path of the PDF it came from).
search_results[0][0].payload["source"]

'/Users/carsten/Documents/Science/LLM/Learn Generative AI with PyTorch.pdf'

In [None]:
import webbrowser
from urllib.parse import quote

# Second hit of the first (and only) request.
result = search_results[0][1]

# Build a file:// URL pointing at the hit's page inside the source PDF.
# The path is percent-encoded with quote() (which leaves "/" untouched) rather than
# only replacing spaces: file names containing "#", "?", "%" or non-ASCII characters
# would otherwise produce a broken URL or a wrong fragment.
# NOTE(review): PDF viewers treat "#page=N" as 1-based; confirm that the "page" payload
# value uses the same numbering.
url = f"file://{quote(result.payload['source'])}#page={result.payload['page']}"
print(url)

file:///Users/carsten/Documents/Science/LLM/Learn%20Generative%20AI%20with%20PyTorch.pdf#page=194


In [None]:
# Open the URL built in the previous cell in a Selenium-controlled Chrome window.
# NOTE(review): the driver is never closed in the visible cells, so the browser
# session stays open until it is closed manually or driver.quit() is called.
from selenium import webdriver

driver = webdriver.Chrome()  # Optional argument, if not specified will search path.
driver.get(url)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Hand-written URL for the same PDF (page 198), opened with the system default browser.
# Relies on `webbrowser` having been imported by an earlier cell; the value returned by
# webbrowser.open() is displayed as the cell output.
url2 = "file:///Users/carsten/Documents/Science/LLM/Learn%20Generative%20AI%20with%20PyTorch.pdf#page=198"
webbrowser.open(url2)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


True

In [None]:
# Open url2 in a Selenium-controlled Chrome window.
# NOTE(review): apart from the URL this duplicates the earlier Selenium cell; it starts
# another Chrome session and rebinds `driver` without closing the previous one.
from selenium import webdriver

driver = webdriver.Chrome()  # Optional argument, if not specified will search path.
driver.get(url2)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Timeout in seconds for each HTTP call; without one, requests can wait indefinitely
# on an unresponsive server and block the notebook.
request_timeout = 30

# POST one of the loaded documents to the /document endpoint of the local service.
url = "http://localhost:8000/document"
payload = {
    "page_content": docs[2].page_content,
    "metadata": docs[2].metadata,
}
response = requests.post(url, json=payload, timeout=request_timeout)
# Fail here on an HTTP error status: this response is overwritten below, so an error
# would otherwise go unnoticed.
response.raise_for_status()

# Query the /semantic_search endpoint; `response` ends up holding this reply.
url = "http://localhost:8000/semantic_search"
payload = {
    "query": "MCMC",
    "limit": 3,
}
response = requests.post(url, json=payload, timeout=request_timeout)
response.raise_for_status()

ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /document (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11a9f6db0>: Failed to establish a new connection: [Errno 61] Connection refused'))