# QDRANT - Feedback Collection Quickstart

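A quickstart notebook for getting started with Qdrant: pull labelled feedback from BigQuery, upsert it into a Qdrant collection, and run a similarity search against it.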

In [None]:
import os
import numpy as np
from pprint import pprint
from src.utils.bigquery import query_bigquery
from datetime import datetime
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance

from src.collection.set_collection import (
    create_vectors_from_data,
    create_collection,
    upsert_to_collection_from_vectors,
)
from src.collection.query_collection import get_top_k_results

# Configuration is read from the environment; a variable that is not set
# comes back as None.
(
    PUBLISHING_PROJECT_ID,
    LABELLED_FEEDBACK_DATASET,
    PUBLISHING_VIEW,
    OPENAI_LABELLED_FEEDBACK_TABLE,
) = (
    os.environ.get(env_var_name)
    for env_var_name in (
        "PUBLISHING_PROJECT_ID",
        "LABELLED_FEEDBACK_DATASET",
        "PUBLISHING_VIEW",
        "OPENAI_LABELLED_FEEDBACK_TABLE",
    )
)

In [None]:
# Query BQ to pull the human labelled feedback data
query_read = """

SELECT
    feedback.type,
    feedback.created,
    feedback.subject_page_path,
    CONCAT('https://www.gov.uk', feedback.subject_page_path) AS reconstructed_path,
    feedback.feedback_record_id,
    feedback.response_value,
    feedback.embeddings,
    feedback.sentiment,
    feedback.spam_classification,
    feedback.spam_probability,
    labels.labels,
    labels.urgency
FROM @PUBLISHING_VIEW feedback
JOIN @labelled_feedback_table labels
  ON feedback.feedback_record_id=labels.id
WHERE feedback.created > DATE("2023-08-01")
"""

# The view/table names are spliced into the SQL as text (BigQuery query
# parameters cannot stand in for identifiers).
# Maps each placeholder to (environment variable name, value read from it).
table_substitutions = {
    "@labelled_feedback_table": (
        "OPENAI_LABELLED_FEEDBACK_TABLE",
        OPENAI_LABELLED_FEEDBACK_TABLE,
    ),
    "@PUBLISHING_VIEW": ("PUBLISHING_VIEW", PUBLISHING_VIEW),
}

# Fail fast when a name is missing: otherwise the literal text "None" would be
# substituted into the query and only surface later as an obscure BigQuery error.
unset_env_vars = [
    env_name for env_name, env_value in table_substitutions.values() if not env_value
]
if unset_env_vars:
    raise EnvironmentError(
        "Set these environment variables before running the query: "
        + ", ".join(unset_env_vars)
    )

for placeholder, (env_name, table_name) in table_substitutions.items():
    query_read = query_read.replace(placeholder, table_name)

# Call the function to execute the query
docs = query_bigquery(
    PUBLISHING_PROJECT_ID,
    LABELLED_FEEDBACK_DATASET,
    query_read,
)

In [None]:
# Client for a Qdrant server on this machine
# (assumes one is already listening on port 6333 — TODO confirm before running)
qdrant_host = "localhost"
qdrant_port = 6333
client = QdrantClient(qdrant_host, port=qdrant_port)

In [None]:
# Collection that the rest of the notebook upserts into and searches
collection_name = "feedback_test_collection_1"

# Request 768-dimensional vectors with the dot-product distance.
# NOTE(review): 768 presumably matches the stored feedback embeddings — confirm.
create_collection(
    client,
    collection_name,
    distance_metric=Distance.DOT,
    size=768,
)

In [None]:
# Turn the queried rows into points ready for upsertion: each row's
# "feedback_record_id" is passed as the id key and "embeddings" as the
# embedding key.
vector_keys = {"id_key": "feedback_record_id", "embedding_key": "embeddings"}
points_to_upsert = create_vectors_from_data(docs, **vector_keys)

In [None]:
# Sanity check: show the type of the first converted point
first_point = points_to_upsert[0]
type(first_point)

In [None]:
# Write the converted points into the collection created earlier
upsert_to_collection_from_vectors(
    client,
    collection_name,
    data=points_to_upsert,
)

In [None]:
# Remove a collection that is no longer needed.
# "stale_collection_name" is a placeholder: put the real collection name here.
stale_collection_name = "stale_collection_name"
client.delete_collection(collection_name=stale_collection_name)

In [None]:
# Local sentence-transformers model used to embed the search keyword
embedding_model_name = "all-mpnet-base-v2"
model = SentenceTransformer(embedding_model_name)

In [None]:
# Keyword to search the feedback collection for, embedded with the local model
query_text = "tax"
query_embedding = model.encode(query_text)

In [None]:
# Show the length of the query embedding; it needs to match the vector size
# the collection was created with (size=768).
len(query_embedding)

In [None]:
# Optional filter values kept for reference (not passed to the search below):
#   filter_key = "subject_page_path"
#   filter_values = None
#   filter_values = ["/government/publications/childcare-service-compensation"]

# Fetch the 5 closest matches for the query embedding
top_k = 5
search_result = get_top_k_results(
    client,
    collection_name,
    query_embedding,
    k=top_k,
)

In [None]:
# Pretty-print the top-k search results
pprint(search_result)

------

## LangChain: Create Collection Implementation


In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings

In [None]:
loader = TextLoader("../path/to/docs")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()

In [None]:
# In memory
qdrant = Qdrant.from_documents(
    docs,
    embeddings,
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="my_documents",
)

In [None]:
# with Docker / K8s
url = "<---qdrant url here --->"
qdrant = Qdrant.from_documents(
    docs,
    embeddings,
    url=url,
    prefer_grpc=True,
    collection_name="my_documents",
    # force_recreate=True,
)

In [None]:
# Text to search for (empty placeholder: fill in before running)
query = ""
# Each result is a (document, score) pair
found_docs = qdrant.similarity_search_with_score(query=query)

In [None]:
# Show the content and score of the first returned match
first_match = found_docs[0]
document, score = first_match
print(document.page_content, f"\nScore: {score}", sep="\n")

In [None]:
# Generate a random vector for testing: a list of 768 floats drawn uniformly
# from [0, 1). The generator is seeded so a fresh kernel run reproduces the
# same vector.
VECTOR_SIZE = 768
RANDOM_SEED = 42
random_numbers = np.random.default_rng(RANDOM_SEED).random(VECTOR_SIZE).tolist()

# Display the first 10 numbers
random_numbers[:10], len(random_numbers)