### Contents

[Imports](#Imports)

[bottom](#bottom)

### Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from os.path import join as pjoin
from pprint import pprint
import sys 
import json 
import itertools 
from tqdm import tqdm
from datetime import datetime, date, timedelta
from load_dotenv import load_dotenv
load_dotenv() # OPENAI_API_KEY

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import display, clear_output

In [3]:
matplotlib.rcParams['figure.figsize'] = (7, 7)
sns.set_style('whitegrid')

---

link to the tutorial: https://qdrant.tech/documentation/tutorials/neural-search/

1. Run qdrant container

```bash
docker run -d -p 6333:6333 \
    -v $(pwd)/qdrant_storage:/qdrant/storage \
    --name qdrant \
    qdrant/qdrant
```

2. Loading documents and embed them

In [4]:
# loading documents
import requests

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
docs_raw = docs_response.json()

documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

len(documents)

948

In [7]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L12-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
embeddings_array = model.encode([doc['text'] for doc in documents], show_progress_bar=True)
print(f"embeddings_array: {embeddings_array.shape[0]:,} x {embeddings_array.shape[1]:,}")

embeddings_array: 948 x 384


In [12]:
print(f"cwd: {os.getcwd()}")
np.save(pjoin(os.getcwd(), "startup_vectors.npy"), embeddings_array, allow_pickle=False)

cwd: /Users/antonandreytsev/Desktop/llm_zoomcamp/hw3


3. Initialise qdrant

In [5]:
# Import client library
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

qd_client = QdrantClient("http://localhost:6333")
qd_client

<qdrant_client.qdrant_client.QdrantClient at 0x110489d00>

In [13]:
qd_client.recreate_collection(
    collection_name="ml_zoomcamp_faq",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

  qd_client.recreate_collection(


True

In [24]:
# payload is now an iterator over startup data
payload = (doc for doc in documents)

# Load all vectors into memory, numpy array works as iterable for itself.
# Other option would be to use Mmap, if you don't want to load all data into RAM
vectors = np.load("./startup_vectors.npy")

In [25]:
qd_client.upload_collection(
    collection_name="ml_zoomcamp_faq",
    vectors=vectors,
    payload=payload,
    ids=None,  # Vector ids will be assigned automatically
    batch_size=256,  # How many vectors will be uploaded in a single request?
)
qd_client

<qdrant_client.qdrant_client.QdrantClient at 0x110489d00>

4. Search for results

In [28]:
from pprint import pprint 

text = 'How can I run spark with docker?'
vector = model.encode(text).tolist()

# Use `vector` for search for closest vectors in the collection
collection_name = 'ml_zoomcamp_faq'
search_result = qd_client.search(
    collection_name=collection_name,
    query_vector=vector,
    query_filter=None,  # If you don't want any filters for now
    limit=5,  # 5 the most closest results is enough
)
# `search_result` contains found vector ids with similarity scores along with the stored payload
# In this function you are interested in payload only
payloads = [hit.payload for hit in search_result]
print(f"payloads:")
pprint(payloads)

payloads:
[{'course': 'data-engineering-zoomcamp',
  'question': 'Python Kafka: ./build.sh: Permission denied Error',
  'section': 'Module 6: streaming with kafka',
  'text': 'Run this command in terminal in the same directory '
          '(/docker/spark):\n'
          'chmod +x build.sh'},
 {'course': 'data-engineering-zoomcamp',
  'question': 'Spark docker-compose setup',
  'section': 'Module 5: pyspark',
  'text': 'To run spark in docker setup\n'
          '1. Build bitnami spark docker\n'
          'a. clone bitnami repo using command\n'
          'git clone https://github.com/bitnami/containers.git\n'
          '(tested on commit 9cef8b892d29c04f8a271a644341c8222790c992)\n'
          'b. edit file `bitnami/spark/3.3/debian-11/Dockerfile` and update '
          'java and spark version as following\n'
          '"python-3.10.10-2-linux-${OS_ARCH}-debian-11" \\\n'
          '"java-17.0.5-8-3-linux-${OS_ARCH}-debian-11" \\\n'
          'reference: https://github.com/bitnami/containers

In [31]:
from qdrant_client.models import Filter

# Define a filter for course
course_filter = Filter(**{
    "must": [{
        "key": "course", # Store city information in a field of the same name 
        "match": { # This condition checks if payload field has the requested value
            "value": 'data-engineering-zoomcamp'
        }
    }]
})

vector = model.encode(text).tolist()

# Use `vector` for search for closest vectors in the collection
collection_name = 'ml_zoomcamp_faq'
search_result = qd_client.search(
    collection_name=collection_name,
    query_vector=vector,
    query_filter=course_filter,
    limit=5,  # 5 the most closest results is enough
)
# `search_result` contains found vector ids with similarity scores along with the stored payload
# In this function you are interested in payload only
payloads = [hit.payload for hit in search_result]
print(f"payloads:")
pprint(payloads)

payloads:
[{'course': 'data-engineering-zoomcamp',
  'question': 'Python Kafka: ./build.sh: Permission denied Error',
  'section': 'Module 6: streaming with kafka',
  'text': 'Run this command in terminal in the same directory '
          '(/docker/spark):\n'
          'chmod +x build.sh'},
 {'course': 'data-engineering-zoomcamp',
  'question': 'Spark docker-compose setup',
  'section': 'Module 5: pyspark',
  'text': 'To run spark in docker setup\n'
          '1. Build bitnami spark docker\n'
          'a. clone bitnami repo using command\n'
          'git clone https://github.com/bitnami/containers.git\n'
          '(tested on commit 9cef8b892d29c04f8a271a644341c8222790c992)\n'
          'b. edit file `bitnami/spark/3.3/debian-11/Dockerfile` and update '
          'java and spark version as following\n'
          '"python-3.10.10-2-linux-${OS_ARCH}-debian-11" \\\n'
          '"java-17.0.5-8-3-linux-${OS_ARCH}-debian-11" \\\n'
          'reference: https://github.com/bitnami/containers

### bottom