# Semantic Search 101: Build your first Semantic Search Engine in 5 minutes


Reference: Qdrant beginner tutorial — https://qdrant.tech/documentation/beginner-tutorials/search-beginners/

In [4]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models

# Text-embedding model used for both the book descriptions and the search queries.
# To see every available model name, inspect TextEmbedding.list_supported_models().
embedding_model = TextEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [17]:
# Fetch the metadata entry for exactly the model loaded into `embedding_model`.
# Its "dim" value is used as the collection's vector size, so it has to describe
# the same model that produces the embeddings. A substring test such as
# "all-MiniLM" can match more than one entry and silently pick the wrong one.
model = next(
    (
        entry
        for entry in TextEmbedding.list_supported_models()
        if entry["model"] == "sentence-transformers/all-MiniLM-L6-v2"
    ),
    None,
)
if model is None:
    # Fail with a clear message instead of an opaque IndexError.
    raise ValueError(
        "sentence-transformers/all-MiniLM-L6-v2 is not in "
        "TextEmbedding.list_supported_models()"
    )
model

{'model': 'sentence-transformers/all-MiniLM-L6-v2',
 'dim': 384,
 'description': 'Text embeddings, Unimodal (text), English, 256 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year.',
 'license': 'apache-2.0',
 'size_in_GB': 0.09,
 'sources': {'url': 'https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz',
  'hf': 'qdrant/all-MiniLM-L6-v2-onnx'},
 'model_file': 'model.onnx'}

## Add the dataset

In [5]:
# Sample dataset: one dict per book with the keys "name", "description",
# "author" and "year" (in that order). Each row below is
# (name, description, author, year).
documents = [
    {"name": title, "description": blurb, "author": writer, "year": published}
    for title, blurb, writer, published in (
        (
            "The Time Machine",
            "A man travels through time and witnesses the evolution of humanity.",
            "H.G. Wells",
            1895,
        ),
        (
            "Ender's Game",
            "A young boy is trained to become a military leader in a war against an alien race.",
            "Orson Scott Card",
            1985,
        ),
        (
            "Brave New World",
            "A dystopian society where people are genetically engineered and conditioned to conform to a strict social hierarchy.",
            "Aldous Huxley",
            1932,
        ),
        (
            "The Hitchhiker's Guide to the Galaxy",
            "A comedic science fiction series following the misadventures of an unwitting human and his alien friend.",
            "Douglas Adams",
            1979,
        ),
        (
            "Dune",
            "A desert planet is the site of political intrigue and power struggles.",
            "Frank Herbert",
            1965,
        ),
        (
            "Foundation",
            "A mathematician develops a science to predict the future of humanity and works to save civilization from collapse.",
            "Isaac Asimov",
            1951,
        ),
        (
            "Snow Crash",
            "A futuristic world where the internet has evolved into a virtual reality metaverse.",
            "Neal Stephenson",
            1992,
        ),
        (
            "Neuromancer",
            "A hacker is hired to pull off a near-impossible hack and gets pulled into a web of intrigue.",
            "William Gibson",
            1984,
        ),
        (
            "The War of the Worlds",
            "A Martian invasion of Earth throws humanity into chaos.",
            "H.G. Wells",
            1898,
        ),
        (
            "The Hunger Games",
            "A dystopian society where teenagers are forced to fight to the death in a televised spectacle.",
            "Suzanne Collins",
            2008,
        ),
        (
            "The Andromeda Strain",
            "A deadly virus from outer space threatens to wipe out humanity.",
            "Michael Crichton",
            1969,
        ),
        (
            "The Left Hand of Darkness",
            "A human ambassador is sent to a planet where the inhabitants are genderless and can change gender at will.",
            "Ursula K. Le Guin",
            1969,
        ),
        (
            "The Three-Body Problem",
            "Humans encounter an alien civilization that lives in a dying system.",
            "Liu Cixin",
            2008,
        ),
    )
]

In [21]:
# Embed every book description; embeddings[i] corresponds to documents[i]
# because the descriptions are passed in document order.
embeddings = [
    *embedding_model.embed([book["description"] for book in documents])
]

## Create a collection

In [18]:
# Qdrant client opened with the ":memory:" location, so nothing outside this
# kernel session is needed to run the notebook.
client = QdrantClient(":memory:")

# The vector size is taken from the model metadata so the collection matches the
# embeddings; similarity is measured with cosine distance.
client.create_collection(
    vectors_config=models.VectorParams(
        distance=models.Distance.COSINE,
        size=model["dim"],
    ),
    collection_name="my_books",
)

True

## Upload data to collection

In [22]:
# Store one point per book: the list position is used as the point id, the
# matching embedding as the vector, and the full book dict as the payload.
client.upload_points(
    points=[
        models.PointStruct(
            id=position,
            vector=embeddings[position],
            payload=book,
        )
        for position, book in enumerate(documents)
    ],
    collection_name="my_books",
)

## Query

In [25]:
# Embed the search phrase (embed() returns an iterable of vectors; a single
# string gives one, so take the first) and fetch the three closest books.
hits = client.query_points(
    limit=3,
    query=next(iter(embedding_model.embed("alien invasion"))),
    collection_name="my_books",
).points

# Show each matching book's payload together with its score.
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'name': 'The War of the Worlds', 'description': 'A Martian invasion of Earth throws humanity into chaos.', 'author': 'H.G. Wells', 'year': 1898} score: 0.5700932336027419
{'name': "The Hitchhiker's Guide to the Galaxy", 'description': 'A comedic science fiction series following the misadventures of an unwitting human and his alien friend.', 'author': 'Douglas Adams', 'year': 1979} score: 0.5040469745831534
{'name': 'The Three-Body Problem', 'description': 'Humans encounter an alien civilization that lives in a dying system.', 'author': 'Liu Cixin', 'year': 2008} score: 0.4590294008142361


## Narrow down the query

In [26]:
# Same search phrase as before, but restricted by a payload filter to books
# whose "year" is 2000 or later; only the single best match is returned.
hits = client.query_points(
    limit=1,
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="year",
                range=models.Range(gte=2000),
            )
        ]
    ),
    query=next(iter(embedding_model.embed("alien invasion"))),
    collection_name="my_books",
).points

# Show each matching book's payload together with its score.
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'name': 'The Three-Body Problem', 'description': 'Humans encounter an alien civilization that lives in a dying system.', 'author': 'Liu Cixin', 'year': 2008} score: 0.4590294008142361
