# Semantic Search 101: Build a Neural Search Service

This notebook follows the official Qdrant tutorial: https://qdrant.tech/documentation/beginner-tutorials/neural-search/

In [1]:
!curl https://storage.googleapis.com/generall-shared-data/startups_demo.json -O startups_demo.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 21.1M  100 21.1M    0     0  4564k      0  0:00:04  0:00:04 --:--:-- 5832k
curl: (6) Could not resolve host: startups_demo.json


In [2]:
# Print the first line of the downloaded file to inspect one record.
!head -n 1 startups_demo.json

{"name":"SaferCodes","images":"https:\/\/safer.codes\/img\/brand\/logo-icon.png","alt":"SaferCodes Logo QR codes generator system forms for COVID-19","description":"QR codes systems for COVID-19.\nSimple tools for bars, restaurants, offices, and other small proximity businesses.","link":"https:\/\/safer.codes","city":"Chicago"}


In [7]:
import json

import numpy as np
import pandas as pd
from fastembed import TextEmbedding
from tqdm.notebook import tqdm

In [13]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Find the metadata entry for `model_name` among the models fastembed reports as
# supported. A plain for/break loop would silently leave `model` bound to the
# last listed entry when nothing matches, so fail loudly instead.
model = next(
    (
        entry
        for entry in TextEmbedding.list_supported_models()
        if entry["model"] == model_name
    ),
    None,
)
if model is None:
    raise ValueError(f"{model_name!r} is not listed by TextEmbedding.list_supported_models()")

# Display the metadata (embedding dimension, license, size, ...).
model

{'model': 'sentence-transformers/all-MiniLM-L6-v2',
 'dim': 384,
 'description': 'Text embeddings, Unimodal (text), English, 256 input tokens truncation, Prefixes for queries/documents: not necessary, 2021 year.',
 'license': 'apache-2.0',
 'size_in_GB': 0.09,
 'sources': {'url': 'https://storage.googleapis.com/qdrant-fastembed/sentence-transformers-all-MiniLM-L6-v2.tar.gz',
  'hf': 'qdrant/all-MiniLM-L6-v2-onnx'},
 'model_file': 'model.onnx'}

In [14]:
# Instantiate the fastembed text-embedding model identified by `model_name`.
embedding_model = TextEmbedding(model_name=model_name)

## Prepare sample dataset

In [15]:
# The dataset is newline-delimited JSON (one record per line), hence lines=True.
df = pd.read_json("startups_demo.json", lines=True)
# Preview the first rows.
df.head()

Unnamed: 0,name,images,alt,description,link,city
0,SaferCodes,https://safer.codes/img/brand/logo-icon.png,SaferCodes Logo QR codes generator system form...,QR codes systems for COVID-19.\nSimple tools f...,https://safer.codes,Chicago
1,Human Practice,https://d1qb2nb5cznatu.cloudfront.net/startups...,Human Practice - health care information tech...,Point-of-care word of mouth\nPreferral is a mo...,http://humanpractice.com,Chicago
2,StyleSeek,https://d1qb2nb5cznatu.cloudfront.net/startups...,StyleSeek - e-commerce fashion mass customiza...,Personalized e-commerce for lifestyle products...,http://styleseek.com,Chicago
3,Scout,https://d1qb2nb5cznatu.cloudfront.net/startups...,Scout - security consumer electronics interne...,Hassle-free Home Security\nScout is a self-ins...,http://www.scoutalarm.com,Chicago
4,Invitation codes,https://invitation.codes/img/inv-brand-fb3.png,Invitation App - Share referral codes community,The referral community\nInvitation App is a so...,https://invitation.codes,Chicago


In [16]:
# Build the text to embed for each startup: its `alt` text followed by its
# description, joined with ". ".
startup_texts = []
for startup in df.itertuples():
    startup_texts.append(startup.alt + ". " + startup.description)

# `embed` is consumed into a list so all embeddings are materialised here.
vectors = list(embedding_model.embed(startup_texts))

In [17]:
# Persist the embeddings to disk so they can be reloaded without re-embedding.
# allow_pickle=False makes np.save refuse to fall back to pickling object arrays.
np.save("startup_vectors.npy", vectors, allow_pickle=False)

## Upload data to Qdrant

In [18]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

In [19]:
# ":memory:" selects qdrant-client's local in-memory mode (no server needed);
# NOTE(review): data presumably does not survive a kernel restart — confirm.
client = QdrantClient(":memory:")

In [20]:
# Create the "startups" collection only when it is missing, so re-running this
# cell does not attempt to create it twice.
if not client.collection_exists("startups"):
    client.create_collection(
        collection_name="startups",
        # size=384 matches the 'dim' reported for all-MiniLM-L6-v2 in the model
        # metadata; vectors are compared by cosine distance.
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )

In [22]:
# Read every startup record up front inside a `with` block so the file handle is
# closed immediately. A lazy `map` over an open file would leak the handle and
# be exhausted after one pass, making the upload cell non-repeatable.
with open("startups_demo.json", encoding="utf-8") as fd:
    # One JSON document per line -> one payload dict per startup.
    payload = [json.loads(line) for line in fd]

# Load all vectors into memory; a NumPy array works as an iterable by itself.
# Another option is to memory-map the file (np.load(..., mmap_mode="r")) if you
# don't want to load all data into RAM.
vectors = np.load("startup_vectors.npy")

In [23]:
# Upload the vectors together with their payloads into the "startups" collection.
# `vectors` and `payload` are consumed in parallel, so they must be in the same order.
client.upload_collection(
    collection_name="startups",
    vectors=vectors,
    payload=payload,
    ids=None,  # Vector ids will be assigned automatically
    batch_size=256,  # How many vectors will be uploaded in a single request?
)

  client.upload_collection(


## Build the search API

In [34]:
def search(text: str, limit: int = 5) -> list:
    """Return the payloads of the startups closest to a free-text query.

    Args:
        text: Query text; it is embedded with the notebook's `embedding_model`.
        limit: Maximum number of results to return (defaults to 5).

    Returns:
        A list with the payload of each hit, in the order returned by Qdrant.
    """
    # Convert the text query into a vector. `embed` yields one embedding per
    # input, so take the first (and only) one without building a list.
    vector = next(iter(embedding_model.embed(text)))

    # Use `vector` to search for the closest vectors in the collection.
    hits = client.query_points(
        collection_name="startups",
        query=vector,
        query_filter=None,  # no payload filtering
        limit=limit,
    ).points

    # Only the stored payload is returned; ids and scores are dropped.
    return [hit.payload for hit in hits]

In [36]:
# Example query: startups related to robotics and AI.
search("robotic ai")

[{'name': 'Orchid Robotics',
  'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/281423-bd01451f06bd2888ebf239580d502263-thumb_jpg.jpg?buster=1392329500',
  'alt': 'Orchid Robotics -  robotics machine learning artificial intelligence industrial automation',
  'description': "Advanced Software for Robots\nGoogle's machine learning techniques applied to robotics.",
  'link': 'http://www.orchidrobotics.com',
  'city': 'Boston'},
 {'name': 'Robotbase',
  'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/190707-d6f705aa7d803faa6bd5c33c281698b4-thumb_jpg.jpg?buster=1420224362',
  'alt': 'Robotbase -  robotics artificial intelligence internet of things hardware + software',
  'description': "The World's First Artificial Intelligence Personal Robot\nORDER NOW ON KICKSTARTER\nhttps://www.kickstarter.com/projects/403524037/personal-robot\nMeet the world’s first Artificial Intelligence Personal Robot. \xa0\nShe's a friend, a multi-talented personal assistant, an awesome photog

In [37]:
# Example query: a single broad keyword.
search("financial")

[{'name': 'Finomial',
  'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/65946-437a75bc2055ce89a40402860c2a53b6-thumb_jpg.jpg?buster=1408376313',
  'alt': 'Finomial -  finance',
  'description': '',
  'link': 'http://finomial.com',
  'city': 'New York'},
 {'name': 'U.S. Fiduciary',
  'images': 'https://angel.co/images/shared/nopic_startup.png',
  'alt': 'U.S. Fiduciary -  finance',
  'description': '',
  'link': 'http://www.usfiduciary.com',
  'city': 'Houston'},
 {'name': 'American Express',
  'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/42490-41d65934240d652086fde74873a443fa-thumb_jpg.jpg?buster=1326850941',
  'alt': 'American Express -  finance',
  'description': '',
  'link': 'https://www.americanexpress.com',
  'city': 'New York'},
 {'name': 'Paymentus',
  'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/63392-f817eedec057e834b69dacb82c5c1f38-thumb_jpg.jpg?buster=1408893681',
  'alt': 'Paymentus -  finance',
  'description': '',
  'link': 'http