In [2]:
# We will use startup descriptions in this neural search demo
# Data source: https://startups-list.com/
# It contains name, short descrition, logo and location of startups.
!wget https://storage.googleapis.com/generall-shared-data/startups_demo.json

--2025-02-21 11:59:02--  https://storage.googleapis.com/generall-shared-data/startups_demo.json
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.70.123, 142.250.66.27, 172.217.174.91, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.70.123|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22205751 (21M) [application/json]
Saving to: ‘startups_demo.json’


2025-02-21 11:59:06 (5.22 MB/s) - ‘startups_demo.json’ saved [22205751/22205751]



In [3]:
# We use SentenceTransformer pre-trained models to convert our text into vectors.
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): later cells also import pandas and qdrant_client, which this cell
# does not install — confirm they are already available in the environment.
%pip install sentence-transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import pandas as pd
from tqdm.notebook import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# This code downloads and creates a pre-trained sentence encoder.

# all-MiniLM-L6-v2 is a distilled (lightweight) model optimized for fast inference.
# NOTE(review): the original note called it a distilled version of the MPNet model;
# confirm the lineage against the model card before relying on that statement.
# The full list of available models can be found here: https://www.sbert.net/docs/pretrained_models.html
model = SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
# Load the downloaded startup records into a DataFrame.
# lines=True tells pandas to read the file as one JSON object per line.
df = pd.read_json("./startups_demo.json", lines=True)

In [8]:
# Build one text per startup in the form "<alt>. <description>" and embed them all.
# Handing the full list to a single encode() call lets the model work through it in
# batches, which reduces overhead and significantly speeds up the process.
startup_texts = [
    alt + ". " + description
    for alt, description in zip(df["alt"], df["description"])
]
vectors = model.encode(startup_texts, show_progress_bar=True)

Batches: 100%|██████████| 1265/1265 [01:05<00:00, 19.30it/s]


In [9]:
# Now we have all our descriptions converted into vectors.
# We have 40474 vectors of 384 dimensions. The output layer of the model has this dimension.
vectors.shape

(40474, 384)

In [15]:
# Save the vectors to disk. You can download this saved file and continue with
# the rest of the tutorial without re-running the encoding step.
np.save("vectors.npy", vectors, allow_pickle=False)

In [11]:
# Import client library
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

# Connect to a Qdrant instance at localhost:6333.
# NOTE(review): this presumes a Qdrant server is already running locally — confirm before running.
client = QdrantClient("http://localhost:6333")


In [12]:
# Create the "startups" collection only when it is missing, so this cell can be re-run.
# The vector size (384) matches the width of the embeddings produced above;
# similarity between vectors is measured with cosine distance.
startups_exists = client.collection_exists("startups")
if not startups_exists:
    startups_vectors_config = VectorParams(distance=Distance.COSINE, size=384)
    client.create_collection(
        vectors_config=startups_vectors_config,
        collection_name="startups",
    )


In [16]:
# Read the startup records eagerly and close the file as soon as we are done.
# The file holds one JSON document per line and is read explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
with open("./startups_demo.json", encoding="utf-8") as fd:
    # payload is a list with one dict per startup. Unlike a lazy map() iterator it is
    # not exhausted after the first pass, so the upload cell can be re-run.
    # The source file is about 21 MB, so holding it in memory is inexpensive.
    payload = [json.loads(line) for line in fd]

# Load all vectors into memory; a numpy array can be iterated row by row.
# Another option is to pass mmap_mode to np.load if you don't want to load all data into RAM.
vectors = np.load("vectors.npy")


In [17]:
# Upload the vectors together with their payloads into the "startups" collection.
# Vectors and payloads are passed as parallel sequences.
# NOTE(review): presumably they are paired by position — confirm against the client docs.
# NOTE(review): with ids=None, re-running this cell presumably inserts the same
# points again under new ids — confirm, and recreate the collection if needed.
client.upload_collection(
    collection_name="startups",
    vectors=vectors,
    payload=payload,
    ids=None,  # Vector ids will be assigned automatically
    batch_size=256,  # How many vectors will be uploaded in a single request?
)
