##### `Semantic Search with Qdrant and embeddings using HuggingFace Model`

In [1]:
## Import Libraries
import pandas as pd
import os
from tqdm import tqdm
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Batch, PointIdsList

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Read the Qdrant credentials from the .env file (override=True is passed to load_dotenv)
_ = load_dotenv(override=True)
qdrant_url = os.environ.get('QDRANT_URL')
qdrant_key = os.environ.get('QDRANT_API_KEY')

In [3]:
# ## Reading the big file once to slice it, then commenting this code
# FOLDER_PATH = os.path.join(os.getcwd(), 'datasets', 'dataset-semantic')
# df = pd.read_csv(os.path.join(FOLDER_PATH, 'medium_articles.csv'))
# ## The dataset is very large, so for simplicity we keep only the first 500 records, save them to a new file, and re-read that file afterwards.
# ## Make sure that the title is a string
# df['title'] = df['title'].astype(str)

# df = df.loc[:499, 'title']

# df = pd.DataFrame(df, columns=['title'])
# ## Add an arbitrary (sequential, starting at 3054) id column to the DataFrame
# df['id'] = np.arange(3054, 3054+len(df), 1)


# ## Let's consider this is our stock that we want to build semantic search for it.
# df.to_csv(os.path.join(FOLDER_PATH, 'articles_new.csv'), index=False)

In [4]:
## Load the article titles (and their ids) from the reduced CSV file
DATASET_DIR = os.path.join(os.getcwd(), 'dataset-semantic')
FILE_PATH = os.path.join(DATASET_DIR, 'articles_new.csv')
df = pd.read_csv(filepath_or_buffer=FILE_PATH)
df

Unnamed: 0,title,id
0,Mental Note Vol. 24,3054
1,Your Brain On Coronavirus,3055
2,Mind Your Nose,3056
3,The 4 Purposes of Dreams,3057
4,Surviving a Rod Through the Head,3058
...,...,...
495,Is It Worth to Invest In Mobile E-commerce App...,3549
496,Let go of these things for a happier 2021,3550
497,Not Everyone Will like Your Writing,3551
498,Is Technology Neutral?,3552


* `Embeddings using HuggingFace Model`

In [5]:
## Sentence-embedding model: all-MiniLM-L6-v2
## Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
HF_MODEL_NAME = 'all-MiniLM-L6-v2'

## Load the model on CPU and display its architecture
model_hugging = SentenceTransformer(HF_MODEL_NAME, device='cpu')
model_hugging

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [6]:
## Test the model: embed a single title once and reuse that vector below
## (the original encoded the same title twice, once per use)
sample_embedding = model_hugging.encode(df['title'].iloc[0])

## The embedding length is reused later as the vector size of the Qdrant collection
vect_length_hugging = len(sample_embedding)
print('Length of Hugging Face Model is:', vect_length_hugging)

## First 10 values
sample_embedding[:10]

Length of Hugging Face Model is: 384


array([-0.0131355 ,  0.06555433, -0.01977038, -0.03618841, -0.07723403,
        0.10825918,  0.0849674 ,  0.02290434,  0.02159446,  0.00035316],
      dtype=float32)

* `Qdrant in Code`

In [8]:
## Open a connection to the Qdrant instance using the credentials read from the environment
client = QdrantClient(url=qdrant_url, api_key=qdrant_key)

## Vector settings for the collection
collection_config = VectorParams(
    size=vect_length_hugging,   ## The length of the HuggingFace model embeddings
    distance=Distance.COSINE,   ## The similarity metric
    on_disk=True,               ## RAM optimizing
)

## (Re)create the collection with those settings
## NOTE(review): recreate_collection presumably drops an existing collection of the same name -- confirm before re-running
client.recreate_collection(
    collection_name='semantic-search-course',
    vectors_config=collection_config,
)

True

In [10]:
## Check Status of Collection
## Fetch the collection info once so both fields come from the same response
## (the original issued two separate get_collection requests)
collection_info = client.get_collection(collection_name='semantic-search-course')
collection_status = collection_info.status
collection_count_vectors = collection_info.vectors_count

print(f'Status is: {collection_status}')
print(f'Vectors Count is: {collection_count_vectors}')

Status is: green
Vectors Count is: 0


In [12]:
## Function for upserting data to Qdrant
def upsert_to_qdrant(df, batch_size=32, collection_name='semantic-search-course'):
    """Embed the titles of ``df`` and upsert them to Qdrant batch by batch.

    Uses the notebook-level ``model_hugging`` (embedding model) and ``client``
    (Qdrant client) objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column (text to embed) and an ``id`` column
        (point ids sent to Qdrant).
    batch_size : int, default 32
        Number of rows embedded and upserted per request.
    collection_name : str, default 'semantic-search-course'
        Name of the Qdrant collection to upsert into.

    Returns
    -------
    list of list
        One entry per failed batch, each entry being the list of ids in that
        batch. Empty when every batch succeeded (or ``df`` has no rows).

    Raises
    ------
    KeyError
        If ``df`` has no ``title`` or ``id`` column.
    """
    ## A list for the ids of failed batches
    failed_ids = []

    for batch_start in tqdm(range(0, len(df), batch_size)):

        ## Prepare batches OUTSIDE the try block: `ids_batch` must always be bound
        ## (and must belong to the current batch) when the handler below records a failure.
        ## `.iloc` keeps the slicing positional whatever the DataFrame index is.
        batch_end = min(batch_start + batch_size, len(df))
        titles_batch = df['title'].iloc[batch_start:batch_end].tolist()
        ids_batch = df['id'].iloc[batch_start:batch_end].tolist()     ## kept as integers (not converted to strings)

        try:
            ## Get Embeddings using HuggingFace model
            embeds = model_hugging.encode(titles_batch).tolist()

            ## Prepare to Qdrant
            to_upsert = Batch(ids=ids_batch, vectors=embeds)

            ## Upsert to Qdrant
            client.upsert(collection_name=collection_name, wait=True, points=to_upsert)

        except Exception as e:
            ## Best effort: report the error, remember the batch, continue with the next one
            print(f'Error in upserting: {e}')
            failed_ids.append(ids_batch)

    return failed_ids


## Upload the whole DataFrame and keep the ids of any batches that failed
failed_ids = upsert_to_qdrant(df, batch_size=32)

100%|██████████| 16/16 [00:16<00:00,  1.04s/it]


In [14]:
## Check Status of Collection
## Fetch the collection info once so both fields come from the same response
## (the original issued two separate get_collection requests)
collection_info = client.get_collection(collection_name='semantic-search-course')
collection_status = collection_info.status
collection_count_vectors = collection_info.vectors_count

print(f'Status is: {collection_status}')
print(f'Vectors Count is: {collection_count_vectors}')

Status is: green
Vectors Count is: 500


In [21]:
## Inference (query in real time); encode is also called with a list of texts elsewhere in this notebook
query_text = 'Neutral Technology'

## Embed the query text with the same model used for the titles
query_embedding = model_hugging.encode(query_text).tolist()

## Search in Qdrant: at most 10 hits, with score_threshold=0.2
results = client.search(
    collection_name='semantic-search-course',
    query_vector=query_embedding,
    limit=10,
    score_threshold=0.2,
)

[dict(id=hit.id, score=hit.score) for hit in results]

[{'id': 3552, 'score': 0.773675},
 {'id': 3368, 'score': 0.36533964},
 {'id': 3393, 'score': 0.34569108},
 {'id': 3107, 'score': 0.33020422},
 {'id': 3524, 'score': 0.27010283},
 {'id': 3246, 'score': 0.2518342},
 {'id': 3084, 'score': 0.24108207},
 {'id': 3150, 'score': 0.23775214},
 {'id': 3486, 'score': 0.23038086},
 {'id': 3311, 'score': 0.22535235}]

-----