<a href="https://colab.research.google.com/github/YahyaAlaaMassoud/learn-search-relevance/blob/main/quora_semantic_search_gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): with no category given this is a blanket filter, so it also
# hides deprecation warnings raised by the libraries imported further down.
warnings.filterwarnings('ignore')

Create a `requirments.txt` file listing the required dependencies, then install them.

In [2]:
%%writefile requirments.txt
datasets
gradio
qdrant-client
sentence-transformers

Writing requirments.txt


In [None]:
!echo "Requirments Are:" && cat requirments.txt && echo "-----"

!pip install -r requirments.txt

In [4]:
# Load the IPython extension provided by the gradio package.
%load_ext gradio

In [55]:
import gradio as gr
import numpy as np

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models


# Module-level singletons shared by build_index() and query().
# The vector store lives in memory only, so build_index() has to run again
# after every kernel restart.
qdrant = QdrantClient(':memory:') # create in-mem instance of vector db
# Sentence-embedding model; the collection's vector size is taken from it,
# and the same model must be used for indexing and for querying.
encoder = SentenceTransformer(model_name_or_path='all-MiniLM-L12-v2')


def compute_embedding(sentences, emb_model):
  """Embed ``sentences`` with ``emb_model``.

  Args:
    sentences: Forwarded unchanged to ``emb_model.encode`` as its
      ``sentences`` keyword argument.
    emb_model: Any object exposing ``encode(sentences=...)``.

  Returns:
    Whatever ``emb_model.encode`` returns.
  """
  embedding = emb_model.encode(sentences=sentences)
  return embedding


def get_questions(ds):
    """Collect the distinct question texts found in ``ds``.

    Args:
        ds: Mapping-like object whose ``'questions'`` entry is an iterable of
            items, each exposing an iterable of question strings under
            ``'text'``.

    Returns:
        A list of ``{'question': <text>}`` dicts, one per distinct text, in
        order of first appearance.
    """
    # dict keys keep insertion order, so this de-duplicates while producing the
    # same ordering on every run. A set of strings has no stable iteration
    # order across interpreter runs (hash randomization), which would make the
    # positional ids assigned by callers non-reproducible.
    unique_questions = dict.fromkeys(
        q_text
        for q in ds['questions']
        for q_text in q['text']
    )
    return [{'question': q_text} for q_text in unique_questions]


def build_index(split_range='[0:100]'):
  """(Re)build the ``'questions'`` collection from the Quora dataset.

  Loads the requested slice of the ``quora`` train split, extracts its
  distinct questions, recreates the collection with cosine distance and a
  vector size taken from ``encoder``, and uploads one record per question.

  Args:
    split_range: Slice suffix appended to ``'train'`` when loading the
      dataset (for example ``'[0:100]'``). Defaults to the first 100 rows.

  Returns:
    None. The module-level ``qdrant`` client is populated as a side effect.
  """
  split = 'train{}'.format(split_range)
  quora_ds = load_dataset(path='quora', split=split)
  quora_questions = get_questions(ds=quora_ds)

  qdrant.recreate_collection(
      collection_name='questions',
      vectors_config=models.VectorParams(
          size=encoder.get_sentence_embedding_dimension(),
          distance=models.Distance.COSINE
      )
  )

  if not quora_questions:
    # Nothing to embed or upload; leave the freshly created collection empty.
    return

  # Encode every question in a single call so the model can batch them,
  # instead of invoking it once per question.
  embeddings = compute_embedding(
      [entry['question'] for entry in quora_questions],
      encoder
  )

  qdrant.upload_records(
      collection_name='questions',
      records=[
          models.Record(
              id=idx,
              payload=entry,
              vector=vector.tolist()
          ) for idx, (entry, vector) in enumerate(zip(quora_questions, embeddings))
      ]
  )


def query(question, top_k=5):
  """Find the indexed questions most similar to ``question``.

  Args:
    question: Free-text question to search for.
    top_k: Maximum number of matches to return. Coerced to ``int`` because
      UI widgets such as sliders may deliver the value as a float.

  Returns:
    A dict mapping each matched question text to its score rounded to three
    decimals. Empty when ``question`` is empty or whitespace-only.
  """
  # Nothing meaningful to search for; skip the model and the vector store.
  if not question or not question.strip():
    return {}

  # search for top K questions in the quora questions DB similar to the new question
  results = qdrant.search(
    collection_name='questions',
    query_vector=compute_embedding(question, encoder),
    limit=int(top_k)
  )
  return {hit.payload['question']: round(hit.score, 3) for hit in results}


# UI definition: a question box and a "Top K" slider feed query(); its result
# is rendered in a Label. Components are declared in the order they should
# appear.
with gr.Blocks() as demo:
  gr.Markdown("Semantic search for similar questions from Quora")
  inp = gr.Textbox(label="Question", placeholder="Enter your question here")
  # Integer steps from 1 to 8; the value is passed to query() as top_k.
  slider = gr.Slider(value=3, minimum=1, maximum=8, label="Top K", interactive=True, step=1)
  btn = gr.Button("Find similar questions")
  # Starts empty; filled with the {question: score} dict returned by query().
  lbl = gr.Label(label="Most similar questions", value={})
  # Button click: (inp, slider) -> query(question, top_k) -> lbl.
  btn.click(fn=query, inputs=[inp, slider], outputs=[lbl])
  # One pre-filled example row: question text and Top K.
  gr.Examples(examples=[["Obama", 3]], inputs=[inp, slider])


# Populate the vector index before the UI starts answering queries.
build_index()
# share=True serves the app on a public URL in addition to the notebook
# (see the "Running on public URL" line in this cell's output).
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://b2be740db9dff26de7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


