<a href="https://colab.research.google.com/github/YahyaAlaaMassoud/learn-search-relevance/blob/main/quora_semantic_search_gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import warnings
warnings.filterwarnings('ignore')

Create a `requirments.txt` file that lists the required dependencies, then install them.

In [2]:
%%writefile requirments.txt
datasets
gradio
qdrant-client
sentence-transformers

Writing requirments.txt


In [None]:
# Show the dependency list that is about to be installed.
!echo "Requirments Are:" && cat requirments.txt && echo "-----"

# Use the %pip magic rather than a `!pip` shell call so the packages are
# installed into the environment of the running kernel.
%pip install -r requirments.txt

In [4]:
# Load the `gradio` package as an IPython extension (it must already be
# installed, which the requirements cell above takes care of).
%load_ext gradio

In [59]:
import gradio as gr
import numpy as np

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models


# Upper bound on how many dataset rows get_questions() reads before stopping.
MAX_QUESTIONS = 50

# Sentence-embedding model shared by indexing (build_index) and search (query).
encoder = SentenceTransformer('all-MiniLM-L12-v2')

# In-memory instance of the vector DB.
qdrant = QdrantClient(location=':memory:')


def compute_embedding(sentences, emb_model):
  """Encode ``sentences`` with ``emb_model`` and return the encoder's output.

  Args:
    sentences: Text to embed; forwarded unchanged as the ``sentences``
      keyword argument of ``emb_model.encode``.
    emb_model: Any object exposing an ``encode(sentences=...)`` method.

  Returns:
    Whatever ``emb_model.encode`` returns for the given input.
  """
  embedding = emb_model.encode(sentences=sentences)
  return embedding


def get_questions(ds, max_items=None):
  """Collect the distinct question texts found in the first rows of ``ds``.

  Args:
    ds: Iterable of rows; each row is read as ``row['questions']['text']``,
      which must be an iterable of question strings.
    max_items: Maximum number of rows to read. ``None`` (the default) falls
      back to the module-level ``MAX_QUESTIONS``.

  Returns:
    A list of ``{'question': text}`` dicts, one per distinct question, in
    first-seen order.
  """
  limit = MAX_QUESTIONS if max_items is None else max_items

  # A dict preserves insertion order, so duplicates are dropped while the
  # result order stays the same on every run. Iterating a set of strings does
  # not guarantee that, which would make any ids derived from the position in
  # this list differ between runs.
  seen = {}
  for row_idx, row in enumerate(ds):
    if row_idx == limit:
      break
    for q_text in row['questions']['text']:
      seen.setdefault(q_text, None)
  return [{'question': q_text} for q_text in seen]


def build_index():
  """(Re)build the ``questions`` collection in ``qdrant`` from the Quora dataset.

  Streams the ``train`` split of the ``quora`` dataset, extracts the unique
  questions with ``get_questions``, recreates the ``questions`` collection
  (cosine distance, vector size taken from ``encoder``) and uploads one record
  per question. Each record's id is the question's position in the extracted
  list, its payload is the ``{'question': text}`` dict, and its vector is the
  question's embedding.

  NOTE(review): ``recreate_collection`` / ``upload_records`` may be flagged as
  deprecated by newer qdrant-client releases -- confirm against the installed
  version.
  """
  quora_ds = load_dataset(path='quora', split='train', streaming=True)
  quora_questions = get_questions(ds=quora_ds)

  qdrant.recreate_collection(
      collection_name='questions',
      vectors_config=models.VectorParams(
          size=encoder.get_sentence_embedding_dimension(),
          distance=models.Distance.COSINE
      )
  )

  # Embed every question in one encode() call so the model can batch the work,
  # instead of invoking it once per question inside the comprehension.
  question_texts = [entry['question'] for entry in quora_questions]
  embeddings = compute_embedding(question_texts, encoder) if question_texts else []

  qdrant.upload_records(
      collection_name='questions',
      records=[
          models.Record(
              id=idx,
              payload=entry,
              vector=embedding.tolist()
          ) for idx, (entry, embedding) in enumerate(zip(quora_questions, embeddings))
      ]
  )


def query(question, top_k=5):
  """Return the indexed questions most similar to ``question``.

  Args:
    question: Free-text query. ``None``, empty or whitespace-only input
      returns an empty result without calling the encoder or the vector DB.
    top_k: Maximum number of matches to return. Coerced to ``int`` because it
      is used as the search ``limit`` and a UI widget may deliver it as a float.

  Returns:
    Dict mapping each matched question text to its similarity score, rounded
    to 3 decimals.
  """
  # Nothing meaningful to search for: avoid embedding an empty string and
  # returning arbitrary neighbours.
  if question is None or not question.strip():
    return {}

  # Search the 'questions' collection for the top-k entries closest to the
  # embedding of the new question.
  results = qdrant.search(
    collection_name='questions',
    query_vector=compute_embedding(question, encoder),
    limit=int(top_k)
  )
  return {hit.payload['question']: round(hit.score, 3) for hit in results}


# Assemble the Gradio UI. Components are declared inside the Blocks context,
# in the order written here.
with gr.Blocks() as demo:
  # Populate the vector DB first so that query() has a collection to search.
  build_index()
  gr.Markdown("Semantic search for similar questions from Quora")
  # Inputs: the free-text question and the number of matches to return (1-8).
  inp = gr.Textbox(label="Question", placeholder="Enter your question here")
  slider = gr.Slider(value=3, minimum=1, maximum=8, label="Top K", interactive=True, step=1)
  btn = gr.Button("Find similar questions")
  # Output: receives the {question: score} dict returned by query().
  lbl = gr.Label(label="Most similar questions", value={})
  # Clicking the button calls query(inp, slider) and shows the result in lbl.
  btn.click(fn=query, inputs=[inp, slider], outputs=[lbl])
  # One ready-made example that fills both inputs.
  gr.Examples(examples=[["Obama", 3]], inputs=[inp, slider])


# share=True asks Gradio for a public share link in addition to the local
# server (see the gradio.live tunnel URLs in the cell output).
demo.launch(share=True)

[{'question': 'Does society place too much importance on sports?'}, {'question': 'What Game of Thrones villain would be the most likely to give you mercy?'}, {'question': 'Is being a good kid and not being a rebel worth it in the long run?'}, {'question': "What's one thing you would like to do better?"}, {'question': 'When can I expect Cognizant confirmation mail?'}, {'question': 'What does manipulation mean?'}, {'question': 'What are some of the things technicians can tell about the durability and reliability of Laptops and its components?'}, {'question': 'Is being bored good for a kid?'}, {'question': 'What are the questions should not ask on Quora?'}, {'question': 'What is best way to ask for money online?'}, {'question': 'Why did aircraft stop using variable-sweep wings, like those on an F-14?'}, {'question': 'Should I buy tiago?'}, {'question': 'How do I prepare for civil service?'}, {'question': 'What keeps childern active and far from phone and video games?'}, {'question': 'How 

{'Why do some people think Obama will try to take their guns away?': 0.434, 'How will a Trump presidency affect the students presently in US or planning to study in US?': 0.268, 'What would a Trump presidency mean for current international master’s students on an F1 visa?': 0.243}
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7889 <> https://b2be740db9dff26de7.gradio.live
Killing tunnel 127.0.0.1:7890 <> https://bdec58295c455f70a5.gradio.live
Killing tunnel 127.0.0.1:7891 <> https://5926a17fc203002ba2.gradio.live


