# Semantic Search

### Import the Needed Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from _Utils import Utils
import _Utils

import os
import time
import torch
from tqdm.auto import tqdm

### Load the Dataset

In [3]:
dataset = load_dataset('quora', split='train[240000:290000]')
dataset[:5]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]},
  {'id': [33183, 351730],
   'text': ['Which is the best smartphone under 20K in India?',
    'Which is the best smartphone with in 20k in India?']},
  {'id': [351731, 351732],
   'text': ['Steps taken by Canadian government to improve literacy rate?',
    'Can I send homemade herbal hair oil from India to US via postal or private courier services?']},
  {'id': [37799, 94186],
   'text': ['What is a good way to lose 30 pounds in 2 months?',
    'What can I do to lose 30 pounds in 2 months?']},
  {'id': [351733, 351734],
   'text': ['Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?',
    'How do you graph x + 2y = -2?']}],
 'is_duplicate': [False, True, False, True, False]}

In [4]:
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
question = list(set(questions))
print('\n'.join(questions[:10]))
print('-' * 50)
print(f'Number of questions: {len(questions)}')

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


### Check cuda and Setup the model

*all-MiniLM-L6-v2* sentence-transformers model maps sentences to a 384 dimensional dense vector space.

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    print('LOL, you dont have cuda.')
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

LOL, you dont have cuda.


In [7]:
query = 'which city is the most populated in the world?'
xq = model.encode(query)
xq.shape

(384,)

### Setup Pinecone

In [8]:
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [10]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)
# print(INDEX_NAME)
pinecone.create_index(name=INDEX_NAME, 
    dimension=model.get_sentence_embedding_dimension(), 
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'))

index = pinecone.Index(INDEX_NAME)
print(index)

<pinecone.data.index.Index object at 0x7f4ef38689d0>


### Create Embeddings and Upsert to Pinecone

In [11]:
batch_size=200
vector_limit=10000

questions = question[:vector_limit]

import json

for i in tqdm(range(0, len(questions), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(questions))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{'text': text} for text in questions[i:i_end]]
    # create embeddings
    xc = model.encode(questions[i:i_end])
    # create records list for upsert
    records = zip(ids, xc, metadatas)
    # upsert to Pinecone
    index.upsert(vectors=records)

  0%|          | 0/50 [00:00<?, ?it/s]

In [12]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

### Run Your Query

In [14]:
def run_query(query):
  embedding = model.encode(query).tolist()
  results = index.query(top_k=10, vector=embedding, include_metadata=True, include_values=False)
  for result in results['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

In [15]:
run_query('which city has the highest population in the world?')

0.77: What country has the biggest population?
0.66: Which is the best city in the world to travel?
0.59: Where is the highest place on Earth?
0.56: What is the best tourist place in world?
0.56: What is best city in India?
0.55: Which country is the largest democracy in the world?
0.55: Which is best city in India?
0.53: Why is Uttar Pradesh the most populous state in India?
0.53: What is the least known country in the world?
0.52: What is the highest mountain in Europe?


In [16]:
query = 'how do i make chocolate cake?'
run_query(query)

0.79: How do you add chocolate chips to cake mix?
0.61: What is a cake mix?
0.55: How do you make baking soda?
0.55: How do I bake a cake in a microwave oven?
0.52: What should you do if your dog eats chocolate?
0.51: Where can I found adorable baked cupcakes in Gold Coast?
0.5: How do I make a red fondant?
0.48: Where can I get an unique taste for cupcakes in Gold Coast?
0.44: Can you make homemade whey from any kind of milk?
0.42: What can be cooked with sweet potato butter?
