In [21]:
import warnings
warnings.filterwarnings('ignore')

In [22]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils
import DLAIUtils

import os
import time
import torch

In [23]:
from tqdm.auto import tqdm

In [24]:
def get_data():
    from datasets import load_dataset
    return load_dataset('quora', split='train[240000:290000]')

dataset = get_data()
dataset[:5]

questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
question = list(set(questions))
print('\n'.join(questions[:10]))
print('-' * 50)
print(f'Number of questions: {len(questions)}')

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


In [25]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    print('Sorry no cuda.')
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

Sorry no cuda.


In [None]:
query = 'B'
xq = model.encode(query)
xq.shape

(384,)

In [12]:
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [14]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)
print(INDEX_NAME)
pinecone.create_index(name=INDEX_NAME, 
    dimension=model.get_sentence_embedding_dimension(), 
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1'))

index = pinecone.Index(INDEX_NAME)
print(index)

dl-ai-su0kz0l44zv2o7hgjxll8dgfoyrml-dwzeua
<pinecone.data.index.Index object at 0x313813fa0>


In [15]:
batch_size=200
vector_limit=10000

questions = question[:vector_limit]

import json

for i in tqdm(range(0, len(questions), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(questions))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{'text': text} for text in questions[i:i_end]]
    # create embeddings
    xc = model.encode(questions[i:i_end])
    # create records list for upsert
    records = zip(ids, xc, metadatas)
    # upsert to Pinecone
    index.upsert(vectors=records)

100%|██████████| 50/50 [00:51<00:00,  1.04s/it]


In [16]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

In [17]:
# small helper function so we can repeat queries later
def run_query(query):
  embedding = model.encode(query).tolist()
  results = index.query(top_k=10, vector=embedding, include_metadata=True, include_values=False)
  for result in results['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

In [18]:
run_query('which city has the highest population in the world?')

0.7: Where is the most beautiful city in the world?
0.56: What percentage of the world's population lives in developing countries?
0.52: What are the most unsafe cities in America?
0.7: Which is the most beautiful city in world?
0.56: Which is the best city in India?
0.65: Which is the largest country in the world?
0.67: Which is the most urbanised city in India?
0.52: What is the total number of countries in the world?
0.5: What are the best cities and countries for solo travel?
0.49: Which city in China do you prefer to live in? Why?


In [19]:
query = 'how do i make chocolate cake?'
run_query(query)

0.54: How do you make a crispy batter?
0.56: How do we bake cake in microwave oven?
0.58: What's a good recipe for cake featuring Ciroc?
0.52: How do you make pancakes without the batter?
0.51: How long does cake last in the fridge?
0.48: Where can I found different flavours for cupcakes at Gold Coast?
0.46: Where can I find affordable cake shops on the Gold Coast?
0.46: How can I make homemade pancakes without baking soda?
0.46: Where can I get very nice and original flavor cupcakes in Gold Coast?
0.52: How do I make my chocolate last longer (preservation)?


In [20]:
query = 'What are the best in-person social skills?'
run_query(query)

0.58: How do you improve speaking skills in public?
0.58: What are some of the skills that I can learn online really well?
0.56: How can I learn communication skills?
0.56: What are some skills every 24 year old should know?
0.54: What are the top social and behavioral norms?
0.53: How do I improve my conversation skills with any one?
0.51: How do I become a better communicator?
0.5: How can one make friends in class?
0.49: What is the best social network for beginners?
0.48: What skills do self-taught programmers commonly lack?
