In [1]:
# for loading data directly from hugging face
from datasets import load_dataset

In [2]:
ds = load_dataset('wiki_qa', split='train')

Found cached dataset parquet (C:/Users/arupnanda/.cache/huggingface/datasets/parquet/wiki_qa-8063f393970b5c49/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [3]:
#check a few rows
ds[:5]

{'question_id': ['Q1', 'Q1', 'Q1', 'Q1', 'Q1'],
 'question': ['how are glacier caves formed?',
  'how are glacier caves formed?',
  'how are glacier caves formed?',
  'how are glacier caves formed?',
  'how are glacier caves formed?'],
 'document_title': ['Glacier cave',
  'Glacier cave',
  'Glacier cave',
  'Glacier cave',
  'Glacier cave'],
 'answer': ['A partly submerged glacier cave on Perito Moreno Glacier .',
  'The ice facade is approximately 60 m high',
  'Ice formations in the Titlis glacier cave',
  'A glacier cave is a cave formed within the ice of a glacier .',
  'Glacier caves are often called ice caves , but this term is properly used to describe bedrock caves that contain year-round ice.'],
 'label': [0, 0, 0, 1, 0]}

In [4]:
# collect only the questions
questions = []
for i in ds ['question']:
    questions.append(i)

In [5]:
# remove duplicates
questions = list(set(questions))

In [6]:
#check a few rows
print('\n'.join(questions[:5]))

what is the ideal mean radiant temperature
Who holds records for most weeks spent at number one as female artist
what is the internal pressure of a bomb calorimeter
when was bow wow born
Who Discovered Nuclear Power


In [7]:
# how many questions did we get?
print(len(questions))

2118


In [8]:
import chromadb

In [9]:
client = chromadb.Client()

In [10]:
coll = client.create_collection(name='my_collection')

In [11]:
"""Prepare the embedding

We will need three things:
1. An ID
2. A document, which is the question we collected
3. A vector representation of the document

To create the the vector, we will use the sentence transformer model we leanred earlier.

from these we will create an embedding
embedding = [(id, document, vector)]

We will add these to the collection using the upsert method. To show progress, we will use the tqdm package.
"""

'Prepare the embedding\n\nWe will need three things:\n1. An ID\n2. A document, which is the question we collected\n3. A vector representation of the document\n\nTo create the the vector, we will use the sentence transformer model we leanred earlier.\n\nfrom these we will create an embedding\nembedding = [(id, document, vector)]\n\nWe will add these to the collection using the upsert method. To show progress, we will use the tqdm package.\n'

In [12]:
from tqdm.auto import tqdm

In [13]:
from sentence_transformers import SentenceTransformer

In [14]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
#model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [15]:
# Upsert
# batches of 128
# total questions are 2118
batch_size=128
total_size=2118
for ctr in tqdm(range(0,total_size,batch_size)):
    ctr_end = min(ctr+batch_size, total_size)
    IDs = [str(i) for i in range(ctr, ctr_end)]
    documents = [text for text in questions[ctr:ctr_end]]
    embeddings = model.encode(questions[ctr:ctr_end]).tolist()
    coll.upsert(documents=documents, ids=IDs, embeddings=embeddings)


  0%|          | 0/17 [00:00<?, ?it/s]

In [16]:
coll.count()

2118

In [17]:
# Let's for our question, which may not exist in its current form in the list of questions.
# Instead, we are trying to find out from the list which questions are semantically similar to
# this question we have in mind.
question = 'why did Americans fight their own'

In [18]:
# convert to a vector
ques_vector = model.encode(question).tolist()
# ques_vector

In [19]:
# Get similar vectors
similar_vectors = coll.query(ques_vector, n_results = 10)

In [20]:
# How does it look?
similar_vectors

{'ids': [['1323',
   '1721',
   '1119',
   '1435',
   '1397',
   '1377',
   '1300',
   '1221',
   '1954',
   '6']],
 'distances': [[1.0276615619659424,
   1.0597517490386963,
   1.1023807525634766,
   1.1028097867965698,
   1.1260040998458862,
   1.1382591724395752,
   1.1674909591674805,
   1.183807373046875,
   1.2293291091918945,
   1.2379268407821655]],
 'metadatas': [[None, None, None, None, None, None, None, None, None, None]],
 'embeddings': None,
 'documents': [['what made the civil war different from others',
   'when was america pioneered',
   'what date did the american civil war start',
   'how many native Americans did the United States kill or deport?',
   'what triggered the civil war',
   'when did the civil war start and where',
   'Who controlled Alaska before US?',
   'what two empires fought to control afghanistan',
   'what is colonial americans day in usa',
   'how did bleeding sumner lead to the civil war']],
 'uris': None,
 'data': None}

In [21]:
#pretty output
print(f'{"Distance":>8} {"ID":>4} {"Question"}')
for ids in similar_vectors['ids'][0]:
    i = similar_vectors['ids'][0].index(ids)
    print(f"{round(similar_vectors['distances'][0][i],6):1.6f} {ids:>4} {similar_vectors['documents'][0][i]}")

Distance   ID Question
1.027662 1323 what made the civil war different from others
1.059752 1721 when was america pioneered
1.102381 1119 what date did the american civil war start
1.102810 1435 how many native Americans did the United States kill or deport?
1.126004 1397 what triggered the civil war
1.138259 1377 when did the civil war start and where
1.167491 1300 Who controlled Alaska before US?
1.183807 1221 what two empires fought to control afghanistan
1.229329 1954 what is colonial americans day in usa
1.237927    6 how did bleeding sumner lead to the civil war


In [None]:
model