In [None]:
!pip install -q sentence-transformers datasets pinecone-client faiss-cpu faiss-gpu

In [18]:
import datasets
squad = datasets.load_dataset('squad', split='validation')
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [19]:
squad[0]

{'id': '56be4db0acb8001400a502ec',
 'title': 'Super_Bowl_50',
 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'question': 'Which NFL team represented the AFC at Super Bowl 50?',
 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],


In [20]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')



In [21]:
squad = squad.map(lambda x: {'encoding': model.encode(x['context']).tolist()}, batched=True, batch_size=32)

In [22]:
import faiss
d = len(squad[0]['encoding'])
cntx_index = faiss.IndexFlatL2(d)
cntx_index.is_trained

True

In [23]:
import numpy as np

In [24]:
cntxs =  np.array(squad['encoding'])
cntxs.shape

(10570, 384)

In [25]:
%%time
cntx_index.add(cntxs)

CPU times: user 9.75 ms, sys: 10 ms, total: 19.8 ms
Wall time: 20.3 ms


In [28]:
cntx_index.ntotal

10570

In [80]:
query = "Which NFL team represented the AFC at Super Bowl 50?"
# query = "What is the largest planet in our solar system?"
# query = 'Who wrote the book "To Kill a Mockingbird?"'
# query = "Who is the CEO of Tesla, Inc.?"
query = 'Where is a palm house with subtropic plants from all over the world on display?'
qvec = model.encode([query]).tolist()
len(qvec)

1

In [82]:
%%time
dist, idx = cntx_index.search(np.array(qvec), k=10)
idx

CPU times: user 4.01 ms, sys: 0 ns, total: 4.01 ms
Wall time: 13.4 ms


array([[ 999, 1000, 1001, 1002, 1003, 2663, 2664, 2665, 2666, 1004]])

In [83]:
squad[idx[0]].keys()

dict_keys(['id', 'title', 'context', 'question', 'answers', 'encoding'])

In [84]:
for i in range(len(idx[0])):
  print(squad[idx[0]]['context'][i][:100])
  print(squad[idx[0]]['answers'][i]['text'])
  print(squad[idx[0]]['id'][i])
  print('------')

Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['green', 'green', 'green']
573368044776f41900660a29
------
Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['New Orangery', 'New Orangery', 'New Orangery']
573368044776f41900660a2a
------
Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['Pole Mokotowskie', 'Mokotów', 'Pole Mokotowskie']
573368044776f41900660a2b
------
Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['Park Ujazdowski', 'Park Ujazdowski', 'Park Ujazdowski']
573368044776f41900660a2c
------
Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['1927', '1927', '1927']
573368044776f41900660a2d
------
Many locals and tourists frequent the southern California coast for its popular beaches, and the des
['Palm Springs', 'Palm

In [85]:
from transformers import pipeline
model_id = 'deepset/electra-base-squad2'
reader_model = pipeline(tokenizer=model_id, model=model_id, task='question-answering')



In [87]:
for i in range(len(idx[0])):
  context = squad[idx[0]]['context'][i][:100]
  answer = squad[idx[0]]['answers'][i]['text']
  pred_answer = reader_model(question=query, context=context)

  print(context)
  print(answer)
  print('query:', query)
  print('pred answer: ', pred_answer)
  print('------')

Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['green', 'green', 'green']
query: Where is a palm house with subtropic plants from all over the world on display?
pred answer:  {'score': 2.2183598957781214e-08, 'start': 39, 'end': 57, 'answer': 'the Botanic Garden'}
------
Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['New Orangery', 'New Orangery', 'New Orangery']
query: Where is a palm house with subtropic plants from all over the world on display?
pred answer:  {'score': 2.2183598957781214e-08, 'start': 39, 'end': 57, 'answer': 'the Botanic Garden'}
------
Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['Pole Mokotowskie', 'Mokotów', 'Pole Mokotowskie']
query: Where is a palm house with subtropic plants from all over the world on display?
pred answer:  {'score': 2.2183598957781214e-08, 'start': 39, 'end': 57, 'answer': 'th

In [88]:
model_id = 'yjernite/bart_eli5'
gen_model = pipeline(model=model_id, tokenizer=model_id, task='text2text-generation')

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
for i in range(len(idx[0])):
  context = squad[idx[0]]['context'][i][:100]
  answer = squad[idx[0]]['answers'][i]['text']
  pred_answer = gen_model(
      f"question: {query} context:{context}",
       num_beams=4,
        do_sample=True,
        temperature=1.5,
        max_length=64
  )

  print(context)
  print(answer)
  print('query:', query)
  print('pred answer: ', pred_answer)
  print('------')

Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['green', 'green', 'green']
query: Where is a palm house with subtropic plants from all over the world on display?
pred answer:  [{'generated_text': ' Where is a palm house with a palm house with subtropic plants from all over the world on display?'}]
------
Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['New Orangery', 'New Orangery', 'New Orangery']
query: Where is a palm house with subtropic plants from all over the world on display?
pred answer:  [{'generated_text': " It is on display. There are plenty of palm houses with those plants on display all over the world on display. It's not that uncommon to see a palm house with them, but I've never seen one with plants from all over the world on display."}]
------
Other green spaces in the city include the Botanic Garden and the University Library garden. They ha
['Pole Mokotowskie',