In [58]:
import datasets

# Load the validation split of SQuAD; the bare name on the last line makes
# the notebook display the dataset summary (features and row count).
qa = datasets.load_dataset(path='squad', split='validation')
qa

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [70]:
# Inspect a single record (row 1000) to see what the fields look like.
qa[1000]

{'id': '573368044776f41900660a2a',
 'title': 'Warsaw',
 'context': 'Other green spaces in the city include the Botanic Garden and the University Library garden. They have extensive botanical collection of rare domestic and foreign plants, while a palm house in the New Orangery displays plants of subtropics from all over the world. Besides, within the city borders, there are also: Pole Mokotowskie (a big park in the northern Mokotów, where was the first horse racetrack and then the airport), Park Ujazdowski (close to the Sejm and John Lennon street), Park of Culture and Rest in Powsin, by the southern city border, Park Skaryszewski by the right Vistula bank, in Praga. The oldest park in Praga, the Praga Park, was established in 1865–1871 and designed by Jan Dobrowolski. In 1927 a zoological garden (Ogród Zoologiczny) was established on the park grounds, and in 1952 a bear run, still open today.',
 'question': 'Where is a palm house with subtropic plants from all over the world on displa

In [5]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used for both the stored contexts and the queries.
# Displaying it prints the module stack (transformer, pooling, normalize).
model = SentenceTransformer(model_name_or_path='multi-qa-MiniLM-L6-cos-v1')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [7]:
def _embed_contexts(batch):
    """Encode a batch of contexts and return them as a new 'encodeing' column.

    `batch` is the dict of columns that `Dataset.map(..., batched=True)` passes
    in; only its 'context' entry is read. The column name keeps its original
    spelling because later cells look it up under exactly this key.
    """
    vectors = model.encode(batch['context'])
    return {'encodeing': vectors.tolist()}


# Add the embedding column, encoding 32 rows per call.
qa = qa.map(_embed_contexts, batched=True, batch_size=32)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [8]:
# Display the dataset summary again after the map above.
qa

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'encodeing'],
    num_rows: 10570
})

In [9]:
!pip install pinecone-client

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pinecone-client
  Downloading pinecone_client-3.2.2-py3-none-any.whl.metadata (16 kB)
Downloading pinecone_client-3.2.2-py3-none-any.whl (215 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.9/215.9 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pinecone-client
Successfully installed pinecone-client-3.2.2


In [20]:
import os

from pinecone import Pinecone, ServerlessSpec

INDEX_NAME = 'qa-index'

# Never commit credentials: the key is read from the environment instead of
# being inlined. A missing variable raises KeyError right here, which is
# easier to diagnose than an authentication failure later on.
# NOTE: the key that used to be hard-coded in this cell should be treated as
# compromised and rotated in the Pinecone console.
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

# list_indexes() iterates over index descriptions, not bare name strings, so
# a plain `'qa-index' in pc.list_indexes()` never matches and the index would
# be re-created (and fail) on every re-run. Compare against .names() instead.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        # Vector size is taken from the first stored embedding.
        dimension=len(qa[0]['encodeing']),
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(INDEX_NAME)

In [28]:
from tqdm import tqdm

# Build one (id, vector) pair per dataset row.
upserts = []
for row in qa:
    upserts.append((row['id'], row['encodeing']))

# Send the pairs to the index in chunks of 50, with a progress bar.
for start in tqdm(range(0, len(upserts), 50)):
    stop = min(start + 50, len(upserts))
    index.upsert(upserts[start:stop])

100%|██████████| 212/212 [00:32<00:00,  6.51it/s]


In [29]:
# Number of (id, vector) pairs prepared for upsert.
len(upserts)

10570

In [71]:
# Question to look up, encoded with the same model as the stored contexts.
query = 'Where is a palm house with subtropic plants from all over the world on display?'
query_embedding = model.encode([query])
xq = query_embedding.tolist()

In [75]:
# Ask the index for the top 10 matches for the query embedding.
# NOTE(review): xq was built from a one-element list, so it is presumably
# nested (one embedding inside an outer list) — confirm the client is meant
# to accept that shape.
xc = index.query(vector=xq, top_k=10)
xc

{'matches': [{'id': '573368044776f41900660a2a',
              'score': 0.486505717,
              'values': []},
             {'id': '573368044776f41900660a29',
              'score': 0.486505717,
              'values': []},
             {'id': '573368044776f41900660a2d',
              'score': 0.486505717,
              'values': []},
             {'id': '573368044776f41900660a2b',
              'score': 0.486505717,
              'values': []},
             {'id': '573368044776f41900660a2c',
              'score': 0.486505717,
              'values': []},
             {'id': '5705eccb52bb8914006896b8',
              'score': 0.362052441,
              'values': []},
             {'id': '5705eccb52bb8914006896bb',
              'score': 0.362052441,
              'values': []},
             {'id': '5705eccb52bb8914006896b9',
              'score': 0.362052441,
              'values': []},
             {'id': '5705eccb52bb8914006896ba',
              'score': 0.362052441,
            

In [76]:
# Ids of the matches returned by the index query.
ids = [match['id'] for match in xc['matches']]


def _is_retrieved(row):
    """Return True when the row's id is one of the matched ids."""
    return row['id'] in ids


# Keep only the dataset rows that were retrieved, then show their contexts.
contexts = qa.filter(_is_retrieved)
contexts['context']

Filter:   0%|          | 0/10570 [00:00<?, ? examples/s]

['Other green spaces in the city include the Botanic Garden and the University Library garden. They have extensive botanical collection of rare domestic and foreign plants, while a palm house in the New Orangery displays plants of subtropics from all over the world. Besides, within the city borders, there are also: Pole Mokotowskie (a big park in the northern Mokotów, where was the first horse racetrack and then the airport), Park Ujazdowski (close to the Sejm and John Lennon street), Park of Culture and Rest in Powsin, by the southern city border, Park Skaryszewski by the right Vistula bank, in Praga. The oldest park in Praga, the Praga Park, was established in 1865–1871 and designed by Jan Dobrowolski. In 1927 a zoological garden (Ogród Zoologiczny) was established on the park grounds, and in 1952 a bear run, still open today.',
 'Other green spaces in the city include the Botanic Garden and the University Library garden. They have extensive botanical collection of rare domestic and 

In [77]:
from transformers import pipeline

# Question-answering pipeline; the same checkpoint supplies model and tokenizer.
model_name = 'deepset/electra-base-squad2'
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [81]:
# Run the question-answering pipeline once per retrieved context and collect
# the results in retrieval-list order.
preds = [nlp(question=query, context=passage) for passage in contexts['context']]
preds

[{'score': 0.999940037727356,
  'start': 197,
  'end': 209,
  'answer': 'New Orangery'},
 {'score': 0.999940037727356,
  'start': 197,
  'end': 209,
  'answer': 'New Orangery'},
 {'score': 0.999940037727356,
  'start': 197,
  'end': 209,
  'answer': 'New Orangery'},
 {'score': 0.999940037727356,
  'start': 197,
  'end': 209,
  'answer': 'New Orangery'},
 {'score': 0.999940037727356,
  'start': 197,
  'end': 209,
  'answer': 'New Orangery'},
 {'score': 2.6208901715563115e-09,
  'start': 442,
  'end': 471,
  'answer': 'Bielany Forest nature reserve'},
 {'score': 6.712130925734527e-06,
  'start': 112,
  'end': 124,
  'answer': 'Palm Springs'},
 {'score': 6.712130925734527e-06,
  'start': 112,
  'end': 124,
  'answer': 'Palm Springs'},
 {'score': 6.712130925734527e-06,
  'start': 112,
  'end': 124,
  'answer': 'Palm Springs'},
 {'score': 6.712130925734527e-06,
  'start': 112,
  'end': 124,
  'answer': 'Palm Springs'}]

In [84]:
# Text-to-text generation pipeline; the same checkpoint supplies model and tokenizer.
model_name = 'yjernite/bart_eli5'
seq2seq = pipeline(task='text2text-generation', model=model_name, tokenizer=model_name)

In [83]:
from transformers import set_seed

# Generation below samples (do_sample=True, temperature=1.5), so it is random.
# Fix the seed so that re-running this cell reproduces the same answers.
set_seed(42)

for context in contexts['context']:
    # The question and one retrieved passage are combined into a single prompt.
    prompt = f"question: {query} context: {context}"
    answer = seq2seq(
        prompt,
        num_beams=4,
        do_sample=True,
        temperature=1.5,
        max_length=64,
    )
    print(answer)

[{'generated_text': ' It is in the New York Botanical Garden.'}]
[{'generated_text': " In the New York Botanical Garden. It's a botanic garden, not a botanical house."}]
[{'generated_text': ' Where do you think they would store them? A palm house?'}]
[{'generated_text': " It's in New York City, but I'm not sure where exactly. It's a palm house, so it's probably on display somewhere in the city, but we don't know exactly where it is."}]
[{'generated_text': " It's in the New York Botanical Garden. There are a lot of botanic houses in New York.  URL_0"}]
[{'generated_text': " There is a palm house with subtropic plants on display in the Netherlands. It's called the Botanic Garden of the Netherlands. There is also a botanic garden in Amsterdam with subtropic plants on display. It's called the Botanic Garden of the Netherlands."}]
[{'generated_text': ' There is a palm house in my town. They don\'t call it a "pal house" for nothing. It\'s called a "pal house". It\'s a sort of "pal house".'}]

In [85]:
# Second question, encoded with the same sentence encoder as before.
query = "Do NFL teams only care about playing at the Superbowl?"
query_embedding = model.encode([query])
xq = query_embedding.tolist()

In [88]:
# Query the index for the top 5 matches and collect their ids.
xc = index.query(top_k=5, vector=xq)
ids = [match['id'] for match in xc['matches']]


def _is_retrieved(row):
    """Return True when the row's id is one of the matched ids."""
    return row['id'] in ids


# Keep only the dataset rows that were retrieved and display the result.
contexts = qa.filter(_is_retrieved)
contexts

Filter:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5
})

In [89]:
from transformers import set_seed

# Generation below samples (do_sample=True, temperature=1.5), so it is random.
# Fix the seed so that re-running this cell reproduces the same answers.
set_seed(42)

for context in contexts['context']:
    # The question and one retrieved passage are combined into a single prompt.
    prompt = f"question: {query} context: {context}"
    answer = seq2seq(
        prompt,
        num_beams=4,
        do_sample=True,
        temperature=1.5,
        max_length=64,
    )
    print(answer)

[{'generated_text': " Yes. There's a ton of money to be made by having the Superbowl. It's a big deal for the NFL, and the NFL teams want to get as much money out of it as possible."}]
[{'generated_text': " The Superbowl is the biggest game of the year for the NFL and it is a big event for them. If it isn't played, they lose money. So if they are playing in the Superbowl, they are getting paid a lot of money."}]
[{'generated_text': " Yes, the Superbowl is the biggest game in the world. It is the biggest event of the year. A lot of teams don't want to play in a loss. A loss would be a loss to the Superbowl. They want to be able to say that they won the Superbowl."}]
[{'generated_text': ' The Super Bowl is the most lucrative game of the year. It brings in tens of millions of dollars. They would rather play in New York, Philadelphia, or London. If you are a team in the NFL, the Super Bowl is your biggest game of the year. If you are not, you might not'}]
[{'generated_text': ' In addition 

In [90]:
from transformers import set_seed

query = "Do NFL teams only care about playing at the Super Bowl?"

# Generation below samples (do_sample=True, temperature=1.5), so it is random.
# Fix the seed so that re-running this cell reproduces the same answer.
set_seed(42)

# Baseline: same prompt layout, but with the literal word "unknown" in place
# of a retrieved passage.
seq2seq(
    f"question: {query} context: unknown",
    num_beams=4,
    do_sample=True,
    temperature=1.5,
    max_length=64,
)

[{'generated_text': " It's a huge oversimplification, but it's the basic idea of your question. The Super Bowl is a huge deal, but the NFL teams only have one shot at winning, so they have a finite amount of time to score points. The NFL has a lot of superstars, so they have to"}]

In [92]:
# Plain text-generation pipeline; the same checkpoint supplies model and tokenizer.
gen_checkpoint = 'EleutherAI/gpt-neo-125M'
gen = pipeline('text-generation', model=gen_checkpoint, tokenizer=gen_checkpoint)

# Continue the raw question text, capped at a total length of 32.
gen(query, max_length=32)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Do NFL teams only care about playing at the Super Bowl?\n\nThe NFL is a great place to play, but it’s not the only place'}]