In [43]:
endpoint = "https://dpka96bj5r3f505a.us-east-1.aws.endpoints.huggingface.cloud"
api_org = "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"

import requests

# add the api org token to the headers
headers = {
    'Authorization': f'Bearer {api_org}'
}
# we add sentences to embed like so
json_data = {"inputs": ["a happy dog", "a sad dog"]}
# make the request
res = requests.post(
    endpoint,
    headers=headers,
    json=json_data
)

res

<Response [200]>

In [44]:
len(res.json()['embeddings'])

2

In [45]:
dim = len(res.json()['embeddings'][0])
dim

384

In [46]:
from datasets import load_dataset

snli = load_dataset("snli", split='train')
snli

Found cached dataset snli (/Users/jwilliams/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 550152
})

In [47]:
passages = list(set(snli['hypothesis']))
len(passages)

480042

In [48]:
passages = passages[:50_000]

In [49]:
import pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(api_key="a3895c8f-9f97-4424-9b50-ac62e03fed4c", environment="us-west1-gcp")


In [50]:
index_name = 'hf-endpoints'

# check if the hf-endpoints index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=dim,
        metric="cosine"
    )

# connect to hf-endpoints index we created
index = pinecone.Index(index_name)

In [53]:
from tqdm.auto import tqdm

# we will use batches of 64
batch_size = 64

for i in tqdm(range(35856, len(passages), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(passages))
    # extract batch
    batch = passages[i:i_end]
    # generate embeddings for batch via endpoints
    res = requests.post(
        endpoint,
        headers=headers,
        json={"inputs": batch}
    )
    emb = res.json()['embeddings']
    # get metadata (just the original text)
    meta = [{'text': text} for text in batch]
    # create IDs
    ids = [str(x) for x in range(i, i_end)]
    # add all to upsert list
    to_upsert = list(zip(ids, emb, meta))
    # upsert/insert these records to pinecone
    _ = index.upsert(vectors=to_upsert)

# check that we have all vectors in index
index.describe_index_stats()

  0%|          | 0/221 [00:00<?, ?it/s]

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 41088}},
 'total_vector_count': 41088}

In [54]:
query = snli['premise'][0]
print(f"Query: {query}")
# encode with HF endpoints
res = requests.post(endpoint, headers=headers, json={"inputs": query})
xq = res.json()['embeddings']
# query and return top 5
xc = index.query(xq, top_k=5, include_metadata=True)
# iterate through results and print text
print("Answers:")
for match in xc['matches']:
    print(match['metadata']['text'])

Query: A person on a horse jumps over a broken down airplane.
Answers:
A person rides a horse over a plane.
person riding horse while jumping.
The person jumped out of a plane.
A horse jumping over a barrier
A person jumps a horse over a gate in a competition.


In [55]:
query = snli['premise'][100]
print(f"Query: {query}")
# encode with HF endpoints
res = requests.post(endpoint, headers=headers, json={"inputs": query})
xq = res.json()['embeddings']
# query and return top 5
xc = index.query(xq, top_k=5, include_metadata=True)
# iterate through results and print text
print("Answers:")
for match in xc['matches']:
    print(f"{match['score']} {match['metadata']['text']}")

Query: A woman is walking across the street eating a banana, while a man is following with his briefcase.
Answers:
0.883513391 A woman eats a banana and walks across a street, and there is a man trailing behind her.
0.5561198 A woman with short arms reaches for a banana.
0.553925574 A woman takes a walk with some fruit.
0.553366244 A woman is walking towards a man.
0.527939856 a woman and man walking in a street


In [56]:
query = snli['premise'][200]
print(f"Query: {query}")
# encode with HF endpoints
res = requests.post(endpoint, headers=headers, json={"inputs": query})
xq = res.json()['embeddings']
# query and return top 5
xc = index.query(xq, top_k=5, include_metadata=True)
# iterate through results and print text
print("Answers:")
for match in xc['matches']:
    print(f"{match['score']} {match['metadata']['text']}")

Query: People on bicycles waiting at an intersection.
Answers:
0.854751 Bicyclists wait at an intersection.
0.804648817 People on bicycles speed through an intersection.
0.695790946 People standing next to their bikes.
0.694061458 People on bikes.
0.687675714 people outside near a bike rack getting ready to go for a ride


In [57]:
print(snli[0])

{'premise': 'A person on a horse jumps over a broken down airplane.', 'hypothesis': 'A person is training his horse for a competition.', 'label': 1}
