In [2]:
%pip install -q python-env

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install -q "pinecone[grpc]"


Note: you may need to restart the kernel to use updated packages.


In [None]:
# Import the Pinecone library
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import time

# Initialize a Pinecone client with your API key
pc = Pinecone(api_key="")

# Define a sample dataset where each item has a unique ID, text, and category
data = [
    {
        "id": "rec1",
        "text": "Apples are a great source of dietary fiber, which supports digestion and helps maintain a healthy gut.",
        "category": "digestive system" 
    },
    {
        "id": "rec2",
        "text": "Apples originated in Central Asia and have been cultivated for thousands of years, with over 7,500 varieties available today.",
        "category": "cultivation"
    },
    {
        "id": "rec3",
        "text": "Rich in vitamin C and other antioxidants, apples contribute to immune health and may reduce the risk of chronic diseases.",
        "category": "immune system"
    },
    {
        "id": "rec4",
        "text": "The high fiber content in apples can also help regulate blood sugar levels, making them a favorable snack for people with diabetes.",
        "category": "endocrine system"
    }
]

# Convert the text into numerical vectors that Pinecone can index
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[d["text"] for d in data],
    parameters={
        "input_type": "passage", 
        "truncate": "END"
    }
)

print(embeddings)

EmbeddingsList(
  model='multilingual-e5-large',
  vector_type='dense',
  data=[
    {'vector_type': dense, 'values': [0.04205322265625, -0.00951385498046875, ..., -0.050506591796875, -0.01019287109375]},
    {'vector_type': dense, 'values': [0.033203125, -0.018524169921875, ..., -0.00965118408203125, -0.0240631103515625]},
    {'vector_type': dense, 'values': [0.033599853515625, -0.00600433349609375, ..., -0.005611419677734375, -0.0251922607421875]},
    {'vector_type': dense, 'values': [0.00972747802734375, -0.01181793212890625, ..., -0.0252227783203125, -0.006389617919921875]}
  ],
  usage={'total_tokens': 118}
)


In [5]:
# Create a serverless index
index_name = "example-index"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws", 
            region="us-east-1"
        ) 
    ) 

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)


In [7]:
# Target the index
# In production, target an index by its unique DNS host, not by its name
# See https://docs.pinecone.io/guides/data/target-an-index
index = pc.Index(index_name)

# Prepare the records for upsert
# Each contains an 'id', the vector 'values', 
# and the original text and category as 'metadata'
records = []
for d, e in zip(data, embeddings):
    records.append({
        "id": d["id"],
        "values": e["values"],
        "metadata": {
            "source_text": d["text"],
            "category": d["category"]
        }
    })

# Upsert the records into the index
index.upsert(
    vectors=records,
    namespace="example-namespace"
)


upserted_count: 4

In [8]:
# Define your query
query = "Health risks"

# Convert the query into a numerical vector that Pinecone can search with
query_embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

# Search the index for the three most similar vectors
results = index.query(
    namespace="example-namespace",
    vector=query_embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)


{'matches': [{'id': 'rec3',
              'metadata': {'category': 'immune system',
                           'source_text': 'Rich in vitamin C and other '
                                          'antioxidants, apples contribute to '
                                          'immune health and may reduce the '
                                          'risk of chronic diseases.'},
              'score': 0.8224697,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'rec1',
              'metadata': {'category': 'digestive system',
                           'source_text': 'Apples are a great source of '
                                          'dietary fiber, which supports '
                                          'digestion and helps maintain a '
                                          'healthy gut.'},
              'score': 0.7912986,
              'sparse_values': {'indices': [], 'values': []},
              'values

In [9]:
query = "apples"

# Convert the query into a numerical vector that Pinecone can search with
query_embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

# Search the index for the three most similar vectors
results = index.query(
    namespace="example-namespace",
    vector=query_embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [{'id': 'rec1',
              'metadata': {'category': 'digestive system',
                           'source_text': 'Apples are a great source of '
                                          'dietary fiber, which supports '
                                          'digestion and helps maintain a '
                                          'healthy gut.'},
              'score': 0.8735175,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'rec3',
              'metadata': {'category': 'immune system',
                           'source_text': 'Rich in vitamin C and other '
                                          'antioxidants, apples contribute to '
                                          'immune health and may reduce the '
                                          'risk of chronic diseases.'},
              'score': 0.86947113,
              'sparse_values': {'indices': [], 'values': []},
              'value

In [12]:
query = "cultivation"

# Convert the query into a numerical vector that Pinecone can search with
query_embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

# Search the index for the three most similar vectors
results = index.query(
    namespace="example-namespace",
    vector=query_embedding[0].values,
    top_k=1,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [{'id': 'rec2',
              'metadata': {'category': 'cultivation',
                           'source_text': 'Apples originated in Central Asia '
                                          'and have been cultivated for '
                                          'thousands of years, with over 7,500 '
                                          'varieties available today.'},
              'score': 0.8079267,
              'sparse_values': {'indices': [], 'values': []},
              'values': []}],
 'namespace': 'example-namespace',
 'usage': {'read_units': 6}}


In [10]:
query = "cars"

# Convert the query into a numerical vector that Pinecone can search with
query_embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)

# Search the index for the three most similar vectors
results = index.query(
    namespace="example-namespace",
    vector=query_embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [{'id': 'rec3',
              'metadata': {'category': 'immune system',
                           'source_text': 'Rich in vitamin C and other '
                                          'antioxidants, apples contribute to '
                                          'immune health and may reduce the '
                                          'risk of chronic diseases.'},
              'score': 0.7768235,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'rec1',
              'metadata': {'category': 'digestive system',
                           'source_text': 'Apples are a great source of '
                                          'dietary fiber, which supports '
                                          'digestion and helps maintain a '
                                          'healthy gut.'},
              'score': 0.7758372,
              'sparse_values': {'indices': [], 'values': []},
              'values

In [14]:
ranked_results = pc.inference.rerank(
    model ="bge-reranker-v2-m3",
    query = "Health risks",
    documents=[
        {"id": "rec3", "source_text": "Rich in vitamin C and other antioxidants, apples contribute to immune health and may reduce the risk of chronic diseases."},
        {"id": "rec1", "source_text": "Apples are a great source of dietary fiber, which supports digestion and helps maintain a healthy gut."},
        {"id": "rec4", "source_text": "The high fiber content in apples can also help regulate blood sugar levels, making them a favorable snack for people with diabetes."}
    ],
    top_n=3,
    rank_fields=["source_text"],
    return_documents=True,
    parameters={
        "truncate": "END"
    }
)

print(ranked_results)

RerankResult(
  model='bge-reranker-v2-m3',
  data=[{
    index=0,
    score=0.07992552,
    document={
        id='rec3',
        source_text='Rich in vitamin C and other antioxidants, apples contribute to immune health and may reduce the risk of chronic diseases.'
    }
  },{
    index=2,
    score=0.003976228,
    document={
        id='rec4',
        source_text='The high fiber content in apples can also help regulate blood sugar levels, making them a favorable snack for people with diabetes.'
    }
  },{
    index=1,
    score=0.0009888597,
    document={
        id='rec1',
        source_text='Apples are a great source of dietary fiber, which supports digestion and helps maintain a healthy gut.'
    }
  }],
  usage={'rerank_units': 1}
)


In [15]:
# Search the index with a metadata filter
filtered_results = index.query(
    namespace="example-namespace",
    vector=query_embedding.data[0].values,
    filter={
        "category": {"$eq": "digestive system"}
    },
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(filtered_results)


{'matches': [{'id': 'rec1',
              'metadata': {'category': 'digestive system',
                           'source_text': 'Apples are a great source of '
                                          'dietary fiber, which supports '
                                          'digestion and helps maintain a '
                                          'healthy gut.'},
              'score': 0.7739248,
              'sparse_values': {'indices': [], 'values': []},
              'values': []}],
 'namespace': 'example-namespace',
 'usage': {'read_units': 6}}
