In [7]:
# Import the Pinecone library
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv

load_dotenv()


True

In [9]:
api_key = os.getenv("PINECONE_API_KEY")
# Initialize the Pinecone client
pc = Pinecone(api_key=api_key)

In [12]:
# Create your Pinecone index
pc.create_index(
    name="my-first-index",
    dimension=256,
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

In [18]:
# Connecting to the index 
index = pc.Index('my-first-index')

index.describe_index_stats()

NotFoundException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': 'c39211316d48507fa763ba362f23f5d7', 'Date': 'Fri, 08 Nov 2024 12:08:03 GMT', 'Server': 'Google Frontend', 'Content-Length': '89', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"NOT_FOUND","message":"Resource my-first-index not found"},"status":404}


### Namespaces are used for partitioning indexes which can be used for: 
- Separate datasets 
- Data Versioning 
- Separate Groups 

In [11]:
# Deleting an index 
pc.delete_index('my-first-index')

In [17]:
# List indexes 
pc.list_indexes()

[
    {
        "name": "datacamp-index",
        "dimension": 1536,
        "metric": "cosine",
        "host": "datacamp-index-qeu0iz9.svc.aped-4627-b74a.pinecone.io",
        "spec": {
            "serverless": {
                "cloud": "aws",
                "region": "us-east-1"
            }
        },
        "status": {
            "ready": true,
            "state": "Ready"
        },
        "deletion_protection": "disabled"
    }
]

In [None]:
vectors = [
    {
        "id": "0",
        "values": [0.025525547564029694, 0.0188823901116848],
        "metadata": {"genre": "productivity", "year": 2020}
    },
]

# Checking dimensionality 
vector_dims = [len(vector['values']) == 1536 for vector in vectors]
all(vector_dims)

In [None]:
# Upsert is either update or insert the records if it's not available 
index.upsert(vectors=vectors)

In [None]:
index = pc.Index('datacamp-index')
# Fetching Vectors 
ids=['0','1']
fetched_vectors = index.fetch(
    ids=['0','1'],
    
)
metadatas = [fetched_vectors['vectors'][id]['metadata'] for id in ids]
print(metadatas)


[{'genre': 'action', 'year': 2000.0}, {'genre': 'horror', 'year': 2007.0}]


In [None]:
# Read Units = Measure of resources consumed during read operations 
# For querying, measuring Read Units is harder to calculate 
index.query(
    vector=[0]*1536,
    top_k=3,
    include_values=True
)

{'matches': [{'id': '2',
              'score': 0.0,
              'values': [-0.374185205,
                         -0.457807481,
                         0.195336565,
                         0.732191265,
                         0.893467486,
                         -0.788188398,
                         -0.690342784,
                         0.889472544,
                         0.473070443,
                         0.765987635,
                         -0.594734728,
                         0.175171748,
                         0.402279198,
                         0.36022377,
                         -0.183696613,
                         -0.969210207,
                         0.165852115,
                         -0.493796915,
                         -0.0994915068,
                         0.915162,
                         -0.201929361,
                         0.679603219,
                         -0.622918785,
                         0.344921,
                         0.954

In [29]:
pc.create_index(
    name="dotproduct-index",
    dimension=1536,
    metric='dotproduct',
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)

In [30]:
pc.delete_index('dotproduct-index')

In [None]:
# Meta data Filtering reduces search space
index.query(
    vector=[],
    filter={
        "genre":{"$eq":"documentary"}, 
        "year":2019
    },
    top_k=1,
    include_metadata=True
)

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 08 Nov 2024 15:29:13 GMT', 'Content-Type': 'application/json', 'Content-Length': '76', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '33', 'x-pinecone-request-id': '1247938675901108751', 'x-envoy-upstream-service-time': '34', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Either 'vector' or 'ID' must be provided","details":[]}


In [None]:
index.update(
    id = '1', 
    values=[]
)
index.update(
    id = '1', 
    set_metadata={}
)

In [35]:
index.delete(ids=['3','4'])


{}

In [None]:
import itertools
def chunks(iterable, batch_size=100):
    """A helper function to break an iterable into chunks of size batch_size."""
    # Convert the iterable into an iterator
    it = iter(iterable)
    # Slice the iterator into chunks of size batch_size
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        # Yield the chunk
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

In [None]:
index = pc.Index('datacamp-index')

# Upsert vectors in batches of 100
for chunk in chunks(vectors):
    index.upsert(vectors=chunk) 

# Retrieve statistics of the connected Pinecone index
print(index.describe_index_stats())

In [None]:
pc = Pinecone(api_key=api_key, pool_threads=20)
index = pc.Index('datacamp-index')

# Upsert vectors in batches of 200 vectors
with pc.Index('datacamp-index', pool_threads=20) as index:
    async_results = [index.upsert(vectors=chunk, async_req=True) for chunk in chunks(vectors, batch_size=100)]
    [async_result.get() for async_result in async_results]

# Retrieve statistics of the connected Pinecone index
print(index.describe_index_stats())

In [None]:
pc = Pinecone(api_key="ce4e11d4-cce4-4693-84bc-f1db02187db5")
index = pc.Index('datacamp-index')

# Upsert vector_set1 to namespace1
index.upsert(
  vectors=vector_set1,
  namespace='namespace1'
)

# Upsert vector_set2 to namespace2
index.upsert(
  vectors=vector_set2,
  namespace='namespace2'
)

# Print the index statistics
print(index.describe_index_stats())

In [None]:
# Semantic Search with Pinecone 
 # Initialize the Pinecone client
index = pc.Index('pinecone-datacamp')

batch_limit = 100

for batch in np.array_split(df, len(df) / batch_limit):
    # Extract the metadata from each row
    metadatas = [{
      "text_id": row['id'],
      "text": row['text'],
      "title": row['title']} for _, row in batch.iterrows()]
    texts = batch['text'].tolist()
    
    ids = [str(uuid4()) for _ in range(len(texts))]
    
    # Encode texts using OpenAI
    response = client.embeddings.create(input=texts, model="text-embedding-3-small")
    embeds = [np.array(x.embedding) for x in response.data]
    
    # Upsert vectors to the correct namespace
    index.upsert(vectors=zip(ids, embeds, metadatas), namespace='squad_dataset')