# Pinecone

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [2]:
# pip install -q pinecone-client

Note: you may need to restart the kernel to use updated packages.


In [3]:
# pip install --upgrade -q pinecone-client 

Note: you may need to restart the kernel to use updated packages.


In [4]:
# pip show pinecone-client

Name: pinecone-client
Version: 3.1.0
Summary: Pinecone client and SDK
Home-page: https://www.pinecone.io
Author: Pinecone Systems, Inc.
Author-email: support@pinecone.io
License: Apache-2.0
Location: /Users/bertan/development/MachineLearningProjects/egeai/CustomChatGPTApp/.venv/lib/python3.10/site-packages
Requires: certifi, tqdm, typing-extensions, urllib3
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [5]:
from pinecone import Pinecone
pc = Pinecone()
pc.list_indexes()

{'indexes': []}

## Working with Pinecone Indexes

In [7]:
pc.list_indexes()

{'indexes': []}

In [9]:
# pc.describe_index('langchain')
pc.list_indexes().names()

[]

## Create an index on Pinecone

In [52]:
#let's create pinecone index
from pinecone import PodSpec
index_name = 'langchain'

if index_name not in pc.list_indexes().names():
    print(f'Creating index {index_name}')
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
        spec=PodSpec(
            environment='gcp-starter'
        )
    )
    print('Index created!')
else:
    print(f'Index {index_name} already exist!')


Creating index langchain
Index created!


## Delete an index on Pinecone

In [49]:
index_name = 'langchain'
if index_name in pc.list_indexes().names():
    print(f'Deleting index {index_name}')
    pc.delete_index(index_name)
    print('Done')
else:
    print(f'Index {index_name} does not exist!')

Deleting index langchain
Done


### To perform any operation with an index, you must first select it.

In [12]:
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Working with Vectors

In [42]:
import random
vectors = [[random.random() for _ in range(1536)] for v in range(5)]
vectors

[[0.6751122924977868,
  0.23816146040019326,
  0.6040015337394106,
  0.4570756501572997,
  0.1635046861989632,
  0.4606727978041868,
  0.5335733155391605,
  0.893783757486464,
  0.6252897946304146,
  0.282133504639157,
  0.629415994562616,
  0.14986161625312733,
  0.36904595848729793,
  0.3932170774470911,
  0.004980934021791894,
  0.009534095299524092,
  0.6640242313037946,
  0.4080380259147889,
  0.3886635826799828,
  0.5423952752020754,
  0.530090131361825,
  0.27480505171419845,
  0.8686889499478125,
  0.4871330384478276,
  0.3384403977727728,
  0.02393856591049548,
  0.02454835564804647,
  0.8243225473086445,
  0.6074354204281954,
  0.8083790677077198,
  0.6306465751366298,
  0.4712807678063963,
  0.2141509648368578,
  0.744499922613443,
  0.4592562460328874,
  0.878131119948894,
  0.5031976690001373,
  0.262548636098738,
  0.2518779531411772,
  0.3571291570002455,
  0.4902576837310738,
  0.5538249512115719,
  0.4291978037874239,
  0.3914170265135445,
  0.1497078705965449,
  0.012

## Inserting vectors

In [43]:
# to insert a vector, we need the vector itself and its id, which is a string. Since there are 5 vectors, we're creating a list with five elements which represent the IDs.
ids = list('abcd')

# select the index
index_name = 'langchain'
index = pc.Index(index_name)

# to insert new vectors into the index, use upsert method.
# upsert is a single operation that can be used to insert a new value or update an existing value if it already exists.
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 4}

## Updating the vectors

In [24]:
index.upsert(vectors=[('c', [0.5] * 1536)])

{'upserted_count': 1}

## Fetching the vector by ID

In [37]:
# index = pc.index(index_name) # we have already selected
index.fetch(ids=['d'])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

## Deleting vectors by ID

In [38]:
index.delete(ids=['b', 'd'])

{}

In [39]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 2e-05,
 'namespaces': {'': {'vector_count': 2}},
 'total_vector_count': 2}

In [44]:
index.fetch(ids=['b'])

{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'b': {'id': 'b',
                   'values': [0.850462139,
                              0.542898,
                              0.824869037,
                              0.295529604,
                              0.903319716,
                              0.813650608,
                              0.473705977,
                              0.0442596078,
                              0.518641651,
                              0.538626373,
                              0.584990561,
                              0.944291592,
                              0.995798469,
                              0.0616229214,
                              0.334837765,
                              0.555218875,
                              0.273614645,
                              0.0807249,
                              0.158707231,
                              0.296154231,
                              0.795665443,
                       

## Query 

In [45]:
query_vector = [random.random() for _ in range(1536)]

The query operation will retreive the IDs of the most similar vectors in the index, along with their similarity scores.

In [46]:

index.query(
    vector=query_vector,
    top_k=3,
    include_values=False
)

{'matches': [{'id': 'b', 'score': 0.760144651, 'values': []},
             {'id': 'c', 'score': 0.745957136, 'values': []},
             {'id': 'd', 'score': 0.745626569, 'values': []}],
 'namespace': '',
 'usage': {'read_units': 5}}

## Namespaces
Pinecone allows you to partition the vectors in an index into namespaces.

Queries and other operations are scoped to a specific namespace, allowing diferent requests to search different subsets of your index.

Imagine you are dealing with news articles. You might want to create a namespace for indexing articles by content and another for indexing articles by title.

`Key information` about namespaces:
* Every index consists of one or more namespaces.
* Each vector exists in exactly one namespace.
* Namespaces are uniquely identified by a namespace name.
* The default namespace is represented by the empty string and is used if no specific namespace is specified.

In [56]:
# index.describe_index_stats()
index = pc.Index('langchain')

import random
vectors = [[random.random() for _ in range(1536)] for v in range(5)]
ids = list('abcde')
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

In [58]:
vectors = [[random.random() for _ in range(1536)] for v in range(2)]
ids = list('qp')
index.upsert(vectors=zip(ids, vectors), namespace='second-namespace')

{'upserted_count': 2}

In [59]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0001,
 'namespaces': {'': {'vector_count': 5},
                'first-namespace': {'vector_count': 3},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 10}

In [60]:
index.fetch(ids=['x'], namespace='first-namespace')

{'namespace': 'first-namespace',
 'usage': {'read_units': 1},
 'vectors': {'x': {'id': 'x',
                   'values': [0.335009336,
                              0.316244721,
                              0.0789737552,
                              0.616738677,
                              0.665182,
                              0.842058718,
                              0.618972242,
                              0.238105476,
                              0.586499393,
                              0.281436443,
                              0.404755831,
                              0.341582566,
                              0.259200186,
                              0.521100044,
                              0.0955416635,
                              0.602175,
                              0.98731,
                              0.971734107,
                              0.867901683,
                              0.299591929,
                              0.65957278,
              

In [61]:
# delete a vector
index.delete(ids=['x'], namespace='first-namespace')

{}

In [62]:
# delete all records from namespace. Deleting all records from a namespace also deletes the namespace itself
index.delete(delete_all=True, namespace='first-namespace')

{}

In [63]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 7e-05,
 'namespaces': {'': {'vector_count': 5},
                'second-namespace': {'vector_count': 2}},
 'total_vector_count': 7}