In [32]:
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WikipediaLoader
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import Pinecone

In [33]:
from pinecone_connector import PineconeConnector

# Load variables from a local .env file *before* building the embeddings client.
# The embedding request below needs OPENAI_API_KEY, and on a fresh kernel nothing
# has populated the environment yet (load_dotenv() was otherwise only called in a
# later cell). Calling load_dotenv() again later is harmless.
load_dotenv()

embeddings = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)

# Smoke test: embed a handful of short strings in a single call.
_embeddings = embeddings.embed_documents(
    [
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!",
    ]
)
# (number of input texts, length of each embedding vector)
len(_embeddings), len(_embeddings[0])

100%|██████████| 1/1 [00:00<00:00,  1.94it/s]


(5, 1536)

In [34]:
# Pull credentials/settings from a local .env file into the process environment.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")

# Project helper used for index management and Pinecone-specific functionality.
pinecone = PineconeConnector(embeddings)

# Debugging aid: `vars(pinecone)` dumps the connector's attributes. The connector
# exposes a PINECONE_API_KEY attribute, so avoid leaving that output in a shared
# notebook.

In [50]:
# Provision a serverless index. Index names cannot contain capital letters;
# `server_type` selects between a serverless and a pod-based index.
pinecone.create_index("testindex", server_type="serverless")


True

In [51]:
# Provision a traditional pod-based index, this time using the dot-product
# metric and an explicit environment.
pinecone.create_index(
    "myfirstpod",
    server_type="pod",
    metric="dotproduct",
    environment="us-east1-gcp",
)

True

In [44]:

# With both Pinecone indexes created, wrap each one in a LangChain vector store.
# The API key is taken from the connector instance.
vectorstore1 = Pinecone(
    pinecone_api_key=pinecone.PINECONE_API_KEY,
    embedding=embeddings,
    index_name="testindex",
)
vectorstore2 = Pinecone(
    pinecone_api_key=pinecone.PINECONE_API_KEY,
    embedding=embeddings,
    index_name="myfirstpod",
)

In [53]:
# List every index in the project. Judging by the recorded output, each entry
# reports the index's dimension, host, metric, name, spec (pod or serverless)
# and readiness status.
pinecone.list_index()

[{'dimension': 1536,
  'host': 'myfirstpod-fc416ad.svc.us-east1-gcp.pinecone.io',
  'metric': 'dotproduct',
  'name': 'myfirstpod',
  'spec': {'pod': {'environment': 'us-east1-gcp',
                   'pod_type': 'p1.x1',
                   'pods': 1,
                   'replicas': 1,
                   'shards': 1}},
  'status': {'ready': True, 'state': 'Ready'}},
 {'dimension': 1536,
  'host': 'testindex-fc416ad.svc.apw5-4e34-81fa.pinecone.io',
  'metric': 'cosine',
  'name': 'testindex',
  'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
  'status': {'ready': True, 'state': 'Ready'}}]

In [54]:
# Describe a single index, looked up by name. The recorded output has the same
# fields as one entry of the index listing (dimension, host, metric, name,
# spec, status).
pinecone.describe_index("testindex")

{'dimension': 1536,
 'host': 'testindex-fc416ad.svc.apw5-4e34-81fa.pinecone.io',
 'metric': 'cosine',
 'name': 'testindex',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [57]:
# Fetch usage statistics for each index: dimension, index fullness,
# per-namespace vector counts and the total vector count.
output1 = pinecone.describe_index_stats("testindex")
output2 = pinecone.describe_index_stats("myfirstpod")

print(output1, output2, sep="\n")

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'sao': {'vector_count': 54}},
 'total_vector_count': 54}
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'hxh': {'vector_count': 54}},
 'total_vector_count': 54}


In [None]:
# Delete the "testindex" index.
# NOTE(review): the second positional argument (10) is unexplained here; it is
# presumably a timeout/wait value. Confirm against PineconeConnector.delete_index
# and pass it by keyword if possible.
# Caution: cells further down still add documents to, and query, this index
# through `vectorstore1`, so on a top-to-bottom run this deletion happens first.
pinecone.delete_index("testindex", 10)


In [49]:
# Delete the pod-based "myfirstpod" index.
# Caution: cells further down still add documents to, and query, this index
# through `vectorstore2`, so on a top-to-bottom run this deletion happens first.
pinecone.delete_index("myfirstpod")

In [58]:
# Show the connector's `__repr__`.
# NOTE(review): the recorded output contains literal `{self.embeddings!r}`-style
# placeholders and the name "Pinceconnector", which suggests the `__repr__` in
# pinecone_connector is missing its f-string prefix and has a typo; confirm in
# that module. If it is fixed, the repr would include the API keys, so review
# this cell's output before sharing the notebook.
repr(pinecone)

'Pinceconnector(embeddings={self.embeddings!r}, OPENAI_API_KEY={self.OPENAI_API_KEY!r}, PINECONE_API_KEY={self.PINECONE_API_KEY!r}, PINECONE_ENV={self.PINECONE_ENV!r})'

In [43]:
# Pull Wikipedia articles for two topics; each query loads at most two pages.
document = WikipediaLoader(query="HUNTER X HUNTER", load_max_docs=2).load()
document2 = WikipediaLoader(query="Sword Art Online", load_max_docs=2).load()

# Chunks are kept deliberately tiny (size 100, overlap 20, measured with `len`)
# purely for demonstration purposes.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=20,
    chunk_size=100,
)

# Only the first article returned by each query is split into Document objects;
# that article's metadata is passed along with its text.
documents = text_splitter.create_documents(
    metadatas=[document[0].metadata],
    texts=[document[0].page_content],
)
documents2 = text_splitter.create_documents(
    metadatas=[document2[0].metadata],
    texts=[document2[0].page_content],
)


In [52]:
# Upload the chunked Document objects. `add_documents` is the convenience wrapper
# around the more granular `add_texts`, which offers more options.
# A namespace logically partitions an index, so each upload is tagged with the
# namespace that matches its content:
#   - `documents2` holds the "Sword Art Online" chunks  -> namespace "sao"
#   - `documents`  holds the "HUNTER X HUNTER" chunks   -> namespace "hxh"
# (Previously the two document lists were crossed, so each namespace held the
# other topic's text.)
id1 = vectorstore1.add_documents(documents2, namespace="sao")
id2 = vectorstore2.add_documents(documents, namespace="hxh")

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
100%|██████████| 1/1 [00:00<00:00,  1.39it/s]


In [None]:
# Basic similarity search.
# The vectors in this index were uploaded under the "sao" namespace, and the
# vector store was created without a default namespace, so the namespace must
# be passed explicitly. Without it the query targets the default namespace,
# which holds no vectors.
docsearch_basic = vectorstore1.similarity_search("anime", namespace="sao")
docsearch_basic

In [None]:
# Search with score: returns the documents most similar to the query together
# with their similarity scores.
# The namespace is passed explicitly because the vectors were uploaded under
# "sao" and the store has no default namespace; otherwise the query would hit
# the empty default namespace.
docsearch_score = vectorstore1.similarity_search_with_score("anime", namespace="sao")
docsearch_score

In [None]:
# Search returning (document, relevance score) pairs.
# The vectors in the second index were uploaded under the "hxh" namespace and
# the store has no default namespace, so it is passed explicitly; otherwise the
# query would hit the empty default namespace.
search_relevance_score = vectorstore2.similarity_search_with_relevance_scores(
    "What is HunterXHunter?",
    namespace="hxh",
)
search_relevance_score

In [None]:
# Build a VectorStoreRetriever from the vector store, using maximal marginal
# relevance (MMR) with `lambda_mult` set to 0.75.
# The "sao" namespace is included in `search_kwargs` because that is where this
# index's vectors were uploaded; without it the retriever queries the empty
# default namespace.
retriever = vectorstore1.as_retriever(
    search_type="mmr",
    search_kwargs={"lambda_mult": 0.75, "namespace": "sao"},
)
# Alternative: plain similarity search limited to a single hit:
# retriever = vectorstore1.as_retriever(search_type="similarity", search_kwargs={"k": 1, "namespace": "sao"})
output = retriever.invoke("What is hunter x hunter?")
output