In [None]:
import os

from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WikipediaLoader
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import Pinecone
from schema import *

In [None]:
from pinecone_connector import PineconeConnector

# Load variables from a local .env file *before* building the embeddings
# client. The client is constructed in this cell, ahead of the cell that
# otherwise performs the first load_dotenv() call, so on a fresh kernel whose
# OPENAI_API_KEY lives only in .env the key would not be in the environment yet.
# Calling load_dotenv() again in a later cell is harmless.
load_dotenv()

embeddings = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)

# Smoke test: embed a handful of short strings.
_embeddings = embeddings.embed_documents(
    [
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!",
    ]
)
# Displays (number of embedded texts, length of the first embedding vector).
len(_embeddings), len(_embeddings[0])

In [None]:
# Pull settings from the local .env file into the process environment.
load_dotenv()

# Each value is None when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_ENV = os.environ.get("PINECONE_ENV")

# Connector used for index management and Pinecone-specific functionality.
pinecone = PineconeConnector(embeddings)

# Uncomment to inspect the connector's attributes:
# vars(pinecone)

In [None]:
# Create a serverless index named "testindex".
# Index names cannot contain capital letters; server_type selects between
# "serverless" and "pod" (the next cell shows the pod variant).
pinecone.create_index(
    "testindex",
    server_type="serverless",
)

In [None]:
# Create a traditional pod-based index named "myfirstpod", requesting the
# "dotproduct" metric and the "us-east1-gcp" environment.
pinecone.create_index(
    "myfirstpod", metric="dotproduct", server_type="pod", environment="us-east1-gcp"
)

In [None]:
# Once the Pinecone index has been created, create the LangChain vector store
# instance bound to it. The API key is read from the connector object.
vectorstore1 = Pinecone(
    index_name="testindex",
    embedding=embeddings,
    pinecone_api_key=pinecone.PINECONE_API_KEY,
)
# Second vector store for the pod index, currently disabled. Uncomment it
# before running any cell that refers to `vectorstore2`.
# vectorstore2 = Pinecone(
#     index_name="myfirstpod",
#     embedding=embeddings,
#     pinecone_api_key=pinecone.PINECONE_API_KEY,
# )

In [None]:
from pprint import pprint

# List all indexes in the project
indexes = pinecone.list_index()

# Print the type of the returned object, then display the object itself as
# the cell output.
pprint(type(indexes))
indexes

In [None]:
# Describe a single index by name; the result is shown as the cell output.
pinecone.describe_index("testindex")

In [None]:
# Fetch statistics for both demo indexes (data such as total vector count,
# fullness and namespaces).
output1 = pinecone.describe_index_stats("testindex")
output2 = pinecone.describe_index_stats("myfirstpod")

# Print each result on its own line.
for index_stats in (output1, output2):
    print(index_stats)

In [None]:
# Delete the "testindex" index.
# NOTE(review): the meaning of the second positional argument (10) is not
# visible here — presumably a timeout; confirm against
# PineconeConnector.delete_index.
# NOTE(review): `vectorstore1` is bound to "testindex" and is used by later
# cells, so on a top-to-bottom run this deletion likely breaks them —
# consider running this cell last.
pinecone.delete_index("testindex", 10)

In [None]:
# Delete the pod-based "myfirstpod" index.
pinecone.delete_index("myfirstpod")

In [None]:
# Show the connector's `__repr__` output as the cell result.
repr(pinecone)

In [None]:
# Load articles from Wikipedia and split the first hit of each query into
# small Document chunks.
document = WikipediaLoader(query="HUNTER X HUNTER", load_max_docs=2).load()
document2 = WikipediaLoader(query="Sword Art Online", load_max_docs=2).load()

# Deliberately tiny chunk size, just for demonstration.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100, chunk_overlap=20, length_function=len, is_separator_regex=False
)


def _chunk_first_page(pages):
    """Split the first loaded page into chunks, attaching that page's metadata."""
    first_page = pages[0]
    return text_splitter.create_documents(
        texts=[first_page.page_content], metadatas=[first_page.metadata]
    )


documents = _chunk_first_page(document)
documents2 = _chunk_first_page(document2)

In [None]:
# Adds a list of Document objects. (Note: this wraps the add_texts method, which is more granular and has more options.) Using a namespace logically divides the index, but you CANNOT search across vector stores.

# id1 = vectorstore1.add_documents(documents, namespace="sao")
# id2 = vectorstore2.add_documents(documents2, namespace="hxh")

In [None]:
# Add the list of documents to the vector store. This is an example of using
# metadata: any extra metadata needs to be added to documents[0].metadata.
# NOTE(review): presumably the extra metadata must be set *before* this call
# to be stored alongside the vectors — confirm. A later cell in this notebook
# edits documents[0].metadata only after this upload has run.

# id1 = vectorstore1.add_documents(documents)
id2 = vectorstore1.add_documents(documents)

In [None]:
# Basic similarity search: query the vector store with "anime" and display
# the returned result as the cell output.
docsearch_basic = vectorstore1.similarity_search(
    "anime",
)
docsearch_basic

In [None]:
# Search with score.
# Returns the Pinecone documents most similar to the query, along with their
# scores; the result is displayed as the cell output.
docsearch_score = vectorstore1.similarity_search_with_score("anime")
docsearch_score

In [None]:
# Search with relevance scores for the Hunter x Hunter question.
# This queries `vectorstore1`: it is the only vector store instantiated in
# this notebook (`vectorstore2` is commented out, so referencing it raised a
# NameError), and it is the store the Hunter x Hunter chunks were added to.
search_relevance_score = vectorstore1.similarity_search_with_relevance_scores(
    "What is HunterXHunter?"
)
search_relevance_score

In [None]:
# Build a VectorStoreRetriever from this vector store, using the "mmr"
# search type.
# NOTE(review): lambda_mult presumably tunes the relevance/diversity
# trade-off of MMR — confirm against the LangChain retriever documentation.
retriever = vectorstore1.as_retriever(
    search_type="mmr", search_kwargs={"lambda_mult": 0.75}
)
# Alternative: plain similarity search limited to a single result.
# retriever = vectorstore1.as_retriever(search_type = "similarity",search_kwargs={"k":1})
output = retriever.invoke("What is hunter x hunter?")
output

In [None]:
# Replace the first chunk's metadata with a trimmed dict: keep only its
# title and source, and attach two extra fields used for metadata filtering.
# NOTE(review): `documents` was already passed to add_documents in an earlier
# cell; this edit changes only the local object — presumably the chunk must
# be re-uploaded for the index to see the new fields. Confirm.
first_chunk = documents[0]
title, source = (first_chunk.metadata[key] for key in ("title", "source"))

first_chunk.metadata = dict(
    title=title,
    source=source,
    genre="anime",
    season="Spring 2010",
)

In [None]:
from pprint import pprint as pp

# Pretty-print the first chunk's metadata to check its current fields.
pp(documents[0].metadata)

In [None]:
# Metadata filtering: restrict retrieval to chunks whose metadata matches.
# "Spring 2010" is the value stored under the "season" key of the enriched
# metadata ("genre" holds "anime"), so the filter must target "season";
# filtering on genre == "Spring 2010" could never match.
retriever = vectorstore1.as_retriever(
    search_kwargs={"filter": {"season": "Spring 2010"}}
)
output = retriever.invoke("Who is Kirito?")
output

In [None]:
from schema import IndexList, IndexStatus, Pod, Severless, IndexesResponse

# Assemble a sample pod-backed index description piece by piece.
_status = IndexStatus(ready=True, state="Ready")
_pod_spec = Pod(
    environment="us-west1-gcp",
    replicas=2,
    shards=2,
    pod_type="p1.x1",
    pods=4,
)
_index = IndexList(
    name="semantic-search",
    dimension=384,
    metric="cosine",
    host="semantic-search-c01b5b5.svc.us-west1-gcp.pinecone.io",
    status=_status,
    spec=_pod_spec,
)

# Wrap the single entry in the list-response model and display its JSON form.
index = IndexesResponse(indexes=[_index])
index.model_dump_json()