In [1]:
import os
from dotenv import load_dotenv
import time
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_pinecone import PineconeVectorStore

### Initialize Pinecone client, create index, load + chunk docs, upsert docs

In [None]:
# Populate os.environ from a local .env file, if one exists. Variables that are
# already set in the environment are left untouched.
load_dotenv()

# Client configuration details
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    # Fail fast with an actionable message rather than passing None to the client
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to a .env file."
    )
pinecone_conn = Pinecone(api_key=pinecone_api_key)
spec = ServerlessSpec(cloud='aws', region='us-east-1')

# Manually get existing index names - TODO: Check if method has been added for this, was getting error trying to use .has_index()
index_names = [index["name"] for index in pinecone_conn.list_indexes()]

# Create new index.
# WARNING: destructive - an existing index with this name is deleted first, so
# every run of this cell rebuilds the index from scratch.
index_name = "digital-twin"
if index_name in index_names:
    pinecone_conn.delete_index(index_name)

pinecone_conn.create_index(
    index_name,
    dimension=1536,  # Dimensionality of text-embedding-ada-002 (OpenAI)
    metric='dotproduct',
    spec=spec
)

# Wait for index to be created, but give up after a bounded amount of time so a
# stuck index cannot hang the kernel indefinitely.
INDEX_READY_TIMEOUT_S = 300
wait_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pinecone_conn.describe_index(index_name).status["ready"]:
    if time.monotonic() > wait_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after "
            f"{INDEX_READY_TIMEOUT_S} seconds."
        )
    time.sleep(1)

# Directory holding the input .docx files. Override with the
# DIGITAL_TWIN_DOCS_DIR environment variable to run on another machine; the
# default is the original author's local path.
DOC_INPUTS_DIR = os.environ.get(
    "DIGITAL_TWIN_DOCS_DIR",
    "/Users/brianfrechette/Library/Mobile Documents/com~apple~CloudDocs/dev/digital_twin/doc_inputs",
)

# Load all input docs into memory (only files matching *.docx directly in the directory)
loader = DirectoryLoader(
    DOC_INPUTS_DIR,
    glob="*.docx",
    show_progress=True
)
docs = loader.load()

# Stop here if nothing was loaded: continuing would leave the index empty
# without any visible error.
if not docs:
    raise ValueError(f"No .docx documents found in '{DOC_INPUTS_DIR}'.")

# Chunk documents before upserting
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_chunked = text_splitter.split_documents(docs)

# Initialize pinecone vector store by adding chunked docs to the index.
# NOTE(review): this relies on the default OpenAIEmbeddings model producing
# vectors whose dimension matches the index's configured dimension - confirm
# if the embedding model is ever changed.
vector_store_from_docs = PineconeVectorStore.from_documents(
    docs_chunked,
    index_name=index_name,
    embedding=OpenAIEmbeddings()
)