In [4]:
import os
from dotenv import load_dotenv
import time
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from uuid import uuid4

### Initialize Pinecone client, create index, load + chunk docs, upsert docs

In [7]:
# Client configuration details
# Load variables from a local .env file (if one exists) so the API key is available
# on a fresh kernel; variables already set in the environment are left untouched.
load_dotenv()

# Fail early with a readable message instead of an opaque client error.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to a .env file before running this cell."
    )

pinecone_conn = Pinecone(api_key=pinecone_api_key)
spec = ServerlessSpec(cloud='aws', region='us-east-1')

# Manually get existing index names - TODO: Check if method has been added for this, was getting error trying to use .has_index()
index_names = [index["name"] for index in pinecone_conn.list_indexes()]

# Create new index
# NOTE: an existing index with the same name is dropped first, so every run of this
# cell starts from an empty index.
index_name = "digital-twin"
if index_name in index_names:
    pinecone_conn.delete_index(index_name)

pinecone_conn.create_index(
    index_name,
    dimension=1536,  # Dimensionality of text-embedding-ada-002 (OpenAI)
    metric='dotproduct',
    spec=spec
)

# Wait for index to be created, but give up after a bounded time so the cell
# cannot hang forever if the index never becomes ready.
index_ready_timeout_s = 300
index_ready_deadline = time.monotonic() + index_ready_timeout_s
while not pinecone_conn.describe_index(index_name).status["ready"]:
    if time.monotonic() >= index_ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {index_ready_timeout_s} seconds."
        )
    time.sleep(1)

# Load all input docs into memory
# The input directory can be overridden with the DIGITAL_TWIN_DOCS_DIR environment
# variable so the notebook is not tied to one machine; when the variable is unset
# the original local path is used.
doc_inputs_dir = os.environ.get(
    "DIGITAL_TWIN_DOCS_DIR",
    "/Users/brianfrechette/Library/Mobile Documents/com~apple~CloudDocs/dev/digital_twin/doc_inputs",
)
loader = DirectoryLoader(
    doc_inputs_dir,
    glob="*.docx",  # only Word documents directly matching this pattern are loaded
    show_progress=True
)
docs = loader.load()

# Chunk documents before upserting
# chunk_overlap=0 means neighbouring chunks share no text.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs_chunked = text_splitter.split_documents(docs)

# Initialize pinecone vector store by adding chunked docs to the index
# The embedding model is named explicitly because the index is created with
# dimension=1536 (text-embedding-ada-002); relying on the library default would let
# the two drift apart silently if that default ever changes.
vector_store = PineconeVectorStore.from_documents(
    docs_chunked,
    index_name=index_name,
    embedding=OpenAIEmbeddings(model="text-embedding-ada-002")
)


100%|██████████| 1/1 [01:38<00:00, 98.36s/it]

100%|██████████| 1/1 [00:01<00:00,  1.20s/it]


### Add additional static context

In [8]:
# TODO: In the future, build crawlers to do this for me from LinkedIn, my personal site, etc.
# Manually create documents with static content
# NOTE: this rebinds `docs`, replacing the list of loaded input files from the previous cell.
docs = [
    Document(
        page_content="I have career interests in software development, big data, and machine learning.",
    ),
    Document(
        page_content="I am specifically interested in working in the tech or finance industry as a software engineer, data engineer, machine learning engineer, or the equivalent."
    ),
    Document(
        page_content="I have completed my bachelors of science in engineering from UConn, my master in computer information technology (MCIT) from UPenn, and I plan to start my master of science in artificial intelligence (MSE-AI) from UPenn beginning in 2025.",
        metadata={"description": "education"}
    ),
]

# Create unique ids, upsert docs
# One random UUID per document; re-running this cell therefore inserts the same
# content again under new ids rather than overwriting the earlier vectors.
uuids = [str(uuid4()) for _ in docs]
vector_store.add_documents(
    documents=docs,
    ids=uuids
)

['425692e4-aadf-4dcc-80a1-b532aac9798d',
 '10465005-a6fb-4518-abb3-5120ebccec8e',
 '1eaaca05-ae74-4e06-a0cd-e16aafa71513']