In [None]:
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec, Index

# Banner line so the cell output identifies which notebook was run.
print("Embedding + Upsert Notebook")

In [None]:
# 1) Configuration: the API key is read from the environment so it never
#    appears in the notebook; INDEX_NAME is the index this notebook targets.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
INDEX_NAME = "chatbot-index"

# 2) Control-plane client, used to list / create / describe indexes.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it is missing, so re-running this cell is safe.
existing_index_names = pc.list_indexes().names()
if INDEX_NAME in existing_index_names:
    print(f"Index '{INDEX_NAME}' already exists.")
else:
    # NOTE(review): dimension must equal the embedding size of the model
    # loaded in step 4 (presumably 384 for all-MiniLM-L6-v2 — confirm).
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=INDEX_NAME,
        dimension=384,
        metric="cosine",
        spec=serverless_spec,
    )
    print(f"Index '{INDEX_NAME}' created!")

# 3) Look up the index host and open a data-plane handle for upserts/queries.
desc = pc.describe_index(INDEX_NAME)
host = desc.host
index = Index(api_key=PINECONE_API_KEY, host=host)

# 4) Sentence-embedding model shared by the upsert and query cells.
model = SentenceTransformer('all-MiniLM-L6-v2')

# 5) Helper for chunking
def chunk_text(text, chunk_size=2000, overlap=200):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk begins with the last ``overlap`` characters of the previous
    one. The final chunk may be shorter than ``chunk_size``.

    Args:
        text: String to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap >= chunk_size gives a zero or negative step, so the window
    # would never advance (endless loop); a negative overlap would silently
    # skip text between chunks. Reject both up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# 6) Upsert function
def upsert_folder(folder_path, file_extension, chunk_size=2000, overlap=200, batch_size=100):
    """Chunk, embed and upsert every matching file in a folder.

    Each file in ``folder_path`` whose name ends with ``file_extension`` is
    read as UTF-8, split with ``chunk_text``, embedded with the module-level
    ``model`` and written to the module-level ``index``. Chunks are encoded
    and upserted in batches rather than one request per chunk, which avoids
    a network round trip for every vector.

    Record layout (unchanged from the per-chunk version):
        id       -> "<file_name>_chunk_<i>"
        metadata -> {"text", "original_file", "chunk_index"}

    Args:
        folder_path: Directory to scan (not recursive).
        file_extension: Filename suffix to match, e.g. ``'.txt'``.
        chunk_size: Passed through to ``chunk_text``.
        overlap: Passed through to ``chunk_text``.
        batch_size: Maximum number of chunks encoded and upserted per
            request; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Sorted so files are processed in a reproducible order across runs.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(file_extension):
            continue

        full_path = os.path.join(folder_path, file_name)
        with open(full_path, 'r', encoding='utf-8') as f:
            content = f.read()

        text_chunks = chunk_text(content, chunk_size, overlap)

        # An empty file yields no chunks, so this loop makes no calls at all.
        for batch_start in range(0, len(text_chunks), batch_size):
            batch = text_chunks[batch_start:batch_start + batch_size]
            # Encoding a list returns one embedding per input, in input order.
            vectors = model.encode(batch).tolist()
            records = []
            for offset, (chunk, vec) in enumerate(zip(batch, vectors)):
                chunk_index = batch_start + offset
                records.append((
                    f"{file_name}_chunk_{chunk_index}",
                    vec,
                    {
                        "text": chunk,
                        "original_file": file_name,
                        "chunk_index": chunk_index,
                    },
                ))
            index.upsert(records)

        print(f"Upserted {len(text_chunks)} chunks for {file_name}")

In [None]:
# 7) Run the upsert over each processed folder, pairing every folder with
#    the file extension it holds.
TXT_DIR = "./files/processed_txt_files"
M_DIR = "./files/processed_m_files"

for source_dir, extension in ((TXT_DIR, '.txt'), (M_DIR, '.m')):
    upsert_folder(source_dir, extension)

print("All chunks upserted!")

In [None]:
# 8) Optional smoke test: embed one query, fetch the 3 nearest chunks and
#    print each match's id, score and the start of its stored text.
test_query = "Explain how wing rotation works for insect flight"
test_vec = model.encode(test_query).tolist()
res = index.query(vector=test_vec, top_k=3, include_metadata=True)

print(f"Test Query: {test_query}\n")
for match in res["matches"]:
    cid, sc = match["id"], match["score"]
    print(f"ID: {cid}, Score: {sc}")
    if "metadata" not in match:
        print("No metadata.")
        continue
    # Only the first 150 characters are shown to keep the output short.
    snippet = match["metadata"].get("text", "")
    print(f"Snippet start: {snippet[:150]}\n")