In [1]:
from pinecone import Pinecone, ServerlessSpec
from google import genai
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import pymupdf
import re
import os
import time
import uuid
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [21]:
pc = Pinecone(os.getenv("PINECONE_API_KEY"))
index_name = "pdf-index"

if not pc.has_index(index_name):
  pc.create_index(
        name=index_name,
        dimension=768,
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws",
        region="us-east-1",
        )
    )
  while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

print("Index Created")

Index Created


In [22]:
path = "../data/paper.pdf"
out_path = "../data/paper_extract.txt"
with pymupdf.open(path) as doc, open(out_path, "wb") as out:
  for page in doc:
    text = page.get_text()
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    text = text.encode("utf8")
    out.write(text)

In [23]:
text_splitter = RecursiveCharacterTextSplitter(
  chunk_size=600,
  chunk_overlap=100,
  length_function=len,
  is_separator_regex=False,
)

with open(out_path) as file:
    text = file.read()
    texts = text_splitter.create_documents([text])
    text_chunks = [doc.page_content for doc in texts]

In [24]:
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
result = client.models.embed_content(
  model="models/text-embedding-004",
  contents=text_chunks)
vector_embeddings = [embedding.values for embedding in result.embeddings]

In [27]:
vectors = [
  {
    "id": str(uuid.uuid4()),
        "values": embedding,
        "metadata": {
            "content": text_chunks[i], 
            "pdf_name": "paper.pdf", 
            "chunk_index": i
        }
  }
  for i, embedding in enumerate(vector_embeddings)
]

print(vectors[0])

{'id': '2a3f46e8-34de-49d1-9ce6-32cde8bbea2b', 'values': [-0.051067423, -0.03582339, -0.08651791, 0.0147862, -0.032526013, 0.05366, 0.048879657, 0.010730737, 0.024282679, -0.043053467, -0.005162531, 0.014476969, 0.063402005, -0.011981444, 0.019957483, -0.0033598228, 0.040231973, 0.041759446, -0.09804623, -0.043375775, 0.007764666, -0.02279886, 0.021457028, 0.008559897, -0.019216292, -0.07219846, 0.04588406, -0.017176185, 0.037436947, 0.0024778314, -0.009818835, 0.033484615, 0.020139117, 0.046672374, -0.04642878, 0.07500865, -0.0024762554, -0.024331965, 0.026312184, -0.04170222, -0.07530702, -0.015988518, 0.007664398, 0.034783166, -0.03740241, 0.0067540472, -0.0046870513, 0.05297371, -0.07071796, 0.03424831, 0.051982325, 0.013874319, -0.02311174, 0.023018716, 0.014084615, 0.014591595, 0.0028690777, 0.021666305, 0.018281253, -0.023100324, -0.0018501873, 0.058921985, -0.044759437, -0.031448536, -0.009587412, -0.004576102, -0.016328506, -0.037361626, -0.0347865, 0.024265705, 0.0042660595, 

In [28]:
index = pc.Index(index_name)
index.upsert(vectors)
print("Vectors Upserted")

Vectors Upserted
