# In this notebook I will Try to connect the rag model for pdf ingestion and use pinecone as vector database


In [1]:
# We will be using these PDF loaders but you can check out other loaded documents
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from rich import print
import re
from transformers import AutoTokenizer
from dotenv import load_dotenv
import os
from pinecone import Pinecone, ServerlessSpec

os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY")

load_dotenv()

EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L6-v2"

# This is the name of the report which should be in the directory
# You can download the precise PDF I am using from here https://www.pc.gov.pk/uploads/archives/PSDP_2023-24.pdf

name = '../data/raw/Easy_recipes.pdf'

# This loader uses PyMuPDF
loader_py = PyMuPDFLoader(name)

# Storing the loaded documents as langChain Document object
pages_py = loader_py.load()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(pages_py[0].page_content)
print(len(pages_py))

In [3]:
# text splitter
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    # shows how to seperate
    separator="\n",
    # Shows the document token length
    chunk_size=1000,
    # How much overlap should exist between documents
    chunk_overlap=150,
    # How to measure length
    length_function=len
)

# Applying the splitter
docs = text_splitter.split_documents(pages_py)

print(docs[0])
print(len(docs))

In [4]:
def clean_text(text):
    # Remove non-printable characters
    text = re.sub(r'[^\x20-\x7E]', '', text)
    
    # Remove excessive spaces, newlines, and tabs
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove unwanted special characters but keep medical symbols
    text = re.sub(r'[^\w\s.,:;/%°±()\-]', '', text)  
    text = re.sub(r'[\t\r\xa0]', ' ', text)  # Replace \t, \r, and \xa0 with a space
    text = re.sub(r'\n+', '\n', text)  # Keep meaningful line breaks
    text = re.sub(r' +', ' ', text).strip()  # Remove excessive space

    return text  

In [6]:
from langchain_core.documents import Document
docs = [Document(clean_text(doc.page_content),metadata=doc.metadata) for doc in docs]

In [7]:
print(docs[0])

In [8]:
embeddings=AutoTokenizer.from_pretrained(EMBEDDING_MODEL)

In [9]:
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
print(pinecone_api_key)
pc = Pinecone(api_key=pinecone_api_key)

In [10]:
import time

index_name = "test-index-legal"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
print(existing_indexes)

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=3072,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)
print(index)

In [11]:
index.upsert(items=docs)


TypeError: Index.upsert() missing 1 required positional argument: 'vectors'