In [1]:
import os

# Folder scanned for PDF files. Keep the trailing separator: file names are
# appended to this path when the documents are loaded.
FILE_PATH = "C:/Ambarish/PINECONE/docs/"

# Model Settings
# Name handed to SentenceTransformer to load the embedding model.
MODEL_NAME="all-MiniLM-L6-v2"

# Pinecone Settings
# SECURITY: the API key used to be hardcoded here. It is now read from the
# PINECONE_API_KEY environment variable so the secret never lives in the
# notebook. The previously committed key should be treated as leaked and
# revoked. An empty value means no key has been configured.
PINECONE_API_KEY=os.environ.get("PINECONE_API_KEY", "")
PINECONE_INDEX_NAME="docs"
PINECONE_REGION="us-east1-gcp"
# Stored as the 'category' metadata field on every upserted vector.
CATEGORY="ncert"

In [2]:
from PyPDF2 import PdfReader
import pinecone
from sentence_transformers import SentenceTransformer
import os
import uuid 
from nltk.tokenize import sent_tokenize

  from tqdm.autonotebook import tqdm


In [3]:
def get_pdf_data(file_path, num_pages = 1):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        file_path: Path (or whatever ``PdfReader`` accepts) of the PDF to read.
        num_pages: Accepted for backward compatibility only. The function has
            always read *all* pages regardless of this value, and existing
            callers rely on that, so the argument is ignored.

    Returns:
        str: The text of all pages joined without a separator. If extracting a
        page fails, "Error reading file" is printed and the text gathered from
        the pages before the failure is returned (best effort).

    Raises:
        Whatever ``PdfReader(file_path)`` raises when the file cannot be opened;
        only per-page extraction errors are handled here.
    """
    reader = PdfReader(file_path)
    pages = reader.pages
    page_texts = []

    try:
        for page in pages:
            # Guard against a page yielding None instead of a string, which
            # would otherwise abort extraction of the remaining pages.
            page_texts.append(page.extract_text() or "")
    except Exception:
        # Catch only Exception (not a bare except) and do not return from a
        # finally block, so KeyboardInterrupt/SystemExit still propagate.
        print("Error reading file")

    return "".join(page_texts)

In [4]:
def get_chunks(fulltext:str,chunk_length =500) -> list:
    """Split text into chunks of at most ``chunk_length`` characters.

    Each chunk is cut at the last '.' found within the first ``chunk_length``
    characters of the remaining text; that period is consumed and appears in
    neither chunk. When the window contains no period, the text is cut hard at
    ``chunk_length`` and no character is discarded.

    Args:
        fulltext: The text to split.
        chunk_length: Maximum chunk size in characters; must be >= 1.

    Returns:
        list: The chunks in order. The final element is whatever text remains
        (possibly an empty string), so the list is never empty.

    Raises:
        ValueError: If ``chunk_length`` is smaller than 1 (the loop could not
            make progress otherwise).
    """
    if chunk_length < 1:
        raise ValueError("chunk_length must be a positive integer")

    text = fulltext
    chunks = []
    while len(text) > chunk_length:
        last_period_index = text[:chunk_length].rfind('.')
        if last_period_index == -1:
            # No sentence boundary in the window: cut exactly at chunk_length.
            # Resume at the cut point itself so no character is skipped.
            chunks.append(text[:chunk_length])
            text = text[chunk_length:]
        else:
            chunks.append(text[:last_period_index])
            # +1 skips the period that served as the separator.
            text = text[last_period_index+1:]
    chunks.append(text)

    return chunks

In [5]:
def split_text_into_chunks(plain_text, max_chars=500):
    """Greedily pack lines into space-joined chunks of about ``max_chars``.

    Args:
        plain_text: Iterable of strings (lines). Note that passing a single
            string iterates over its characters.
        max_chars: Size budget per chunk, counted with one separator space
            after every line.

    Returns:
        list: Chunks with surrounding whitespace stripped. A line that is
        longer than ``max_chars`` on its own becomes a chunk by itself.
    """
    text_chunks = []
    current_chunk = ""
    for line in plain_text:
        if len(current_chunk) + len(line) + 1 <= max_chars:
            current_chunk += line + " "
        else:
            # Only flush when something has been accumulated; otherwise an
            # oversized first line would emit a spurious empty chunk.
            if current_chunk:
                text_chunks.append(current_chunk.strip())
            current_chunk = line + " "
    if current_chunk:
        text_chunks.append(current_chunk.strip())
    return text_chunks

In [6]:
# Configure the Pinecone client with the credentials and environment from the
# settings cell, then open a handle to the target index. `index` is the global
# that addData uses for its upsert calls.
pinecone.init(
    environment=PINECONE_REGION,
    api_key=PINECONE_API_KEY,
)
index = pinecone.Index(PINECONE_INDEX_NAME)

In [7]:
# Load the sentence-embedding model named by MODEL_NAME. `model` is the global
# whose encode() method addData calls to turn each text chunk into a vector.
model = SentenceTransformer(MODEL_NAME)

In [8]:
# Number of filler values appended to every embedding before upsert.
# NOTE(review): presumably 512 is the dimension of the Pinecone index and 384
# the size of the model's embeddings - confirm against the index configuration.
length = 512 - 384
# Constant padding (all ones) that addData concatenates onto each embedding.
list_custom = [1 for _ in range(length)]

In [9]:
def addData(corpusData,filename, batch_size=100):
    """Embed text chunks and upsert them into the Pinecone index in batches.

    Each chunk is encoded with the global ``model``, padded with the global
    ``list_custom`` values, given a fresh random UUID as its id, and stored
    with metadata ``{'sentence', 'category', 'filename'}``.

    Compared with upserting one vector per request, chunks are encoded and
    sent ``batch_size`` at a time, which cuts the number of model invocations
    and network round trips. Empty or whitespace-only chunks are skipped
    because they carry no retrievable content.

    Args:
        corpusData: Sequence of text chunks (strings).
        filename: Source file name recorded in each vector's metadata.
        batch_size: Maximum number of vectors per encode/upsert call (>= 1).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    chunks = [chunk for chunk in corpusData if chunk and chunk.strip()]

    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        # Encoding a list yields one embedding per chunk, in the same order.
        embeddings = model.encode(batch).tolist()
        vectors = [
            (str(uuid.uuid4()),
             embedding + list_custom,
             {'sentence': chunk,'category':CATEGORY,'filename':filename})
            for chunk, embedding in zip(batch, embeddings)
        ]
        index.upsert(vectors=vectors)

In [10]:
def insert_pinecone(filename):
    """Ingest a single PDF: extract its text, chunk it, and upsert the chunks.

    Progress (file name, character count, chunk count) is printed along the
    way.

    Args:
        filename: Path of the PDF to process; it is also passed to addData as
            the source file name.
    """
    print("Processing file: ", filename)

    document_text = get_pdf_data(filename)
    print("Number of characters: ", len(document_text))

    text_chunks = get_chunks(document_text)
    print("Number of chunks: ", len(text_chunks))

    addData(text_chunks, filename)

In [11]:
def insert_pinecone_file_path(file_path):
    """Ingest every PDF file located directly inside a directory.

    Files are processed in sorted name order so runs are deterministic, the
    extension check is case-insensitive (".pdf", ".PDF", ...), and paths are
    built with os.path.join so ``file_path`` works with or without a trailing
    separator.

    Args:
        file_path: Directory to scan (not recursive).

    Raises:
        OSError: Propagated from os.listdir when the directory cannot be read.
    """
    for filename in sorted(os.listdir(file_path)):
        if filename.lower().endswith(".pdf"):
            insert_pinecone(os.path.join(file_path, filename))

In [12]:
# Run the ingestion over the PDF files in the configured FILE_PATH folder.
# Each file is read, chunked, embedded and upserted; progress is printed per file.
insert_pinecone_file_path(FILE_PATH)

Processing file:  C:/Ambarish/PINECONE/docs/CHAP04AnimalKingdom.pdf
Number of characters:  31802
Number of chunks:  73
Processing file:  C:/Ambarish/PINECONE/docs/CHAP05StructuralOrganization.pdf
Number of characters:  27664
Number of chunks:  62
Processing file:  C:/Ambarish/PINECONE/docs/CHAP06AnatomyFloweringPlants.pdf
Number of characters:  13839
Number of chunks:  32
Processing file:  C:/Ambarish/PINECONE/docs/CHAP07StructuralOrganizationAnimals.pdf
Number of characters:  14701
Number of chunks:  33
Processing file:  C:/Ambarish/PINECONE/docs/CHAP08CellStructure.pdf
Number of characters:  38303
Number of chunks:  88
Processing file:  C:/Ambarish/PINECONE/docs/Chap09BioMolecules.pdf
Number of characters:  35027
Number of chunks:  81
Processing file:  C:/Ambarish/PINECONE/docs/Chap10CellCycle.pdf
Number of characters:  24648
Number of chunks:  60
Processing file:  C:/Ambarish/PINECONE/docs/Chap11PlantPhysiology.pdf
Number of characters:  45169
Number of chunks:  108
Processing file: