In [1]:
from collections import defaultdict
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_pinecone import PineconeVectorStore
import re
from nltk.corpus import stopwords
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Notebook configuration: credentials, Pinecone client, index name and embedding model.
# Every name defined here is a notebook-level global used by later cells.
load_dotenv()
# NOTE: the Pinecone key is read from the 'PINECONE_API_KEY2' variable (with the
# trailing "2"); os.environ.get returns None when the variable is not set.
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY2')
# GOOGLE_API_KEY is read here but not referenced by any of the cells shown below.
GOOGLE_API_KEY=os.environ.get('GOOGLE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)

# Name of the Pinecone index that the create-index and upload cells operate on.
index_name = "research-paper-llm-db5"

# Embedding model handed to PineconeVectorStore when chunks are uploaded.
embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [None]:
# Create the serverless index only when it does not exist yet, so that this cell
# can be re-run (Restart Kernel -> Run All) without failing on an index that was
# already created by a previous run.
# `dimension` must equal the vector size produced by `embeddings`
# (384 presumably matches all-MiniLM-L6-v2 — confirm if the model is changed).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [3]:
# Extract Data From a Single PDF File
def load_pdf_file(file_path: str):
    """Load one PDF with PyPDFLoader and return whatever its ``load()`` yields.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The result of ``PyPDFLoader(file_path).load()``.
    """
    return PyPDFLoader(file_path).load()

# Extract Data From Directory
def load_pdfs_from_directory(directory_path: str):
    """Load every file matching ``*.pdf`` in a directory.

    Args:
        directory_path: Directory handed to DirectoryLoader.

    Returns:
        The result of the DirectoryLoader's ``load()``; each matched file is
        read with PyPDFLoader (passed as ``loader_cls``).
    """
    pdf_loader = DirectoryLoader(
        directory_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    return pdf_loader.load()
# Compiled pattern for the default NLTK English list; built lazily on first use
# so the (large) alternation is not re-assembled on every clean_text call.
_DEFAULT_STOPWORD_PATTERN = None


def _compile_stopword_pattern(words):
    """Build one case-insensitive whole-word regex for ``words`` (None if empty)."""
    # Normalise the stop words exactly like the text (punctuation stripped), so
    # entries such as "don't" still match the already-stripped form "dont".
    normalised = {re.sub(r"[^a-zA-Z0-9\s]", "", word).strip() for word in words}
    normalised.discard("")
    if not normalised:
        return None
    ordered = sorted(normalised, key=lambda word: (-len(word), word))
    return re.compile(r"\b(?:" + "|".join(map(re.escape, ordered)) + r")\b", re.IGNORECASE)


def clean_text(text, stop_words=None):
    """
    Cleans the extracted text by:
    - Removing non-alphanumeric characters (except whitespace).
    - Removing the literal markers "Page <number>", "Header Text" and "Footer Text".
    - Removing stop words as whole words, ignoring case ("The" is removed like "the").
    - Collapsing runs of whitespace into single spaces and trimming the ends.

    Args:
        text: Raw text to clean. Empty or None input yields "".
        stop_words: Optional iterable of words to remove. When omitted, the
            NLTK English stop-word list (``stopwords.words("english")``) is used.

    Returns:
        The cleaned text as a single-line string.
    """
    global _DEFAULT_STOPWORD_PATTERN

    if not text:
        return ""

    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove non-alphanumeric characters (except spaces)
    text = re.sub(r"Page \d+|Header Text|Footer Text", "", text)  # Remove page num, headers/footers

    if stop_words is None:
        if _DEFAULT_STOPWORD_PATTERN is None:
            _DEFAULT_STOPWORD_PATTERN = _compile_stopword_pattern(stopwords.words("english"))
        pattern = _DEFAULT_STOPWORD_PATTERN
    else:
        pattern = _compile_stopword_pattern(stop_words)

    if pattern is not None:
        text = pattern.sub("", text)

    return re.sub(r"\s+", " ", text).strip()


# Converts document pages to a dictionary with source(pdf file path) as key
def group_docs_by_path(documents):
    """
    Merge per-page documents into one record per source file.

    Args:
        documents: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (a string), in reading order.

    Returns:
        A mapping ``source -> record`` where each record is a dict with:
        - 'title' / 'total_pages': taken from the metadata of the first page
          seen for that source (None when absent),
        - 'page_content': the page texts joined with a newline,
        - 'cleaned_page_content': ``clean_text`` applied to 'page_content'.
        Pages whose metadata has no 'source' entry are grouped under 'unknown'.
    """
    pages_by_source = defaultdict(list)
    for doc in documents:
        # Fallback in case source is missing, instead of raising KeyError.
        source = doc.metadata.get('source', 'unknown')
        pages_by_source[source].append(doc)

    final_grouped_docs = defaultdict(list)
    for source, pages in pages_by_source.items():
        first_page_meta = pages[0].metadata
        # Join pages with a newline: concatenating them directly would glue the
        # last word of one page to the first word of the next and would hide
        # section headers that sit at the top of a page from newline-anchored
        # header matching.
        page_content = '\n'.join(page.page_content for page in pages)
        final_grouped_docs[source] = {
            'title': first_page_meta.get('title', None),
            'total_pages': first_page_meta.get('total_pages', None),
            'page_content': page_content,
            'cleaned_page_content': clean_text(page_content),
        }

    return final_grouped_docs

In [4]:
# A section header is a line holding an optional one/two digit number, an
# optional dot, and one of the known titles. The newline before the header is
# part of the match (or the header starts the text); the end of the line is only
# looked at, not consumed, so a header directly followed by another header is
# still recognised.
_SECTION_HEADER_RE = re.compile(
    r"(?:\A|\n)\d{0,2}\.?\s*"
    r"(abstract|introduction|toolkit overview|toolkit usage|related work|experiments"
    r"|methodology|results?|conclusions?|references)"
    r"[^\S\n]*(?=\n|\Z)",
    re.IGNORECASE,
)


def split_by_section(text):
    '''
        Splits the text into sections based on common section headers.

        Args:
            text: Full document text, with headers on their own lines.

        Returns:
            A list of ``(title, section_text)`` tuples in document order.
            ``title`` is the matched header line (stripped) and ``section_text``
            runs from that header up to the next header (or the end of text).
            Non-empty text before the first header is returned as a leading
            ("preamble", ...) entry, and a text without any recognised header
            is returned as a single ("document", ...) entry, so no content is
            silently dropped. Empty or whitespace-only input returns [].
    '''
    if not text or not text.strip():
        return []

    matches = list(_SECTION_HEADER_RE.finditer(text))
    if not matches:
        return [("document", text.strip())]

    sections = []
    preamble = text[:matches[0].start()].strip()
    if preamble:
        sections.append(("preamble", preamble))

    for i, match in enumerate(matches):
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        sections.append((match.group().strip(), text[match.start():end].strip()))
    return sections


def chunk_section(section_text, section_title, doc_title, source, chunk_size=1000, chunk_overlap=40):
    """
        Cuts one section into chunks and wraps each chunk in a Document.

        Args:
            section_text: Text of the section to split.
            section_title: Stored under the 'section' metadata key of every chunk.
            doc_title: Stored under the 'title' metadata key of every chunk.
            source: Stored under the 'source' metadata key of every chunk.
            chunk_size: Forwarded to RecursiveCharacterTextSplitter.
            chunk_overlap: Forwarded to RecursiveCharacterTextSplitter.

        Returns:
            A list with one Document per chunk returned by the splitter, in order.
    """
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    ).split_text(section_text)

    # Each Document gets its own metadata dict so chunks never share state.
    return [
        Document(
            page_content=piece,
            metadata={'section': section_title, 'title': doc_title, 'source': source},
        )
        for piece in pieces
    ]

def chunk_all_sections(doc_map):
    """
        Adds 'sections' and 'chunks' to every record of ``doc_map``.

        For each source, the record's 'page_content' is split with
        ``split_by_section``; every section is then chunked with
        ``chunk_section`` using the lower-cased, single-line section title.
        The records are updated in place and the same mapping is returned.
    """
    for source in doc_map:
        record = doc_map[source]
        sections = split_by_section(record['page_content'])
        record['chunks'] = [
            chunk
            for heading, body in sections
            for chunk in chunk_section(
                body,
                heading.strip().lower().replace("\n", " ").strip(),
                record['title'],
                source,
            )
        ]
        record['sections'] = sections

    return doc_map

In [5]:
def push_to_vector_db(document_map):
    """
    Upload the chunks of every document record to the Pinecone index.

    Uses the notebook-level ``index_name`` and ``embeddings`` globals and makes
    one ``PineconeVectorStore.from_documents`` call per document record.

    Args:
        document_map: Mapping ``source -> record``; only records with a
            non-empty 'chunks' list are uploaded.

    Returns:
        None.
    """
    for doc in document_map.values():
        text_chunks = doc.get('chunks')
        # Skip records that were never chunked or produced no chunks: there is
        # nothing to embed, so no vector-store call is made for them.
        if not text_chunks:
            continue
        PineconeVectorStore.from_documents(
            documents=text_chunks,
            index_name=index_name,
            embedding=embeddings,
        )

In [6]:
# Initial bulk load: read every PDF found under ../data_rp/
# (path is relative to the notebook's working directory).
documents = load_pdfs_from_directory(directory_path='../data_rp/')

# To ingest a single new file instead, load it with load_pdf_file:
# extracted_data = load_pdf_file(file_path='../data_rp/RP1.pdf')


# Pipeline: group the loaded pages per source file, split each file into
# sections and chunks (chunk_all_sections updates and returns the same mapping),
# then upload the chunks to the Pinecone index.
doc_map = group_docs_by_path(documents)
doc_map = chunk_all_sections(doc_map)
push_to_vector_db(doc_map)