In [1]:
import os
import re
import pymupdf4llm
import json
import pandas as pd
import time
from uuid import uuid4
from pathlib import Path
from natsort import natsorted
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
#set API Key from OpenAI
openai_api_key= os.environ.get("OPENAI_API_KEY_SBR")

In [3]:
def load_pdf_paths(pdf_directory, base_persist_directory):
    """Load PDF file paths in natural order and assign persist directories in groups of three with local IDs (1,2,3) per batch."""
    
    files = [file for file in os.listdir(pdf_directory) if file.endswith(".pdf")]
    sorted_files = natsorted(files)  
    
    all_files = [str(Path(pdf_directory, file).as_posix()) for file in sorted_files]
    
    num_of_papers = len(all_files)  

    document_ids = list(range(1, num_of_papers + 1))

    persist_directories = [
        str(Path(base_persist_directory, f"db_{doc_id}").as_posix())
        for doc_id in document_ids
    ]
    
    df_pdf = pd.DataFrame({
        'Document_ID': document_ids, 
        'PDF_path': all_files, 
        'Vectordb_path': persist_directories
    })
    
    return df_pdf
    
def load_the_document(pdf_path):
    """Loads the document and extracts text content"""

    # Load document
    docs = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)

    # Extract metadata
    paper_title = docs[0]['metadata'].get('title', 'Unknown Title')
    num_pages = docs[0]['metadata'].get('page_count', 0)
    author = docs[0]['metadata'].get('author', 'Unknown Author')

    pages_content = []

    for i, page in enumerate(docs, start=1):
        page_text = remove_reference(page['text']).strip()
        pages_content.append({
            "page_number": i,
            "text": page_text
        })

    return pages_content, paper_title, author, num_pages


def remove_reference(pdf_text):
    """Removes references and acknowledgment sections, along with inline citations, from a given PDF text."""

    # Remove references and acknowledgment sections entirely
    pdf_text = re.split(r'(?i)\bReferences\b|\bAcknowledgment[s]?\b', pdf_text)[0].strip()

    # Remove inline citations like [1], [12-15], (Smith et al., 2021), etc.
    citation_patterns = [
        r'\[[^\]]*\d{1,4}\]',          # numeric citations like [1], [12], [ABC12]
        r'\([^\)]*et al\.,?\s*\d{4}[^\)]*\)',  # (Author, 2020)
        r'\(\s*\d{4}[a-z]?\s*\)',  # (2021a)
    ]

    for pattern in citation_patterns:
        pdf_text = re.sub(pattern, '', pdf_text)

    # Clean up extra spaces and blank lines
    pdf_text = '\n'.join(line.strip() for line in pdf_text.splitlines() if line.strip())

    return pdf_text


def embedding_document(pages_content, ID, persist_directory):
    """Embeds documents and stores them persistently in a vector database (Chroma), recording page number."""

    documents = []
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,   # small for better retrieval precision
        chunk_overlap=400,
        length_function=len,
        separators=["\n\n", "\n", "."]
    )

    document_id = ID

    for page in pages_content:
        page_number = page["page_number"]
        page_text = page["text"]

        # Split this page into smaller chunks
        page_chunks = text_splitter.create_documents([page_text])

        for chunk in page_chunks:
            documents.append(
                Document(
                    page_content=chunk.page_content,
                    metadata={
                        "page_number": page_number,
                        "document_id": document_id 
                    }
                )
            )

    num_chunks = len(documents)

    # Initialize embeddings
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

    vectordb = Chroma.from_documents(
        documents=documents,
        embedding=embeddings, 
        persist_directory=persist_directory
    )

    vectordb.persist()

    return num_chunks

def build_vectorDB(df_path):
    """Processes each PDF, extracts metadata, builds vector DB, and updates DataFrame."""
   
    success_status = []  # Track success for each PDF
    paper_titles = []  # Store paper titles
    author_list = []
    num_pages_list = []  # Store number of pages
    chunk_nums = []  # Store chunk numbers separately

    for _, row in df_path.iterrows():
        pdf_path = row["PDF_path"]
        ID = row["Document_ID"]
        persist_directory = row["Vectordb_path"]
        
        try:
            # Extract content, title, and page count
            docs, paper_title, author, num_pages = load_the_document(pdf_path)

            # Build the vector database
            num_chunks = embedding_document(docs, ID, persist_directory)  # Returns an integer

            success_status.append(True)

            # Store extracted metadata
            paper_titles.append(paper_title)
            author_list.append(author)
            num_pages_list.append(num_pages)
            chunk_nums.append(num_chunks) 

            print(f"Successfully processed {pdf_path}")
        
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
            success_status.append(False)  # Mark as failed

            # Store placeholders for failed cases
            paper_titles.append(None)
            author_list.append(None)
            num_pages_list.append(None)
            chunk_nums.append(None)  
            vectordb = None  # Ensure vectordb is always defined

    # Add extracted metadata to the DataFrame
    df_path["Paper_title"] = paper_titles
    df_path["Author"] = author_list
    df_path["Num_pages"] = num_pages_list
    df_path["Num_chunks"] = chunk_nums  
    df_path["Is_added_to_vectorDB"] = success_status

    return df_path

def df_to_csv(df, file_name):
    """Write a DataFrame to a CSV file"""
    df.to_csv(file_name, index=False, escapechar='\\')

In [3]:
# pdf_directory = 'C:/Users/89751/OneDrive/Desktop/LCA ontology/Ragas_evaluation/Paper'
pdf_directory = 'C:/Users/89751/OneDrive/Desktop/testing/'
base_persist_directory = 'C:/Users/89751/LangChain-Practise/New_code/Embedding/db29'
df_path = load_pdf_paths(pdf_directory, base_persist_directory)
df_path

Unnamed: 0,Document_ID,PDF_path,Vectordb_path
0,1,C:/Users/89751/OneDrive/Desktop/testing/22.pdf,C:/Users/89751/LangChain-Practise/New_code/Emb...


In [4]:
total_start_time = time.time()  # Start time tracking

df = build_vectorDB(df_path)

total_end_time = time.time()
print(f"\nTotal processing time: {total_end_time - total_start_time:.2f} seconds")

Error processing C:/Users/89751/OneDrive/Desktop/testing/22.pdf: name 'openai_api_key' is not defined

Total processing time: 9.26 seconds


In [5]:
df

Unnamed: 0,Document_ID,PDF_path,Vectordb_path,Paper_title,Author,Num_pages,Num_chunks,Is_added_to_vectorDB
0,1,C:/Users/89751/OneDrive/Desktop/testing/22.pdf,C:/Users/89751/LangChain-Practise/New_code/Emb...,,,,,False


In [6]:
file_path = 'C:/Users/89751/OneDrive/Desktop/LCA ontology/vectorDB.csv'
df_to_csv(df, file_path)

In [14]:
pdf_path = 'C:/Users/89751/OneDrive/Desktop/22.pdf'

docs = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)

In [15]:
docs

[{'metadata': {'format': 'PDF 1.7',
   'title': 'Environmental life cycle assessment of polypropylene made from used cooking oil',
   'author': 'Christian Moretti',
   'subject': 'Resources, Conservation & Recycling, 157 (2020) 104750. doi:10.1016/j.resconrec.2020.104750',
   'keywords': 'Polypropylene (PP); Bio-based chemicals; Used cooking oil (UCO); Multifunctionality; ',
   'creator': 'Elsevier',
   'producer': '',
   'creationDate': 'D:20200504213932Z',
   'modDate': 'D:20200504213932Z',
   'trapped': '',
   'encryption': None,
   'file_path': 'C:/Users/89751/OneDrive/Desktop/22.pdf',
   'page_count': 13,
   'page': 1},
  'toc_items': [[1,
    'Environmental life cycle assessment of polypropylene made from used cooking oil',
    1],
   [2, 'Introduction', 1]],
  'tables': [],
  'images': [{'number': 0,
    'bbox': (37.644100189208984,
     60.77508544921875,
     97.05830383300781,
     125.97198486328125),
    'transform': (59.41419982910156,
     0.0,
     -0.0,
     65.19689941