In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
import pprint
import pdfplumber
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.pdf import PyMuPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
import pytesseract
from PIL import Image


In [2]:
## LangChain and LangSmith tracing / Groq API key
# load_dotenv() in the imports cell has already copied the .env values into
# os.environ, so re-assigning them (os.environ[k] = os.getenv(k)) is a no-op
# when the key exists and an opaque "TypeError: str expected, not NoneType"
# when it does not. Validate up front instead and name what is missing.
_REQUIRED_ENV_VARS = ("LANGCHAIN_API_KEY", "LANGCHAIN_PROJECT", "GROQ_API_KEY")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if os.getenv(name) is None]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in your .env file or shell before running this notebook."
    )

# Turn on LangSmith tracing for every LangChain call made from this kernel.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [4]:
# Input PDF to ingest; a relative path, so it is resolved against the kernel's working directory.
pdf_path = "Generative AI with LangChain (2024).pdf"
# output_md_path = "output.md"
# output_metadata_path = "metadata.json"
# Folder that receives the page-image crops written by extract_elements_from_pdf().
image_folder = "images"
# Create the folder up front (no error if it already exists) so image saves cannot fail on a missing directory.
os.makedirs(image_folder, exist_ok=True)

In [None]:
# loader=PyPDFLoader(pdf_path)
# docs=loader.load()
# # docs

In [None]:
# poppler_path = 'E:\\Release-24.08.0-0\\poppler-24.08.0\\Library\\bin'
# os.environ["PATH"] += os.pathsep + poppler_path

In [5]:
# Make the Tesseract-OCR install directory discoverable through PATH.
# - The original raw string r'C:\\Program Files\\...' kept the doubled
#   backslashes literally; a raw string needs single backslashes.
# - TESSERACT_DIR lets other machines override the Windows default without
#   editing the notebook.
# - The membership check keeps the cell idempotent: re-running it no longer
#   appends a duplicate PATH entry each time.
tesseract_dir = os.getenv("TESSERACT_DIR", r'C:\Program Files\Tesseract-OCR')
_current_path = os.environ.get('PATH', '')
if tesseract_dir not in _current_path.split(os.pathsep):
    os.environ['PATH'] = (
        _current_path + os.pathsep + tesseract_dir if _current_path else tesseract_dir
    )

In [6]:
def _append_text_documents(pdf, source, documents):
    """Append one text Document per page that has non-blank extractable text."""
    for page_num, page in enumerate(pdf.pages):
        # Guard against a None result so pages without a text layer are skipped, not fatal.
        text = page.extract_text() or ""
        if text.strip():
            metadata = {
                "source": source,
                "page": page_num + 1,
                "type": "text"
            }
            documents.append(Document(page_content=text, metadata=metadata))


def _append_table_documents(pdf, source, documents):
    """Append one Document per non-empty table, flattened to tab/newline separated text."""
    for page_num, page in enumerate(pdf.pages):
        for table_num, table_data in enumerate(page.extract_tables()):
            if not table_data:  # Skip None or empty tables
                continue
            # Empty cells come back as None; render them as "" rather than the text "None".
            rows = [
                "\t".join("" if cell is None else str(cell) for cell in row)
                for row in table_data if row
            ]
            table_content = "\n".join(rows)
            if table_content.strip():  # Only add non-empty tables
                metadata = {
                    "source": source,
                    "page": page_num + 1,
                    "table_num": table_num + 1,
                    "type": "table"
                }
                # Add a header to the table content to distinguish it from page text
                documents.append(Document(page_content=f"Table {table_num+1} on page {page_num+1} contains:\n{table_content}", metadata=metadata))


def _append_image_documents(pdf, source, documents, output_dir):
    """Crop each embedded image to a PNG in output_dir, OCR it and append an image Document."""
    for page_num, page in enumerate(pdf.pages):
        for image_num, image in enumerate(page.images):
            if not image:  # Skip None or empty image records
                continue
            try:
                # Flip the image's y0/y1 against the page's cropbox top edge to build the crop box.
                bbox = [image['x0'], page.cropbox[3]-image['y1'], image['x1'], page.cropbox[3]-image['y0']]
                img_obj = page.crop(bbox=bbox).to_image(resolution=500)
                # File name uses the same 1-based page number as the metadata below.
                image_name = f'{page_num + 1}-{image_num + 1}.png'
                image_path = os.path.join(output_dir, image_name)
                img_obj.save(image_path)
                # Context manager closes the file handle once OCR is done.
                with Image.open(image_path) as saved_image:
                    image_content = pytesseract.image_to_string(saved_image, lang='eng')
            except Exception as e:
                # One bad image (e.g. a crop box outside the page, OCR failure) must not
                # discard everything else that was extracted.
                print(f"Skipping image {image_num + 1} on page {page_num + 1} of '{source}': {e}")
                continue
            metadata = {
                "source": source,
                "page": page_num + 1,
                "image_num": image_num + 1,
                "type": "image",
                "image_path": image_path
            }
            # Add a header to the OCR text to distinguish it from page text
            documents.append(Document(page_content=f"Image {image_num+1} on page {page_num+1} contains:\n{image_content}", metadata=metadata))


def extract_elements_from_pdf(pdf_path, image_dir=None):
    """Extracts texts, table contents and images from a PDF.

    The PDF is opened once and walked three times so the returned list keeps
    the order: all page texts, then all tables, then all images.

    Args:
        pdf_path: Path of the PDF file to read.
        image_dir: Folder that receives the cropped page images. Defaults to
            the notebook-level ``image_folder``; created if it does not exist.

    Returns:
        list[Document]: The documents collected. Extraction is best-effort: if
        an error interrupts it, the error is printed and whatever was gathered
        up to that point (possibly an empty list) is returned, so callers
        always receive a list rather than None.
    """
    source = os.path.basename(pdf_path)
    documents = []
    try:
        print(f"Extracting text contents from '{source}'")
        with pdfplumber.open(pdf_path) as pdf:
            _append_text_documents(pdf, source, documents)

            print(f"Extracting table contents from '{source}'")
            _append_table_documents(pdf, source, documents)

            print(f"Extracting images from '{source}'")
            if image_dir is None:
                image_dir = image_folder
            os.makedirs(image_dir, exist_ok=True)
            _append_image_documents(pdf, source, documents, image_dir)
    except Exception as e:
        print(f"Error extracting details from {pdf_path}: {e}")
    return documents

In [7]:
# Run the extraction over the whole PDF; the result mixes "text", "table" and "image" documents.
raw_docs = extract_elements_from_pdf(pdf_path)

Extracting text contents from 'Generative AI with LangChain (2024).pdf'
Extracting table contents from 'Generative AI with LangChain (2024).pdf'
Extracting images from 'Generative AI with LangChain (2024).pdf'


In [8]:
# Spot-check the first extracted document (content and metadata).
raw_docs[0]

Document(metadata={'source': 'Generative AI with LangChain (2024).pdf', 'page': 2, 'type': 'text'}, page_content='Generative AI with LangChain\nBuild large language model (LLM) apps with Python,\nChatGPT, and other LLMs\nBen Auffarth\nBIRMINGHAM—MUMBAI')

In [9]:
from langchain_experimental.text_splitter import SemanticChunker

# Sentence-embedding model the splitter uses to compare neighbouring sentences.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# "percentile" mode with amount 90: per the LangChain SemanticChunker docs, a chunk
# boundary is placed where the distance between adjacent sentences exceeds the
# 90th percentile of all such distances in the document.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type='percentile',
    breakpoint_threshold_amount=90,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Only page text goes through the semantic splitter; table and image
# documents are appended afterwards unsplit.
text_docs = [doc for doc in raw_docs if doc.metadata['type'] == 'text']
non_text_docs = [doc for doc in raw_docs if doc.metadata['type'] != 'text']

chunks = text_splitter.split_documents(text_docs)
chunks.extend(non_text_docs)

In [47]:
# Total number of chunks: split text chunks plus the unsplit table/image documents.
len(chunks)

984