In [12]:
# !pip install pypdf
# pip install pandas 
# !pip install pdfplumber
# !pip install docling 
# !pip install "camelot-py[base]"

In [9]:
from pypdf import PdfReader
import os 
import pdfplumber
import pandas as pd 
import camelot


### Extract tables from a PDF using pdfplumber

In [41]:


def extract_tables_from_pdf(pdf):
    """Save one table per page of a PDF as CSV files in ``extracted_tables/``.

    For every page, ``extract_table()`` is asked for a table. When it returns
    a non-empty result, the first row is used as the column header, the
    remaining rows as data, and the frame is written (without the index) to
    ``extracted_tables/table_page_<i>.csv``, where ``<i>`` is the 0-based
    page index.

    Args:
        pdf: Passed unchanged to ``pdfplumber.open``; the notebook calls
            this with a path string.
    """
    output_dir = "extracted_tables"
    os.makedirs(output_dir, exist_ok=True)  # create the target folder on first use

    # A separate name for the opened document keeps the `pdf` argument intact.
    with pdfplumber.open(pdf) as document:
        for page_index, page in enumerate(document.pages):
            rows = page.extract_table()
            if not rows:
                # Nothing detected on this page -> no CSV is produced for it.
                continue

            header, *body = rows
            frame = pd.DataFrame(body, columns=header)
            destination = os.path.join(output_dir, f"table_page_{page_index}.csv")
            frame.to_csv(destination, index=False)

                
           


    
# extract_tables_from_pdf()

### Extract text and images from a PDF

In [42]:

def extract_data_from_pdf(pdf):
    """Extract text, embedded images and tables from a PDF into local folders.

    * Text of every page is written to ``extracted_text/extracted_text.txt``;
      each page is introduced by a ``Page N`` line (1-based).
    * Every image exposed by ``page.images`` is saved to
      ``extracted_images/page_<N>_image_<M><ext>`` (both counters 1-based).
    * Finally ``extract_tables_from_pdf`` is run on the same input.

    Args:
        pdf: Passed unchanged to ``PdfReader`` and to
            ``extract_tables_from_pdf``; the notebook calls this with a
            path string.
    """
    images_folder = "extracted_images"
    text_folder = "extracted_text"

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    for folder in (images_folder, text_folder):
        os.makedirs(folder, exist_ok=True)

    reader = PdfReader(pdf)
    text_file_path = os.path.join(text_folder, "extracted_text.txt")
    with open(text_file_path, "w", encoding="utf-8") as fp:
        for page_number, page in enumerate(reader.pages, start=1):
            fp.write(f"Page {page_number}\n")
            # Guard against extract_text() yielding nothing for a page.
            fp.write(page.extract_text() or "")
            fp.write("\n\n")

            for count, image_file_object in enumerate(page.images, start=1):
                # The bytes in `.data` are in the format named by `.name`
                # (e.g. ".jpg"), so reuse that extension instead of always
                # labelling the file ".png". Fall back to ".png" if the name
                # carries no extension.
                extension = os.path.splitext(image_file_object.name)[1] or ".png"
                image_file_path = os.path.join(
                    images_folder, f"page_{page_number}_image_{count}{extension}"
                )
                with open(image_file_path, "wb") as img_fp:
                    img_fp.write(image_file_object.data)

    extract_tables_from_pdf(pdf)
    


In [43]:
# Run the pypdf-based extraction on the sample PDF. Results are written
# relative to the current working directory: extracted_text/,
# extracted_images/ and (through extract_tables_from_pdf) extracted_tables/.
extract_data_from_pdf("./data/test.pdf")


### Extract text and images using PyMuPDF

In [22]:
import fitz  
import os
import pandas as pd

def extract_pdf_content(pdf_path, output_folder):
    """Extract page text and embedded images from a PDF with PyMuPDF.

    Text of all pages is collected and written to
    ``<output_folder>/extracted_text.txt``, each page preceded by a
    ``--- Page N ---`` line (1-based). Every image listed by
    ``page.get_images(full=True)`` is written to
    ``<output_folder>/page_<N>_image_<M>.<ext>``, where ``<ext>`` is the
    extension reported by ``extract_image`` for that image.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.
        output_folder: Directory receiving the text file and the images;
            created if it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)
    # Images are stored next to the text file, directly in output_folder.
    images_folder = output_folder

    text_data = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf_document:
        for page_index in range(len(pdf_document)):
            page = pdf_document[page_index]
            page_number = page_index + 1

            text = page.get_text()
            text_data.append(f"--- Page {page_number} ---\n{text}\n")

            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first entry of the image tuple is its xref
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                # Use the extension reported for the extracted bytes rather
                # than a hard-coded ".png", so the suffix matches the content.
                image_ext = base_image["ext"]
                image_file_path = os.path.join(
                    images_folder, f"page_{page_number}_image_{img_index}.{image_ext}"
                )
                with open(image_file_path, "wb") as img_file:
                    img_file.write(image_bytes)

    text_file_path = os.path.join(output_folder, "extracted_text.txt")
    with open(text_file_path, "w", encoding="utf-8") as text_file:
        text_file.writelines(text_data)


# Usage: extract text and images from the sample PDF; the text file and all
# images are written into the "pymupdf" folder (created if missing).
extract_pdf_content("./data/NFC.pdf", "pymupdf")


#### Extract text, images and tables from a PDF using pdfplumber

In [19]:

def extract_text_img_tables(pdf_path):
    """Extract tables, text and images from a PDF using only pdfplumber.

    Outputs, relative to the current working directory:

    * ``extracted_tables/table_page_<i>.csv`` - the table returned by
      ``extract_table()`` for a page, first row used as header. ``<i>`` is
      the 0-based page index, the same naming ``extract_tables_from_pdf``
      uses.
    * ``extracted_text/extracted_text.txt`` - text of every page that has
      any, each preceded by a ``--- Page N ---`` line (1-based).
    * ``extracted_images/page_<N>_image_<M>.png`` - the stream data of each
      entry in ``page.images`` (both counters 1-based).

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.
    """
    tables_folder = "extracted_tables"
    text_folder = "extracted_text"
    images_folder = "extracted_images"

    # Create every output folder up front; previously only the tables folder
    # was created, so writing text or images failed on a fresh directory.
    for folder in (tables_folder, text_folder, images_folder):
        os.makedirs(folder, exist_ok=True)

    # The output is plain text, so it gets a .txt name (it was ".pdf" before).
    text_file_path = os.path.join(text_folder, "extracted_text.txt")

    with pdfplumber.open(pdf_path) as pdf, \
            open(text_file_path, "w", encoding="utf-8") as text_file:
        for page_index, page in enumerate(pdf.pages):
            page_number = page_index + 1  # human-readable, 1-based

            table = page.extract_table()
            if table:
                df = pd.DataFrame(table[1:], columns=table[0])
                csv_path = os.path.join(tables_folder, f"table_page_{page_index}.csv")
                df.to_csv(csv_path, index=False)

            text = page.extract_text()
            if text:
                text_file.write(f"--- Page {page_number} ---\n")
                text_file.write(text)
                text_file.write("\n\n")

            for count, image in enumerate(page.images, start=1):
                image_data = image["stream"].get_data()
                if not image_data:
                    continue
                # page_number is already 1-based; adding 1 again labelled
                # images of page 1 as "page_2".
                # NOTE(review): the stream bytes are written as-is and may not
                # be PNG-encoded; the ".png" suffix is kept unchanged - confirm.
                image_file_path = os.path.join(
                    images_folder, f"page_{page_number}_image_{count}.png"
                )
                with open(image_file_path, "wb") as img_fp:
                    img_fp.write(image_data)

            

In [20]:
# Run the pdfplumber-only extraction on the sample PDF; it writes into
# extracted_tables/, extracted_text/ and extracted_images/.
extract_text_img_tables("./data/test.pdf")