In [12]:
import os
import pandas as pd
from tika import parser
import fitz
from PIL import Image
import pytesseract
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
import re 

In [13]:
BATCHES_FOLDER = r'C:/Users/nanadhirah/Desktop/important/legislation/batches_csv'
PDF_FOLDER = r'C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf'
#os.makedirs(BATCHES_FOLDER, exist_ok=True)
pytesseract.pytesseract.tesseract_cmd = '/Users/nanadhirah/AppData/Local/miniconda3/envs/apps/Library/bin/tesseract.exe'

In [14]:
# Function to check if PDF is already in any of the CSV files in batches_folder
def get_existing_pdf_filenames(BATCHES_FOLDER):
    existing_pdfs = set()
    
    # Iterate through all CSV files in batches_folder
    for filename in os.listdir(BATCHES_FOLDER):
        if filename.endswith('.csv'):
            csv_path = os.path.join(BATCHES_FOLDER, filename)
            df = pd.read_csv(csv_path)
            
            # Normalize column names to lowercase
            #df.columns = [col.lower() for col in df.columns]
            
            # Check if 'pdf file' column exists (case-insensitive)
            if 'PDF File' in df.columns:
                pdf_files = df['PDF File'].dropna().unique()
                existing_pdfs.update(pdf_files)
    
    return existing_pdfs

# Function to extract new PDFs that are not in the CSV files
def extract_new_pdfs(PDF_FOLDER, BATCHES_FOLDER):
    existing_pdfs = get_existing_pdf_filenames(BATCHES_FOLDER)
    
    # List all PDFs in the pdf_folder
    new_pdfs = []
    for filename in os.listdir(PDF_FOLDER):
        if filename.endswith('.pdf') and filename not in existing_pdfs:
            new_pdfs.append(filename)
    
    # If there are new PDFs, return the list
    if new_pdfs:
        print(f"Found {len(new_pdfs)} new PDFs to extract.")
    else:
        print("No new PDFs found.")
    
    return new_pdfs

In [15]:
#check
existing_extracted_pdfs = get_existing_pdf_filenames(BATCHES_FOLDER)
print("Total existing PDFs:", len(existing_extracted_pdfs))
print("Existing PDFs:", existing_extracted_pdfs)

Total existing PDFs: 14537
Existing PDFs: {'4101_UnknownActNo_PDF.pdf', '8310_19_PDF.pdf', '4725_306_PDF.pdf', '6862_342_PDF.pdf', '4632_139_PDF.pdf', '10138_19_PDF.pdf', '14966_92_PDF.pdf', '12557_634_PDF.pdf', '5274_723_PDF.pdf', '5273_723_PDF.pdf', '14648_115_PDF.pdf', '10115_19_PDF.pdf', '12371_UnknownActNo_PDF.pdf', '9339_358_PDF.pdf', '1218_UnknownActNo_BM.pdf', '6900_53_PDF.pdf', '12911_206_PDF.pdf', '10899_A1578_PDF.pdf', '11927_537_PDF.pdf', '14893_50_PDF.pdf', '12387_657_PDF.pdf', '6016_UnknownActNo_PDF.pdf', '6624_53_PDF.pdf', '8121_611_PDF.pdf', '10206_19_PDF.pdf', '9172_5_PDF.pdf', '13787_613_PDF.pdf', '1392_149_PDF.pdf', '7646_235_PDF.pdf', '4821_671_PDF.pdf', '11338_19_PDF.pdf', '10784_613_PDF.pdf', '12162_533_PDF.pdf', '4995_348_PDF.pdf', '5464_613_PDF.pdf', '13480_308_PDF.pdf', '1349_177_PDF.pdf', '13490_UnknownActNo_PDF.pdf', '13976_283_PDF.pdf', '2264_806_PDF.pdf', '2887_342_PDF.pdf', '13114_19_PDF.pdf', '12262_613_PDF.pdf', '3069_UnknownActNo_PDF.pdf', '7597_235_PDF

In [16]:
#check
new_pdfs_to_extract = extract_new_pdfs(PDF_FOLDER,BATCHES_FOLDER)

No new PDFs found.


In [6]:
def extract_text_with_tika(pdf_path):
    try:
        return parser.from_file(pdf_path)['content'] or ""
    except:
        return ""

def extract_images_and_ocr(pdf_path):
    extracted_text = ""
    try:
        pdf_document = fitz.open(pdf_path)
        for page in pdf_document:
            # Convert the page to an image
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", (int(pix.width), int(pix.height)), pix.samples)
            
            # Perform OCR on the image
            extracted_text += pytesseract.image_to_string(img) + "\\n"
        pdf_document.close()
    except Exception as e:
        print(f"OCR extraction failed for {pdf_path}: {e}")
    return extracted_text

def extract_text_from_pdf(pdf_path):
    tika_text = extract_text_with_tika(pdf_path).strip()
    ocr_text = extract_images_and_ocr(pdf_path).strip()
    method = "Tika + OCR" if tika_text and ocr_text else "Tika" if tika_text else "OCR"
    return tika_text, ocr_text, method 

In [7]:
def get_last_batch_index(output_folder):
    """
    Checks the last batch index in the specified folder.
    Returns the next available batch index.
    """
    last_index = 0

    # Iterate over files in the output folder
    for filename in os.listdir(output_folder):
        if filename.endswith('.csv'):
            # Extract the batch number using regex
            match = re.match(r'batch_(\d+)\.csv', filename)
            if match:
                batch_number = int(match.group(1))
                if batch_number > last_index:
                    last_index = batch_number

    # Return the next available batch index
    return last_index + 1

In [8]:
#check
existing_batches_csv = get_last_batch_index(BATCHES_FOLDER)
print("Next batch of csv:", existing_batches_csv)

Next batch of csv: 1129


In [9]:
#modified
def process_pdfs(pdf_folder, new_pdfs, batch_size=10, output_folder=BATCHES_FOLDER):
    """
    Process PDFs in batches and save each batch as a CSV.
    Continues batch numbering from the last existing batch.
    """
    extracted_data = []
    print(f"Found {len(new_pdfs)} new PDFs. Processing them in batches...")

    start_time = time.time()

    # Get the starting batch index
    current_batch_index = get_last_batch_index(output_folder)
    print(f"Starting from batch index: {current_batch_index}")

    # Split the list of new PDFs into batches
    batches = [new_pdfs[i:i + batch_size] for i in range(0, len(new_pdfs), batch_size)]
    
    with ThreadPoolExecutor() as executor:
        # Process each batch in parallel
        for batch_index, batch in enumerate(batches):
            # Create a list of PDF paths for the batch
            pdf_paths = [os.path.join(pdf_folder, pdf_file) for pdf_file in batch]
            
            # Submit each PDF to the executor
            future_to_pdf = {executor.submit(process_pdf, pdf_path): pdf_path for pdf_path in pdf_paths}
            for future in as_completed(future_to_pdf):
                result = future.result()
                extracted_data.append(result)

            # After processing each batch, save it to a CSV
            if output_folder:
                batch_df = pd.DataFrame(extracted_data)
                batch_filename = f"batch_{current_batch_index}.csv"
                batch_filepath = os.path.join(output_folder, batch_filename)
                batch_df.to_csv(batch_filepath, index=False)
                print(f"Batch {current_batch_index} saved to {batch_filepath}")
                
                # Increment batch index for the next batch
                current_batch_index += 1
                # Reset extracted_data for the next batch
                extracted_data = []

    # Return the combined DataFrame containing all extracted data
    all_data_df = pd.DataFrame(extracted_data)
    return all_data_df

def process_pdf(pdf_path):
    """Process a single PDF file and extract text."""
    try:
        print(f"Processing: {pdf_path}")
        tika_text, ocr_text, method = extract_text_from_pdf(pdf_path)

        # Combine Tika and OCR text
        full_text = (tika_text or "") + "\n" + (ocr_text or "")
        text_length = len(full_text.strip())

        # Return extracted data as a dictionary
        return {
            'PDF File': os.path.basename(pdf_path),
            'Document_Text': full_text,
            'Text_Len': text_length,
            'Text_Ext_Method': method
        }
    except Exception as e:
        # Capture error details and return error information
        error_message = f"Error processing {pdf_path}: {str(e)}\n{traceback.format_exc()}"
        print(error_message)
        return {
            'Document_name': os.path.basename(pdf_path),
            'Document': pdf_path,
            'Error': error_message
        }

In [10]:
def main():
    new_pdfs = extract_new_pdfs(PDF_FOLDER, BATCHES_FOLDER)

    # Step 2: If new PDFs exist, process them
    if new_pdfs:
        process_pdfs(PDF_FOLDER, new_pdfs, batch_size=10, output_folder=BATCHES_FOLDER)
    else:
        print("No new PDFs to process.")

In [11]:
main()

Found 3265 new PDFs to extract.
Found 3265 new PDFs. Processing them in batches...
Starting from batch index: 1129
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6775_333_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6776_333_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6777_333_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6778_376_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6779_64_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6780_720_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6781_151_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6782_122_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6783_723_PDF.pdf
Processing: C:/Users/nanadhirah

2024-12-18 08:14:01,970 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:01,995 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:02,022 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:02,023 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:02,050 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:02,116 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:02,256 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:02,118 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:02,256 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:02,116 [ThreadPoolEx] [WARNI]  Failed to see startup log message; retrying...
2024-12-18 08:14:06,997 [ThreadPoolEx] [WARNI]  Fa

Batch 1129 saved to C:/Users/nanadhirah/Desktop/important/legislation/batches_csv\batch_1129.csv
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6785_235_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6786_333_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6787_376_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6788_376_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6789_306_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6790_488_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6791_645_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6792_715_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6793_209_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/importan

2024-12-18 08:38:08,422 [ThreadPoolEx] [WARNI]  Tika server returned status: 500
2024-12-18 08:38:08,455 [ThreadPoolEx] [WARNI]  Tika server returned status: 500


Batch 1150 saved to C:/Users/nanadhirah/Desktop/important/legislation/batches_csv\batch_1150.csv
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\69_A1670_EN.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6_A1733_BM.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\6_A1733_EN.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7000_45_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7002_342_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7003_469_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7008_469_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7009_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7011_45_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/import

2024-12-18 08:39:04,114 [ThreadPoolEx] [WARNI]  Tika server returned status: 500


Batch 1152 saved to C:/Users/nanadhirah/Desktop/important/legislation/batches_csv\batch_1152.csv
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7024_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7025_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7026_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7028_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7029_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7030_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7031_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7032_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislati

2024-12-18 08:39:30,960 [ThreadPoolEx] [WARNI]  Tika server returned status: 500


Batch 1153 saved to C:/Users/nanadhirah/Desktop/important/legislation/batches_csv\batch_1153.csv
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7035_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7036_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7037_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7038_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7039_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7040_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7041_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7042_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislati

2024-12-18 08:42:54,544 [ThreadPoolEx] [WARNI]  Tika server returned status: 500


Batch 1157 saved to C:/Users/nanadhirah/Desktop/important/legislation/batches_csv\batch_1157.csv
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7082_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7083_94_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7084_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7085_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7086_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7087_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7088_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\7090_UnknownActNo_PDF.pdf
Processing: C:/Users/nanadhirah/Desktop/important/legislation/legislation_pdf\709