In [3]:
import fitz  # PyMuPDF
import pandas as pd
from tabula import read_pdf
import re

In [4]:
# Insert your PDF path here, for example: pdf_path = "data/multiple_test.pdf".
# I recommend keeping your input files in a folder called "data" and placing your PDF(s) there.
# If you have forked the repo and plan to develop it further, remember to add that folder to .gitignore.
pdf_path = "path/to/pdf/file.pdf"

In [5]:
def extract_specific_columns_from_page(pdf_path, page_number,
                                       columns=('your column name', 'another column name')):
    """Return the requested columns from a table found on one PDF page.

    The page is parsed with tabula's ``read_pdf`` in stream mode with
    ``multiple_tables=True``. Every table detected on the page is checked in
    order, and the first one that contains *all* requested columns is used.

    Args:
        pdf_path: Path of the PDF file to read.
        page_number: Page to parse; it is passed to ``read_pdf`` as ``pages=str(page_number)``.
        columns: Column names to extract. Defaults to the two placeholder
            names used throughout this notebook.

    Returns:
        A DataFrame restricted to ``columns`` (in that order), or ``None`` when
        the page has no tables, no table has all requested columns, or parsing
        raised an exception. A short message is printed in each ``None`` case.
    """
    wanted = list(columns)
    try:
        tables = read_pdf(pdf_path, pages=str(page_number), multiple_tables=True, stream=True)
        if not tables:
            print(f"No tables found on page {page_number}.")
            return None

        # A page may yield several tables; do not assume the one we need is first.
        for table in tables:
            if all(name in table.columns for name in wanted):
                return table[wanted]

        print(f"Specified columns not found on page {page_number}.")
        return None
    except Exception as e:
        # Deliberately best-effort: a page that cannot be parsed is reported
        # and skipped so the remaining pages can still be processed.
        print(f"Error on page {page_number}: {e}")
        return None


In [None]:

def extract_text_info_from_page(pdf_path, page_number, info_keys=None):
    """Extract ``key: value`` pairs from the text of one PDF page.

    For each key, the page text is searched for ``<key>:`` followed by optional
    whitespace, and the rest of that line is captured as the value. Keys are
    matched literally (regex metacharacters in a key are escaped).

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        page_number: 1-based page number; converted to PyMuPDF's 0-based index.
        info_keys: Labels to look for. Defaults to the placeholder keys used in
            this notebook.

    Returns:
        A single-row DataFrame with one column per key, in the order given.
        Keys that are not found on the page hold ``None`` so that every value
        stays attached to its own key and the columns are the same for every page.
    """
    if info_keys is None:
        info_keys = ['key value in text', 'key value in text1', 'key value in text2',
                     'key value in text3', 'key value in text4']

    doc = fitz.open(pdf_path)
    try:
        page = doc.load_page(page_number - 1)  # Pages are 0-indexed in PyMuPDF
        text = page.get_text()
    finally:
        # Release the file handle even if the page cannot be loaded or read.
        doc.close()

    extracted_info = {}
    for key in info_keys:
        # Search once per key and store the result under that key; filtering
        # the misses out first and zipping afterwards would shift later values
        # onto the wrong keys.
        match = re.search(rf"{re.escape(key)}:\s*(.*)", text)
        extracted_info[key] = match.group(1) if match else None

    return pd.DataFrame([extracted_info])

In [6]:
def transpose_table_clean(df):
    """Pivot a table so the 'your column name' values become the column headers.

    Args:
        df: DataFrame expected to contain a 'your column name' column.

    Returns:
        The transposed DataFrame with the index and column axis names cleared,
        or ``None`` (after printing a message) when the column is absent.
    """
    # Guard clause: nothing to pivot on without the label column.
    if 'your column name' not in df.columns:
        print("The 'your column name' column was not found.")
        return None

    # Use the label column as the index, then flip rows and columns.
    flipped = df.set_index('your column name').transpose()

    # Clear the leftover axis names so they do not show up as headers.
    flipped.index.name = None
    flipped.columns.name = None
    return flipped


In [7]:
def process_pdf_pages(pdf_path):
    """Extract table and text data from every page of a PDF and combine them.

    For each page, the selected table columns are extracted and transposed,
    the key/value text information is extracted, and the two results are
    concatenated side by side. Pages for which either part is missing are
    skipped with a message.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A DataFrame with the combined results of all usable pages stacked
        vertically (index reset), or ``None`` if no page produced data.
    """
    # The document is opened here only to count pages; the per-page helpers
    # take the path and do their own reading, so the handle is released
    # immediately rather than being held (and possibly leaked) across the loop.
    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
    finally:
        doc.close()

    combined_data_list = []

    # Iterate over each page in the PDF (page numbers are 1-based here)
    for page_number in range(1, page_count + 1):
        print(f"Processing page {page_number} of {pdf_path}...")

        # Reset for every page: without this, a page with no table would either
        # hit an unbound variable (first page) or silently reuse the table of
        # an earlier page.
        df_transposed = None

        # Step 1: Extract specific columns for this page
        table = extract_specific_columns_from_page(pdf_path, page_number)
        if table is not None:
            df_transposed = transpose_table_clean(table)

        # Step 2: Extract text information for this page
        df_info = extract_text_info_from_page(pdf_path, page_number)

        # Step 3: Combine the DataFrames if both are defined
        if df_transposed is not None and df_info is not None:
            # Drop both indexes so the rows line up positionally in the concat.
            df_transposed_reset = df_transposed.reset_index(drop=True)
            df_info_reset = df_info.reset_index(drop=True)

            combined_df = pd.concat([df_info_reset, df_transposed_reset], axis=1)
            combined_data_list.append(combined_df)
        else:
            print(f"Skipping page {page_number} due to missing data.")

    if combined_data_list:
        return pd.concat(combined_data_list, ignore_index=True)
    return None


In [None]:
# Run the extraction over every page of the configured PDF.
all_pages_combined_data_list = []
combined_df = process_pdf_pages(pdf_path)

# Keep the result only when the PDF actually yielded data.
if combined_df is None:
    print(f"No data extracted from {pdf_path}.")
else:
    all_pages_combined_data_list.append(combined_df)

if not all_pages_combined_data_list:
    print("No data was processed from the PDF.")
else:
    # Merge everything that was collected into one frame with a fresh index.
    final_combined_df = pd.concat(all_pages_combined_data_list, ignore_index=True)

    # Show the full result as plain text, without the index column.
    print(final_combined_df.to_string(index=False))

    # Persist the result next to the notebook.
    final_combined_df.to_csv('final_combined_results.csv', index=False)