In [7]:
import camelot
import pandas as pd
import os
from typing import List

In [8]:
def is_numeric(value):
    """
    Check if a value is numeric (integer or float).

    A value counts as numeric when ``float(value)`` succeeds, so numeric
    strings such as ``"3.14"`` or ``" 42 "`` qualify as well as real numbers.
    Note that ``float`` also accepts the strings ``"nan"`` and ``"inf"``.

    Parameters:
        value: Any object, typically a table cell (string) or None.

    Returns:
        bool: True if the value can be converted to float, False otherwise.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: text that is not a number, e.g. "abc" or "".
        # TypeError: objects float() cannot accept at all, e.g. None or a list.
        return False
    return True

def extract_numerical_tables_from_pdfs(pdf_folder: str, output_folder: str) -> None:
    """
    Extracts tables from all PDF files in a folder, filters numerical values,
    and saves them as CSV files.

    Every table camelot finds is written to
    ``<output_folder>/<pdf name>_table_<n>_numerical.csv``. Cells rejected by
    ``is_numeric`` are replaced with None, and rows/columns left completely
    empty are dropped. Progress and errors are reported with ``print``; a PDF
    that cannot be read is skipped instead of aborting the whole run.

    Parameters:
        pdf_folder (str): Path to the folder containing the PDF files.
        output_folder (str): Folder to save the extracted CSV files.

    Returns:
        None
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_folder, exist_ok=True)

    # Get all PDF files in the folder (sorted so runs are reproducible)
    try:
        entries = sorted(os.listdir(pdf_folder))
    except OSError as e:
        print(f"Error accessing folder: {e}")
        return

    candidates = (
        os.path.join(pdf_folder, name)
        for name in entries
        if name.lower().endswith('.pdf')
    )
    # Skip directories that merely happen to be named "*.pdf"
    pdf_files = [path for path in candidates if os.path.isfile(path)]
    if not pdf_files:
        print(f"No PDF files found in the folder: {pdf_folder}")
        return

    # Process each PDF file
    for pdf_path in pdf_files:
        print(f"Processing PDF: {pdf_path}")
        try:
            tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')
        except Exception as e:
            # Per-file boundary: report the failure and move on to the next PDF
            print(f"Error reading PDF {pdf_path}: {e}")
            continue

        # The PDF's base name (without extension) prefixes every CSV it yields
        pdf_filename = os.path.splitext(os.path.basename(pdf_path))[0]

        # Save each table as a CSV file (with only numerical values)
        for i, table in enumerate(tables):
            df = table.df

            # Keep numeric cells as they are; everything else becomes None
            numerical_df = pd.DataFrame(
                [[cell if is_numeric(cell) else None for cell in row] for row in df.values],
                columns=df.columns,
            )

            # Drop rows, then columns, that contain no numeric value at all
            numerical_df = numerical_df.dropna(axis=0, how='all').dropna(axis=1, how='all')

            csv_path = os.path.join(output_folder, f'{pdf_filename}_table_{i + 1}_numerical.csv')
            numerical_df.to_csv(csv_path, index=False)
            print(f"  Numerical Table {i + 1} saved to {csv_path}")

        print(f"  Extracted {len(tables)} tables from {pdf_path}")

# Example usage: PDFs are read from `pdf_folder` and the resulting CSV files
# are written to `output_folder`.
pdf_folder, output_folder = 'papers/', 'extracted_numerical_tables'

if __name__ == "__main__":
    extract_numerical_tables_from_pdfs(
        pdf_folder=pdf_folder,
        output_folder=output_folder,
    )

Processing PDF: papers/paper.pdf
  Numerical Table 1 saved to extracted_numerical_tables/paper_table_1_numerical.csv
  Numerical Table 2 saved to extracted_numerical_tables/paper_table_2_numerical.csv
  Numerical Table 3 saved to extracted_numerical_tables/paper_table_3_numerical.csv
  Numerical Table 4 saved to extracted_numerical_tables/paper_table_4_numerical.csv
  Numerical Table 5 saved to extracted_numerical_tables/paper_table_5_numerical.csv
  Numerical Table 6 saved to extracted_numerical_tables/paper_table_6_numerical.csv
  Numerical Table 7 saved to extracted_numerical_tables/paper_table_7_numerical.csv
  Numerical Table 8 saved to extracted_numerical_tables/paper_table_8_numerical.csv
  Numerical Table 9 saved to extracted_numerical_tables/paper_table_9_numerical.csv
  Numerical Table 10 saved to extracted_numerical_tables/paper_table_10_numerical.csv
  Numerical Table 11 saved to extracted_numerical_tables/paper_table_11_numerical.csv
  Extracted 11 tables from papers/paper.pdf