In [51]:
# Relative path of the PDF report that is passed to extract_tables_from_pdf() in the next cell.
pdf_file = "reports/goog-10-k-2023-25-55-3.pdf"

In [52]:
import os
import fitz  # PyMuPDF
import pdfplumber
import camelot
import pandas as pd

def extract_tables_from_pdf(pdf_path, output_folder="extracted_tables"):
    """Extract every table found in a PDF, save each one as CSV, and return them.

    The PDF is first probed with PyMuPDF: if any page yields non-blank text the
    file is treated as text-based and parsed page by page with pdfplumber;
    otherwise Camelot (``flavor='stream'``) is tried on all pages.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives one CSV per table; it is created
            if it does not exist. pdfplumber tables are written as
            ``table_page<P>_<T>.csv`` and Camelot tables as
            ``camelot_table_<N>.csv`` (all numbers are 1-based).

    Returns:
        list[pandas.DataFrame]: The extracted tables in the order they were
        found. For pdfplumber tables the first extracted row is used as the
        column header. The list is empty when nothing was detected.

    Notes:
        Errors raised inside the Camelot branch are caught and printed (best
        effort). Errors from opening the file, from pdfplumber, or from
        writing CSVs propagate to the caller.
    """
    os.makedirs(output_folder, exist_ok=True)
    all_tables = []

    # Step 1: Detect if text exists (text-based PDF).
    # PyMuPDF is only needed for this probe, so the handle is released right
    # away; the context manager also closes it when the probe raises.
    with fitz.open(pdf_path) as pdf_doc:
        is_text_based = any(page.get_text().strip() for page in pdf_doc)

    if is_text_based:
        print("PDF is text-based. Using pdfplumber...")
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                for table_idx, table in enumerate(page.extract_tables(), start=1):
                    if not table:
                        # An empty table has no header row to promote; skipping
                        # it avoids an IndexError that would abort the whole run.
                        continue
                    # First extracted row becomes the header, the rest the data.
                    df = pd.DataFrame(table[1:], columns=table[0])
                    csv_path = os.path.join(
                        output_folder, f"table_page{page_num}_{table_idx}.csv"
                    )
                    df.to_csv(csv_path, index=False)
                    all_tables.append(df)
    else:
        print("PDF is image-based or graphical. Trying Camelot...")
        # NOTE(review): Camelot's documentation states it only supports
        # text-based PDFs, so this fallback is unlikely to find tables in a
        # scanned document -- confirm, and consider an OCR step instead.
        try:
            tables = camelot.read_pdf(pdf_path, pages='all', flavor='stream')  # or 'lattice'
            for idx, table in enumerate(tables, start=1):
                df = table.df
                csv_path = os.path.join(output_folder, f"camelot_table_{idx}.csv")
                df.to_csv(csv_path, index=False)
                all_tables.append(df)
        except Exception as e:
            # Deliberate best effort: report the failure and fall through to
            # the "no tables" message instead of raising.
            print("Camelot failed:", e)

    if not all_tables:
        print("No tables detected. Try OCR-based extraction.")

    return all_tables


# Example usage: run the extractor on the report chosen in the previous cell.
tables = extract_tables_from_pdf(pdf_file)

# Print summary: total count, then the first rows of every extracted table.
print(f"Total tables extracted: {len(tables)}")
for idx, df in enumerate(tables):
    print(f"\n--- Table {idx+1} ---")
    # to_string() keeps every column on a single line per row; a bare
    # print(df.head()) wraps wide tables into backslash-continued chunks.
    print(df.head().to_string())


PDF is text-based. Using pdfplumber...
Total tables extracted: 2

--- Table 1 ---
  October 1 - 31 9,923 38,687                            None $ 134.66  \
0             November 1 - 30  9,197 28,198 $ 134.53 $ 135.16     None   
1             December 1 - 31           7,502 24,760 $ 135.76     None   
2         Total 26,622 91,645                            None     None   

   $ 135.65  48,610 $ 45,736  
0      None             None  
1  $ 136.37  32,262 $ 36,347  
2      None             None  

--- Table 2 ---
Empty DataFrame
Columns: [Note 11, of]
Index: []
