In [None]:
import logging
import time
from pathlib import Path
from IPython.display import display

import pandas as pd
from docling.document_converter import DocumentConverter

# Configure root logging at INFO level, then obtain this module's logger.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# Single converter instance, reused for every statement that is processed.
doc_converter = DocumentConverter()

# Directory that receives the exported CSV files.
output_dir = Path("scratch")

def is_float(x: object) -> bool:
    """Return True if *x* can be parsed as a float.

    Strings may contain thousands separators (``","``), which are removed
    before parsing, so ``"1,234.56"`` is accepted.  Values that are not
    strings are handed to ``float()`` unchanged; anything ``float()``
    rejects (``None``, empty strings, arbitrary text, ...) yields False
    instead of raising.

    Args:
        x: The cell value to test, normally a string taken from a table.

    Returns:
        True when the value parses as a float, False otherwise.
    """
    if isinstance(x, str):
        x = x.replace(",", "")
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) must keep propagating.
        return False
    return True
    
def _transaction_mask(df, is_credit_card):
    """Return a boolean Series marking which rows of *df* to keep.

    *df* must already have its columns renamed to 0..n-1.

    Credit-card statements: a table is kept only when it has exactly four
    columns.  The column count is a property of the whole table, so either
    every row is kept or every row is dropped.

    Other (deposit) statements: a row is kept when column 0 is non-empty
    and at least one of columns 2 and 3 (the debit/credit amounts) parses
    as a float.  Tables with fewer than four columns are dropped entirely
    because they cannot contain those amount columns.

    TODO: rows where both columns 2 and 3 are populated (e.g. "Total"
    rows) are not filtered out yet.
    """
    if is_credit_card:
        return pd.Series(len(df.columns) == 4, index=df.index, dtype=bool)
    if len(df.columns) < 4:
        return pd.Series(False, index=df.index, dtype=bool)
    flags = [
        bool(first) and (is_float(amount_a) or is_float(amount_b))
        for first, amount_a, amount_b in zip(df[0], df[2], df[3])
    ]
    return pd.Series(flags, index=df.index, dtype=bool)


# Convert every statement in ./statements and export its transaction rows
# to one CSV per document.  Sorting makes the processing order deterministic.
for file in sorted(Path("./statements").iterdir()):

    # Sub-directories cannot be converted; only regular files are processed.
    if not file.is_file():
        continue

    # To process a single statement while debugging, filter on file.name here.

    # A file whose name contains "deposit" is handled as a deposit-account
    # statement; every other file is handled as a credit-card statement.
    isCreditCardStmt = "deposit" not in file.name

    start_time = time.perf_counter()

    conv_res = doc_converter.convert(str(file))
    doc_filename = conv_res.input.file.stem

    # Reset per file so the notebook-level variable never shows data that
    # belongs to a previously processed statement.
    mainframe = pd.DataFrame()
    frames = []

    # Export tables
    # Skip the first and last table (they tend to be non-transactional);
    # table_ix is the table's position in conv_res.document.tables.
    for table_ix, table in enumerate(conv_res.document.tables[1:-1], start=1):
        df: pd.DataFrame = table.export_to_dataframe(doc=conv_res.document)

        # normalize column headers
        df.columns = range(len(df.columns))

        # Select rows with a mask instead of dropping them while iterating
        # over the frame that is being modified.
        keep = _transaction_mask(df, isCreditCardStmt).to_numpy()
        dropped_rows = df.loc[~keep]
        df = df.loc[keep]

        if len(dropped_rows):
            _log.info(
                "Table %d of %s: dropped %d non-transaction row(s)",
                table_ix,
                file.name,
                len(dropped_rows),
            )

        display(df)

        # Tables that ended up empty are left out of the export so their
        # (possibly wider) column layout does not add empty columns.
        if not df.empty:
            frames.append(df)

    # pd.concat raises ValueError on an empty list, so skip such documents.
    if not frames:
        _log.warning("No transaction rows found in %s; skipping CSV export.", file)
        continue

    mainframe = pd.concat(frames, ignore_index=True)

    # Save the table as CSV
    output_dir.mkdir(parents=True, exist_ok=True)
    element_csv_filename = output_dir / f"{doc_filename}.csv"
    _log.info("Saving CSV table to %s", element_csv_filename)
    mainframe.to_csv(element_csv_filename, index=False)

    elapsed = time.perf_counter() - start_time

    _log.info("Document converted and tables exported in %.2f seconds.", elapsed)


In [None]:
# mainframe