In [16]:
import logging
import time
from pathlib import Path
from IPython.display import display

import pandas as pd
from docling.document_converter import DocumentConverter

# Configure root logging at INFO first, then obtain this module's logger.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

# One converter instance, shared by every statement converted below.
doc_converter = DocumentConverter()

# Directory that receives the exported CSV files.
output_dir = Path("scratch")


def is_float(x):
    """Return True if *x* can be parsed as a float, False otherwise.

    Strings have thousands separators (",") removed before parsing, so
    "1,234.56" is accepted. Values that ``float()`` rejects -- empty strings,
    free text, parenthesised amounts such as "(179.92)", ``None`` -- yield
    False instead of raising.

    Args:
        x: The value to test; typically a table cell's text.

    Returns:
        bool: Whether ``float()`` accepts the (cleaned) value.
    """
    # Only strings carry thousands separators; other values go to float() as-is.
    candidate = x.replace(",", "") if isinstance(x, str) else x
    try:
        float(candidate)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable text;
        # OverflowError: an int too large to be represented as a float.
        return False
    return True


# Slice bounds (start, stop) applied to each bank's list of extracted tables.
# The leading and trailing tables tend to be non-transactional.
TABLE_RANGES = {"UOB": (1, -1), "CITI": (3, -3)}

# Only these statements are processed. Matching on the file name keeps the
# filter independent of the platform's path separator.
SELECTED_STATEMENTS = {"citi_dec_2025.pdf", "citi_nov_2025.pdf"}

# Convert each selected statement PDF, display its transaction tables, and
# write them out as one CSV per statement under ``output_dir``.
for file in Path("./statements").iterdir():
    if file.name not in SELECTED_STATEMENTS:
        continue

    # The bank is the first "_"-separated token of the file stem, upper-cased.
    bank = file.stem.split("_")[0].upper()
    # Any path without "deposit" in it is treated as a credit-card statement.
    isCreditCardStmt = "deposit" not in str(file)

    # Skip banks without a configured range instead of failing with a
    # NameError or silently reusing the previous file's range.
    table_range = TABLE_RANGES.get(bank)
    if table_range is None:
        _log.warning("No table range configured for bank %r; skipping %s", bank, file)
        continue

    start_time = time.time()

    conv_res = doc_converter.convert(str(file))
    doc_filename = conv_res.input.file.stem

    frames = []
    dropped_rows = []  # rows removed across all tables of this document

    # Export tables
    for table in conv_res.document.tables[table_range[0]:table_range[1]]:
        df: pd.DataFrame = table.export_to_dataframe(doc=conv_res.document)

        # normalize column headers to 0..n-1 so cells can be addressed by position
        df.columns = range(len(df.columns))

        # Collect the labels first and drop once afterwards, so the frame is
        # never mutated while iterrows() is still walking it.
        drop_labels = []
        for index, row in df.iterrows():
            if bank == "UOB" and isCreditCardStmt:
                # uob credit card statements: keep only rows of 4-column tables
                should_drop = len(row) != 4
            elif bank == "UOB":
                # uob non-creditcard statements: drop rows whose column 0 is
                # empty, or whose columns 2 and 3 both fail to parse as floats
                should_drop = len(row[0]) == 0 or (
                    not is_float(row[2]) and not is_float(row[3])
                )
            else:
                # no row filtering is defined for other banks
                should_drop = False

            if should_drop:
                drop_labels.append(index)
                dropped_rows.append(row)

        if drop_labels:
            df = df.drop(index=drop_labels)

        display(df)
        frames.append(df)

    # TODO: show dropped rows
    _log.debug("Dropped %d rows from %s", len(dropped_rows), doc_filename)

    # pd.concat raises ValueError on an empty list, so skip such documents.
    if not frames:
        _log.warning("No tables found in %s; skipping CSV export", file)
        continue

    mainframe = pd.concat(frames, ignore_index=True)

    # Save the table as CSV
    output_dir.mkdir(parents=True, exist_ok=True)
    element_csv_filename = output_dir / f"{doc_filename}.csv"
    _log.info("Saving CSV table to %s", element_csv_filename)
    mainframe.to_csv(element_csv_filename, index=False)

    elapsed = time.time() - start_time

    _log.info("Document converted and tables exported in %.2f seconds.", elapsed)


2026-01-25 00:23:50,336 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-25 00:23:50,338 - INFO - Going to convert document batch...
2026-01-25 00:23:50,339 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-01-25 00:23:50,340 - INFO - Auto OCR model selected ocrmac.
2026-01-25 00:23:50,341 - INFO - Accelerator device: 'mps'
2026-01-25 00:23:52,887 - INFO - Accelerator device: 'mps'
2026-01-25 00:23:53,747 - INFO - Processing document citi_dec_2025.pdf
2026-01-25 00:24:16,678 - INFO - Finished converting document citi_dec_2025.pdf in 26.34 sec.


Unnamed: 0,0,1,2
0,,TRANSACTIONS FOR CITI REWARDS WORLD MASTERCARD...,
1,,BALANCE PREVIOUS STATEMENT,179.92
2,26 NOV,MONEYSEND ONG AI WEI SG,(179.92)
3,,SUB-TOTAL:,0.00
4,CITI REWARDS WORLDMASTERCARD5425 5030 0422 667...,CITI REWARDS WORLDMASTERCARD5425 5030 0422 667...,
5,05 NOV,ShopBack llaollao Ta Singapore SG,7.00
6,09 NOV,NTUC FairPrice App Pay SINGAPORE SG,10.17
7,10 NOV,NETFLIX.COM SINGAPORE SG,22.98
8,11 NOV,ShopBack Love And Brav Singapore SG,39.00
9,12 NOV,ShopBack Old Chang K Singapore SG,2.00


Unnamed: 0,0,1,2
0,DATE,DESCRIPTION,AMOUNT (SGD)
1,15 NOV,UBER *TRIP HELP.UBER.C Vorden NL FOREIGN AMOUN...,54.17
2,15 NOV,AMAZE* ZIMCARRYCO SINGAPORE SG,17.46
3,15 NOV,AMAZE* HANWHA CONNE SINGAPORE SG,12.86
4,15 NOV,AMAZE* SSIYUHWENGKY SINGAPORE SG,6.16
5,15 NOV,"KORAIL 0051 DAEJEON KR FOREIGN AMOUNT WON48,90...",45.44
6,16 NOV,AMAZE* TRIP.COM SINGAPORE SG,512.38
7,17 NOV,AMAZE* SEUKECHYEOSE SINGAPORE SG,118.54
8,17 NOV,"AMAZE* CO., LTD. GA SINGAPORE SG",14.24
9,18 NOV,AMAZE* HANEULMOKJAN SINGAPORE SG,9.19


Unnamed: 0,0,1,2
0,22 NOV,AMAZE* DEOJEONG UKL SINGAPORE SG,5.09
1,22 NOV,AMAZE* CJ OLIVE YOU SINGAPORE SG,1.82
2,22 NOV,AMAZE* SSIYU JONGRO SINGAPORE SG,13.20
3,22 NOV,AMAZE* 7-ELEVEN SINGAPORE SG,2.09
4,22 NOV,CCY CONVERSION FEE SGD 14.98,(0.14)
5,23 NOV,UBER *TRIP HELP.UBER.C Vorden NL FOREIGN AMOUN...,48.33
6,23 NOV,KKDAY SINGAPORE SG,4.13
7,23 NOV,UBER *TRIP HELP.UBER.C Vorden NL FOREIGN AMOUN...,50.36
8,24 NOV,AMAZE* KOCOWA TRIAL SINGAPORE SG,10.69
9,25 NOV,VIVIFI SINGAPORE SG,10.00


2026-01-25 00:24:16,701 - INFO - Saving CSV table to scratch/citi_dec_2025.csv
2026-01-25 00:24:16,703 - INFO - Document converted and tables exported in 26.37 seconds.
2026-01-25 00:24:16,706 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-01-25 00:24:16,708 - INFO - Going to convert document batch...
2026-01-25 00:24:16,708 - INFO - Processing document citi_nov_2025.pdf
2026-01-25 00:24:34,359 - INFO - Finished converting document citi_nov_2025.pdf in 17.66 sec.


Unnamed: 0,0,1,2
0,,TRANSACTIONS FOR CITI REWARDS WORLD MASTERCARD...,
1,,BALANCE PREVIOUS STATEMENT,862.94
2,28 OCT,MONEYSEND ONG AI WEI SG,(862.94)
3,,SUB-TOTAL:,0.00
4,CITI REWARDS WORLDMASTERCARD5425 5030 0422 667...,CITI REWARDS WORLDMASTERCARD5425 5030 0422 667...,
5,03 OCT,NTUC FairPrice App Pay SINGAPORE SG,29.40
6,06 OCT,Grab* A-8FXX4CGG2DOJAV Singapore SG,9.00
7,06 OCT,Gopay-Gojek SINGAPORE SG,11.90
8,06 OCT,SHOPEE SINGAPORE MP SINGAPORE SG,4.59
9,08 OCT,NTUC FairPrice App Pay SINGAPORE SG,1.43


Unnamed: 0,0,1,2
0,12 OCT,ShopBack CHICHA San Singapore SG,5.8
1,13 OCT,ShopBack CHICHA San Singapore SG,4.8
2,17 OCT,ShopBack llaollao Ta Singapore SG,7.0
3,18 OCT,Gopay-Gojek SINGAPORE SG,17.0
4,25 OCT,VIVIFI SINGAPORE SG,10.0
5,25 OCT,FAIRPRICE GROUP HAWKER SINGAPORE SG,1.8
6,25 OCT,FAIRPRICE GROUP HAWKER SINGAPORE SG,1.8
7,26 OCT,ShopBack Singapore SG,10.95
8,26 OCT,APPLE.COM/BILL CORK IE XXXX-XXXX-XXXX-0657,5.98
9,28 OCT,CCY CONVERSION FEE SGD 5.98,0.05


Unnamed: 0,0,1
0,S$O to <S$50,Current Balance
1,>=S$50,The minor caymand asom le pectied in eater or ...


2026-01-25 00:24:34,385 - INFO - Saving CSV table to scratch/citi_nov_2025.csv
2026-01-25 00:24:34,388 - INFO - Document converted and tables exported in 17.68 seconds.
