In [4]:
import logging
import time
from pathlib import Path
from IPython.display import display

import pandas as pd
from docling.document_converter import DocumentConverter

_log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

output_dir = Path("scratch")

doc_converter = DocumentConverter()


def is_float(x):
    cleaned = x.replace(",", "")
    try:
        float(cleaned)
        return True
    except:
        return False

for file in Path("./statements").iterdir():
    bank = Path(file).stem.split("_")[0].lower()
    is_credit_card_stmt = False if "deposit" in str(file) else True

    # TODO: docling isn't reading it properly
    if str(file) != 'statements/uob_jan_2026_deposit.pdf':
        continue

    start_time = time.time()

    conv_res = doc_converter.convert(str(file))
    output_dir.mkdir(parents=True, exist_ok=True)
    doc_filename = conv_res.input.file.stem

    mainframe = pd.DataFrame()
    frames = []

    table_range = [0,-1]

    if bank == 'uob':
        table_range = [1, -1]
    elif bank == 'citi':
        table_range = [3, -3]
    elif bank == 'chocolate':
        table_range = [0, -3]

    # Export tables
    # Skip the first and last table (they tend to be non-transactional)
    for table_ix, table in enumerate(conv_res.document.tables[table_range[0]:table_range[1]]):
        df: pd.DataFrame = table.export_to_dataframe(doc=conv_res.document)
        dropped_rows = []

        # normalize column headers
        df.columns = range(len(df.columns))

        if df.shape[1] != 5:
            print(f"⚠️⚠️⚠️ Table {table_ix} has {df.shape[1]} columns")
            df.insert(3, "missing_col", "")
            display(df)
            print(f"done displaying==================================")

        # for-loop to drop rows
        # skip rows whose columns 2 and 3 do not have float values (debit/credit)
        # skip rows where both columns 2 and 3 are populated (e.g. Total)
        for index, row in df.iterrows():

            # uob credit card
            if bank == 'uob' and is_credit_card_stmt and len(row) != 4:
                dropped_rows.append(row)
                df.drop(index, inplace=True)
                continue

            # uob non-creditcard
            if bank == 'uob' and not is_credit_card_stmt and (
                    (len(row[0]) == 0) or (not is_float(row[2]) and not is_float(row[3]))):
                # print(f"date={row[0]}, desc={row[1]}, credit={row[2]}, debit={row[3]}, balance={row[4]}")
                dropped_rows.append(row)
                df.drop(index, inplace=True)

        display(df)
        frames.append(df)

    mainframe = pd.concat(frames, ignore_index=True)

    # TODO: show dropped rows

    # Save the table as CSV
    element_csv_filename = output_dir / f"{doc_filename}.csv"
    _log.info(f"Saving CSV table to {element_csv_filename}")
    mainframe.to_csv(element_csv_filename, index=False)

    end_time = time.time() - start_time

    _log.info(f"Document converted and tables exported in {end_time:.2f} seconds.")


2026-02-23 22:45:48,797 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2026-02-23 22:45:48,799 - INFO - Going to convert document batch...
2026-02-23 22:45:48,800 - INFO - Initializing pipeline for StandardPdfPipeline with options hash e15bc6f248154cc62f8db15ef18a8ab7
2026-02-23 22:45:48,801 - INFO - Auto OCR model selected ocrmac.
2026-02-23 22:45:48,801 - INFO - Accelerator device: 'mps'
2026-02-23 22:45:50,598 - INFO - Accelerator device: 'mps'
2026-02-23 22:45:51,423 - INFO - Processing document uob_jan_2026_deposit.pdf
2026-02-23 22:46:09,545 - INFO - Finished converting document uob_jan_2026_deposit.pdf in 20.75 sec.


⚠️⚠️⚠️ Table 0 has 4 columns


Unnamed: 0,0,1,2,missing_col,3
0,01 Jan,BALANCE B/F,,,10531.24
1,02 Jan,PAYNOW-FAST PIB2601018759726806 SING SEE SOON ...,5.0,,10526.24
2,03 Jan,One Bonus Interest,13.35,,10539.59
3,05 Jan,NETS Debit-Consumer CS55511585400 xxxxxx0162,1.8,,10537.79
4,05 Jan,NETS Debit-Consumer J & S 88 PT16182252 xxxxxx...,7.2,,10530.59
5,06 Jan,PAYNOW-FAST CHONG PANG OLD TIME,1.8,,10528.79
6,06 Jan,NETS Debit-Consumer MIX VEGETAB12177173 xxxxxx...,4.7,,10524.09
7,07 Jan,PAYNOW-FAST PAYNOW OTHR YONG YOONG JIE JOHN Se...,3.3,,10527.39
8,07 Jan,NETS Debit-Consumer LE TACH PTE08573500 xxxxxx...,0.9,,10526.49
9,07 Jan,PAYNOW-FAST PIB2601078828738812 Kingly OTHR Tr...,1.9,,10524.59




Unnamed: 0,0,1,2,missing_col,3
0,01 Jan,BALANCE B/F,,,10531.24
1,02 Jan,PAYNOW-FAST PIB2601018759726806 SING SEE SOON ...,5.0,,10526.24
2,03 Jan,One Bonus Interest,13.35,,10539.59
3,05 Jan,NETS Debit-Consumer CS55511585400 xxxxxx0162,1.8,,10537.79
4,05 Jan,NETS Debit-Consumer J & S 88 PT16182252 xxxxxx...,7.2,,10530.59
5,06 Jan,PAYNOW-FAST CHONG PANG OLD TIME,1.8,,10528.79
6,06 Jan,NETS Debit-Consumer MIX VEGETAB12177173 xxxxxx...,4.7,,10524.09
7,07 Jan,PAYNOW-FAST PAYNOW OTHR YONG YOONG JIE JOHN Se...,3.3,,10527.39
8,07 Jan,NETS Debit-Consumer LE TACH PTE08573500 xxxxxx...,0.9,,10526.49
9,07 Jan,PAYNOW-FAST PIB2601078828738812 Kingly OTHR Tr...,1.9,,10524.59


Unnamed: 0,0,1,2,3,4
0,09 Jan,Inward CR - GIRO PAYNOW SALA GOVERNMENT TECHNO...,,5964.99,16323.84
1,12 Jan,PAYNOW-FAST LIONEL TAN JUNZER MBK2601118875697871,5.0,,16318.84
2,12 Jan,PAYNOW-FAST STR JAPAN MBK2601118881336347,3.0,,16315.84
3,12 Jan,Funds Trf - FAST PIB2512128534925383 Daniel On...,300.0,,16015.84
4,12 Jan,Funds Trf - FAST PIB2512128534925388 Suphie Lu...,250.0,,15765.84
5,12 Jan,Funds Trf - FAST PIB2512128534925393 IBKR DBS ...,2000.0,,13765.84
6,12 Jan,PAYNOW-FAST CHONG PANG OLD TIME MBK26011288871...,1.8,,13764.04
7,13 Jan,PAYNOW-FAST CHONG PANG OLD TIME MBK26011388985...,4.0,,13760.04
8,15 Jan,PAYNOW-FAST CHONG PANG OLD TIME MBK26011589219...,6.2,,13753.84
9,16 Jan,PAYNOW-FAST PAYNOW OTHR EDWIN LAU JUN HAO For ...,,17.7,13771.54


Unnamed: 0,0,1,2,3,4
0,16 Jan,PAYNOW-FAST PAYNOW OTHR NG YU EE Send back 890...,,17.7,13860.34
1,16 Jan,PAYNOW-FAST PAYNOW OTHR JESSICA TJITRA Baby ha...,,17.7,13878.04
2,16 Jan,NETS Debit-Consumer MIX VEGETAB12077173 xxxxxx...,,6.4,13871.64
3,16 Jan,NETS Debit-Consumer TUCKSHOP (P12089157 xxxxxx...,,2.3,13869.34
4,16 Jan,NETS Debit-Consumer TUCKSHOP (P12099157 xxxxxx...,,0.1,13869.24
5,17 Jan,NETS Debit-Consumer YANG GUO FU10097066 xxxxxx...,,15.6,13853.64
6,17 Jan,PAYNOW-FAST PIB2601178945483316 GREENDOT GOURM...,,85.9,13767.74
7,20 Jan,NETS Debit-Consumer MAPLETREE B00049800 xxxxxx...,,27.25,13740.49
8,20 Jan,PAYNOW-FAST CHONG PANG OLD TIME MBK26012089756...,,1.8,13738.69
9,20 Jan,PAYNOW-FAST CHONG PANG OLD TIME MBK26012089773...,,4.0,13734.69


Unnamed: 0,0,1,2,3,4
0,22 Jan,PAYNOW-FAST PAYNOW OTHR FOO SIANG YANG $2000,,7.0,13523.89
1,22 Jan,PAYNOW-FAST PIB2601229007343561 Bobby SG OTHR ...,5.0,,13518.89
2,23 Jan,"PAYNOW-FAST PAYNOW OTHR LOW JING REN, JOEL Ten...",,6.75,13525.64
3,24 Jan,NETS Debit-Consumer MAPLETREE B12479800 xxxxxx...,13.63,,13512.01
4,24 Jan,PAYNOW-FAST PAYNOW OTHR GLENDA TAY WEN WEI,,6.5,13518.51
5,24 Jan,PAYNOW-FAST PIB2601249029655013 Heng Hui OTHR ...,6.7,,13511.81
6,27 Jan,PAYNOW-FAST PIB2601279061692284 Heng Hui OTHR ...,24.0,,13487.81
7,28 Jan,Bill Payment mBK-UOB Cards 5521632022898588,1193.6,,12294.21
8,28 Jan,Bill Payment mBK-Citi CC 5425503004226675,354.23,,11939.98
9,28 Jan,Bill Payment mBK-HSBC CC 4835850020113032,7.21,,11932.77


Unnamed: 0,0,1,2,3,4
0,29 Jan,PAYNOW-FAST aiqi MBK2601299092207657,50.0,,11896.47
1,29 Jan,PAYNOW-FAST PAYNOW OTHR ONG AI QI,,80.0,11976.47
2,29 Jan,PAYNOW-FAST aiqi MBK2601299092297326,80.0,,11896.47
3,30 Jan,PAYNOW-FAST PAYNOW OTHR ONG AI QI,,100.0,11996.47
4,30 Jan,PAYNOW-FAST PAYNOW OTHR LUA EE HOOK Transfer b...,,50.0,12046.47
5,30 Jan,PAYNOW-FAST aiqi MBK2601309093943263,100.0,,11946.47
6,30 Jan,PAYNOW-FAST CHONG PANG OLD TIME MBK26013090945...,6.0,,11940.47
7,31 Jan,PAYNOW-FAST PIB2601319111831497 fannyelias OTH...,15.0,,11925.47
8,31 Jan,Interest Credit,,0.54,11926.01


2026-02-23 22:46:09,580 - INFO - Saving CSV table to scratch/uob_jan_2026_deposit.csv
2026-02-23 22:46:09,583 - INFO - Document converted and tables exported in 20.79 seconds.
