In [20]:
import pandas as pd
import re
from datetime import datetime
import json

In [2]:
# Load the per-row table from CSV. Later cells read its "image", "row_idx",
# "text" and "item_prob" columns.
df = pd.read_csv("rows_with_item_prediction.csv")

In [3]:
def group_rows_by_image(df):
    """Map every distinct ``image`` value to its rows as a list of dicts.

    Inside each group the rows are ordered by ``row_idx``; every row becomes
    one ``{column: value}`` record.
    """
    return {
        image_key: image_rows.sort_values("row_idx").to_dict("records")
        for image_key, image_rows in df.groupby("image")
    }

# Build the {image: [row dicts]} lookup that the receipt loop iterates over.
rows_by_image = group_rows_by_image(df)

In [4]:
def safe_text(x):
    """Return ``str(x)``, or an empty string when pandas treats ``x`` as missing."""
    return "" if pd.isna(x) else str(x)

In [5]:
def extract_company(rows_merged):
    """Guess the merchant name from the top of a receipt.

    Only the first five rows are inspected. A row is skipped when its text is
    shorter than three characters or consists solely of digits, dots, dashes,
    colons and whitespace. The first remaining row whose text contains one of
    the business keywords, or is entirely upper-case, is returned unchanged.

    Returns:
        The matching row text, or ``""`` when no row qualifies.
    """
    # Words that commonly appear in company / shop names.
    business_words = ("sdn", "bhd", "trading", "enterprise", "mart", "store", "cafe")
    digits_and_symbols = r"[0-9\.\-:\s]+"

    for row in rows_merged[:5]:
        candidate = safe_text(row.get("text"))

        # Too short, or nothing but numbers/symbols: cannot be a name.
        too_short = len(candidate) < 3
        if too_short or re.fullmatch(digits_and_symbols, candidate):
            continue

        # A keyword hit or an all-caps line is taken to be the shop name.
        lowered = candidate.lower()
        has_keyword = any(word in lowered for word in business_words)
        if has_keyword or candidate.isupper():
            return candidate

    return ""

In [6]:
def clean_company_name(name):
    """Normalise a raw merchant line into a short upper-case company name.

    Processing order:
      1. ``None`` or blank input yields ``"UNKNOWN"``.
      2. The text is stripped and upper-cased.
      3. Lines containing receipt boilerplate (greetings, tax wording) yield
         ``"UNKNOWN"``.
      4. Lines shaped like "<digits><separator><5+ letters/spaces>" yield
         ``"UNKNOWN"``.
      5. An unclosed trailing parenthesis and any parenthesised group that
         starts with digits are removed.
      6. Trailing business suffixes (SDN BHD, ENTERPRISE, TRADING, ...) are
         stripped, one pattern after another.
      7. Whitespace is collapsed; a remainder shorter than three characters
         that has no run of two letters yields ``"UNKNOWN"``.

    Step 5 runs before step 6 on purpose: the suffix patterns are anchored at
    the end of the string, so a trailing "(12345-X)" would otherwise shield
    "SDN BHD" from being stripped.

    Args:
        name: Raw candidate text, or ``None``.

    Returns:
        The cleaned upper-case name, or the string ``"UNKNOWN"``.
    """
    if name is None:
        return "UNKNOWN"

    name = str(name).strip()
    if not name:
        return "UNKNOWN"

    name = name.upper()

    noise_keywords = [
        "THANK YOU", "VISIT AGAIN", "WELCOME",
        "GST", "TAX INVOICE", "RECEIPT"
    ]
    if any(k in name for k in noise_keywords):
        return "UNKNOWN"

    # A leading number followed by words is rejected outright.
    if re.match(r'^\d+[,.\s]+[A-Z\s]{5,}', name):
        return "UNKNOWN"

    # Remove a parenthesis that was opened but never closed.
    name = re.sub(r'\([^)]*$', '', name)

    # Remove parenthesised groups that start with digits (registration numbers).
    name = re.sub(r'\(\d+[^)]*\)', '', name)

    # Tidy up before suffix matching so the "$" anchors see the real end.
    name = re.sub(r'\s+', ' ', name).strip()

    suffix_patterns = [
        r'\bSDN\.?\s*BHD\.?$',
        r'\bENTERPRISE$',
        r'\bTRADING$',
        r'\bRESTAURANT$',
        r'\bSHOP$',
        r'\bSTORE$'
    ]
    for pat in suffix_patterns:
        name = re.sub(pat, '', name).strip()

    name = re.sub(r'\s+', ' ', name).strip()

    if len(name) < 3 and not re.search(r'[A-Z]{2,}', name):
        return "UNKNOWN"

    return name

In [7]:
def extract_date(rows_merged):
    """Find the first date-looking substring in a receipt's rows.

    The text of all rows is joined with single spaces and searched with each
    pattern in turn (case-insensitively); the first pattern that matches
    anywhere wins.

    Args:
        rows_merged: Iterable of row dicts; each row's ``"text"`` is read.

    Returns:
        The matched substring exactly as it appears in the text, or ``None``
        when no pattern matches.
    """
    patterns = [
        # Numeric dates with "/" or "-" separators. The year-first alternative
        # comes first so that "2018-03-15" is returned whole instead of being
        # cut down to "18-03-15" by the day-first alternative.
        r"\d{4}[\/\-]\d{1,2}[\/\-]\d{1,2}|\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}",
        # Day, month name (abbreviated or spelled out), year.
        r"\d{1,2}\s+(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[a-z]*\s+\d{2,4}"
    ]

    # Search the rows that were passed in; reading a module-level `rows` here
    # would only work when a caller happens to have leaked that global.
    full_text = " ".join(safe_text(r.get("text")) for r in rows_merged)

    for pat in patterns:
        m = re.search(pat, full_text, re.IGNORECASE)
        if m:
            return m.group(0)
    return None

In [8]:
def ml_item_filter(rows, threshold=0.5):
    """Keep the rows whose ``item_prob`` is at least ``threshold``.

    A row without an ``item_prob`` key is treated as probability 0. Input
    order is preserved.
    """
    kept = []
    for row in rows:
        if row.get("item_prob", 0) >= threshold:
            kept.append(row)
    return kept

In [9]:
# Regex source for a price token: one or more digits, a dot, exactly two digits.
PRICE_RE = r"\d+\.\d{2}"

# Lower-case substrings that flag a line as a summary/payment line rather than
# a purchased item. is_non_item_line lower-cases the text and tests each entry
# with a plain substring check.
NON_ITEM_KEYWORDS = [
    "total", "subtotal", "cash", "change",
    "rounding", "tax", "gst", "service",
    "visa", "master", "balance", "discount",
    "amount", "paid"
]

In [10]:
def is_non_item_line(text: str) -> bool:
    """Report whether ``text`` contains any ``NON_ITEM_KEYWORDS`` entry.

    The text is lower-cased first; each keyword is tested as a substring.
    """
    lowered = text.lower()
    for keyword in NON_ITEM_KEYWORDS:
        if keyword in lowered:
            return True
    return False

In [11]:
def extract_prices(text):
    """Return every price token found in ``text`` as a float, left to right.

    Tokens are located with the shared ``PRICE_RE`` pattern, the same one
    ``clean_description`` uses to strip prices, so the two functions cannot
    drift apart.

    Returns:
        A list of floats; empty when ``text`` holds no price token.
    """
    return [float(token) for token in re.findall(PRICE_RE, text)]

In [12]:
def clean_description(text):
    """Strip prices and quantity markers from an item line.

    Every ``PRICE_RE`` match is removed, then stand-alone ``x`` / ``X`` tokens
    and every ``@`` character, and finally whitespace is collapsed and trimmed.

    Returns:
        The remaining text (possibly empty).
    """
    text = re.sub(PRICE_RE, "", text)
    # "@" is not a word character, so \b@\b could only match when it was glued
    # between two word characters; a free-standing "@" (as in "2 @ 3.50") was
    # never removed. Match "@" on its own and keep the word-boundary rule for
    # the letter "x" only, so words such as "BOX" or "XL" stay intact.
    text = re.sub(r"\b[xX]\b|@", "", text)
    return re.sub(r"\s+", " ", text).strip()

In [13]:
def infer_qty_and_price(text, prices):
    """Derive ``(qty, unit_price, total)`` for one item line.

    The last price is always the line total. By default the quantity is 1 and
    the unit price equals the total. When the text contains a
    "<number> x|X|@ <digit>" marker and at least two prices are present, the
    second-to-last price is tried as the unit price:

    * first with the quantity written in the marker (so quantities of 10 or
      more are recognised), and
    * then with every quantity from 2 to 9, which covers lines where the
      marker captured something other than the quantity (e.g. "3.50 x 2").

    A candidate is accepted when ``unit * qty`` is within 0.01 of the total.

    Args:
        text: The raw item line.
        prices: Prices found in ``text``, in order of appearance.

    Returns:
        A ``(qty, unit_price, total)`` tuple. For an empty ``prices`` list the
        result is ``(1, 0.0, 0.0)``.
    """
    if not prices:
        # Nothing to infer from; a zero total lets callers discard the line.
        return 1, 0.0, 0.0

    total = prices[-1]
    qty = 1
    unit_price = total

    marker = re.search(r"\b(\d+)\s*[xX@]\s*\d", text)
    if marker and len(prices) >= 2:
        candidate_unit = prices[-2]
        stated_qty = int(marker.group(1))
        # Trust the printed quantity first, then fall back to the small scan.
        for q in (stated_qty, *range(2, 10)):
            if q >= 1 and abs(candidate_unit * q - total) < 0.01:
                qty = q
                unit_price = candidate_unit
                break

    return qty, unit_price, total

In [14]:
def confidence_score(desc, qty, unit_price, total):
    """Score an extracted item between 0.0 and 1.0 from four weighted checks.

    Weights: description of at least four characters (0.3), quantity of at
    least one (0.2), positive unit price and total (0.3), and
    ``qty * unit_price`` within 0.05 of ``total`` (0.2). The sum is rounded to
    two decimals.
    """
    weighted_checks = (
        (len(desc) >= 4, 0.3),
        (qty >= 1, 0.2),
        (unit_price > 0 and total > 0, 0.3),
        (abs(qty * unit_price - total) < 0.05, 0.2),
    )
    # Start from 0.0 so the result is a float even when no check passes.
    earned = sum((weight for passed, weight in weighted_checks if passed), 0.0)
    return round(earned, 2)

In [15]:
def process_receipt(rows, prob_threshold=0.5):
    """Turn one receipt's rows into a ``{"company", "date", "items"}`` dict.

    The company and date are taken from the full row list. Item candidates are
    the rows that pass ``ml_item_filter`` at ``prob_threshold``; a candidate
    is dropped when it is a non-item line, carries no price, has a cleaned
    description shorter than three characters, or ends up with a total that is
    not positive.

    Args:
        rows: Row dicts for a single receipt.
        prob_threshold: Minimum ``item_prob`` for a row to be considered.

    Returns:
        A dict with the cleaned company name, the date string (or ``None``)
        and the list of item dicts.
    """
    # Header fields
    header_company = clean_company_name(extract_company(rows))
    header_date = extract_date(rows)

    # Item rows selected by the model probability
    parsed_items = []
    for row in ml_item_filter(rows, prob_threshold):
        line = safe_text(row.get("text"))

        if is_non_item_line(line):
            continue

        line_prices = extract_prices(line)
        if not line_prices:
            continue

        description = clean_description(line)
        if len(description) < 3:
            continue

        quantity, unit, line_total = infer_qty_and_price(line, line_prices)
        if line_total <= 0:
            continue

        parsed_items.append({
            "description": description,
            "qty": quantity,
            "unit_price": round(unit, 2),
            "line_total": round(line_total, 2),
            "confidence": confidence_score(description, quantity, unit, line_total),
            "row_idx": row["row_idx"],
        })

    return {"company": header_company, "date": header_date, "items": parsed_items}

In [16]:
# Run the extraction for every image; results are keyed by the image value.
results = {}

# NOTE(review): `img` and `rows` are module-level names here and stay bound
# after the loop ends, so any function that reads a global `rows` instead of
# its own argument would silently see the receipt currently being processed.
for img, rows in rows_by_image.items():
    results[img] = process_receipt(rows, prob_threshold=0.5)

In [21]:
# Serialise before opening the file: if `results` holds a value json cannot
# encode, the error is raised while the previous export is still intact,
# instead of leaving a truncated file behind.
OUTPUT_PATH = "receipts_output.json"
payload = json.dumps(results, indent=2, ensure_ascii=False)

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write(payload)

print(f"Exported {OUTPUT_PATH}")

Exported receipts_output.json
