In [1]:
import pandas as pd

In [5]:
# Load the OCR rows from CSV and preview the first ten
df = pd.read_csv(filepath_or_buffer="ocr_rows_for_labeling.csv")
df.head(n=10)

Unnamed: 0,image,row_idx,row_ratio,type,text
0,X00016469612.jpg,0,0.0,header,tan woon yann
1,X00016469612.jpg,1,0.038,header,BOOK TA.K (TAMAN DAYA) SDN BHD
2,X00016469612.jpg,2,0.077,header,789417-W
3,X00016469612.jpg,3,0.115,header,"NO.5: 55,57 & 59, JALAN SAGU 18,"
4,X00016469612.jpg,4,0.154,header,"TAMAN DAYA,"
5,X00016469612.jpg,5,0.192,header,"81100 JOHOR BAHRU,"
6,X00016469612.jpg,6,0.231,header,JOHOR.
7,X00016469612.jpg,7,0.269,header,Document No :TD01167104
8,X00016469612.jpg,8,0.308,header,Date : 25/12/2018 8:13:39 PM
9,X00016469612.jpg,9,0.346,summary,Cashier: MANIS


In [8]:
# Replace missing text with "" and cast every value to str so the per-character
# lambdas in later cells can iterate over it
df["text"] = df["text"].fillna(value="").astype(dtype=str)

In [9]:
# Fraction of characters in each row's text that are alphabetic.
# The `or 1` keeps the denominator non-zero, so an empty string scores 0.0.
df["alpha_ratio"] = df["text"].apply(
    lambda txt: sum(map(str.isalpha, txt)) / (len(txt) or 1)
)

In [10]:
# 1 when the row's text contains at least one digit character, otherwise 0
df["has_digit"] = df["text"].apply(
    lambda txt: 1 if any(map(str.isdigit, txt)) else 0
)

In [11]:
# Number of characters in each row's text
df["char_len"] = df["text"].str.len()

In [12]:
def is_item_candidate(row):
    """Heuristically decide whether one OCR row should be kept as a candidate.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by key, e.g. the Series passed by
        ``df.apply(..., axis=1)``. Must provide ``row["text"]`` (the OCR
        string) and ``row["row_ratio"]`` (a number).

    Returns
    -------
    bool
        ``False`` if the lower-cased text contains any blacklisted keyword
        (substring match). Otherwise ``True`` when at least two of the four
        soft rules below are satisfied.
    """
    raw_text = row["text"]
    # A missing value must count as empty text: str() would otherwise turn it
    # into the literal "none"/"nan", which scores as fully alphabetic.
    if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
        raw_text = ""  # NaN is the only float that is not equal to itself
    text = str(raw_text).lower()

    # 1. Hard reject: discard immediately if any keyword occurs in the text
    blacklist = [
        "total", "subtotal", "tax", "gst", "rounding",
        "cash", "change", "visa", "master", "paid",
        "thank", "receipt", "balance"
    ]
    if any(k in text for k in blacklist):
        return False

    # 2. Feature-based soft rules
    has_digit = any(c.isdigit() for c in text)
    alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
    row_ratio = row["row_ratio"]

    rules = [
        has_digit,
        alpha_ratio > 0.4,
        0.15 < row_ratio < 0.85,
        len(text) >= 5
    ]

    # 3. Voting: the row passes when at least 2 of the 4 rules hold
    return sum(rules) >= 2

In [13]:
# Evaluate the candidate heuristic once per row (axis="columns" passes each row as a Series)
df["is_candidate"] = df.apply(is_item_candidate, axis="columns")

In [14]:
# Tally how many rows the candidate filter marked True vs. False
df["is_candidate"].value_counts()

is_candidate
True     13024
False     8565
Name: count, dtype: int64

In [16]:
# Keep only the rows flagged as candidates; the boolean column is itself the mask,
# so no comparison against True is needed
df_candidates = df[df["is_candidate"]]
df_candidates

Unnamed: 0,image,row_idx,row_ratio,type,text,alpha_ratio,has_digit,char_len,is_candidate
0,X00016469612.jpg,0,0.000,header,tan woon yann,0.846154,0,13,True
1,X00016469612.jpg,1,0.038,header,BOOK TA.K (TAMAN DAYA) SDN BHD,0.733333,0,30,True
2,X00016469612.jpg,2,0.077,header,789417-W,0.125000,1,8,True
3,X00016469612.jpg,3,0.115,header,"NO.5: 55,57 & 59, JALAN SAGU 18,",0.343750,1,32,True
4,X00016469612.jpg,4,0.154,header,"TAMAN DAYA,",0.818182,0,11,True
...,...,...,...,...,...,...,...,...,...
21575,X51009453804.jpg,11,0.458,item,F/Castal! 187057-75 Tack-It 2 SR 12.00,0.394737,1,38,True
21576,X51009453804.jpg,12,0.500,item,75g- White (new) @ 5.6600,0.360000,1,25,True
21584,X51009453804.jpg,20,0.833,header,(RM) (RM),0.444444,0,9,True
21585,X51009453804.jpg,21,0.875,item,SR@A 11.32 068,0.214286,1,14,True


In [17]:
# Draw a reproducible sample of up to 2000 candidate rows (seed 42). Capping n at the
# number of available rows keeps sample() from raising when fewer candidates exist;
# .copy() yields an independent frame that is safe to add columns to.
df_sample = df_candidates.sample(
    n=min(2000, len(df_candidates)), random_state=42
).copy()

In [18]:
# Add an empty "label" column to the sample (presumably filled in during manual labelling)
df_sample["label"] = ""

In [19]:
# Write the selected columns of the sample to CSV, without the index
df_sample[["image", "row_idx", "row_ratio", "text", "label"]].to_csv(
    "manual_label_candidates.csv",
    encoding="utf-8",
    index=False,
)