In [1]:
import pandas as pd
import joblib
import re
import pickle

In [3]:
# Location of the pickled classifier bundle, relative to the working directory.
ARTIFACT_PATH = "artifacts/item_row_classifier.pkl"

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point ARTIFACT_PATH at artifacts from a trusted source.
with open(ARTIFACT_PATH, "rb") as artifact_file:
    artifact = pickle.load(artifact_file)

# The bundle is read like a mapping: "model" and "features" are required keys,
# "threshold" is optional and falls back to 0.5 when absent.
model, FEATURES_FINAL = artifact["model"], artifact["features"]
THRESHOLD = artifact.get("threshold", 0.5)

print("Loaded features:", FEATURES_FINAL)
print("Threshold:", THRESHOLD)

Loaded features: ['row_ratio', 'word_count', 'digit_ratio', 'starts_alpha', 'price_count', 'price_at_end', 'ends_with_price_strict', 'multi_price', 'has_qty_pattern', 'has_blacklist', 'has_item_hint', 'has_summary_keyword']
Threshold: 0.5


In [4]:
# Read the OCR row table; every feature cell below adds a column to this frame.
df = pd.read_csv(filepath_or_buffer="ocr_rows_for_labeling.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,image,row_idx,row_ratio,type,text
0,X00016469612.jpg,0,0.000,header,tan woon yann
1,X00016469612.jpg,1,0.038,header,BOOK TA.K (TAMAN DAYA) SDN BHD
2,X00016469612.jpg,2,0.077,header,789417-W
3,X00016469612.jpg,3,0.115,header,"NO.5: 55,57 & 59, JALAN SAGU 18,"
4,X00016469612.jpg,4,0.154,header,"TAMAN DAYA,"
...,...,...,...,...,...
21584,X51009453804.jpg,20,0.833,header,(RM) (RM)
21585,X51009453804.jpg,21,0.875,item,SR@A 11.32 068
21586,X51009453804.jpg,22,0.917,summary,Total 11.32 0.68
21587,X51009453804.jpg,23,0.958,footer,THANK YOU


### **Layout**

In [5]:
# Layout feature: row_ratio comes straight from the CSV; force it to float64
# so the model always receives a floating-point column.
df["row_ratio"] = df["row_ratio"].astype("float64")

### **Lexical**

In [17]:
# Number of whitespace-separated tokens made up solely of alphabetic
# characters. Tokens containing digits or punctuation (e.g. "DAYA,") are not
# counted. Missing text is treated as an empty string.
df["word_count"] = (
    df["text"]
    .fillna("")
    .apply(lambda s: sum(1 for token in s.split() if token.isalpha()))
)

In [18]:
def digit_ratio(text):
    """Return the fraction of characters in ``text`` that are digits.

    ``text`` is converted with ``str()`` first, so non-string values are
    measured on their string form. The denominator is clamped to at least 1,
    which makes an empty string yield 0.0 instead of dividing by zero.
    """
    chars = str(text)
    digit_total = 0
    for ch in chars:
        if ch.isdigit():
            digit_total += 1
    return digit_total / max(len(chars), 1)

# No fillna here: digit_ratio stringifies its argument itself, so missing
# values are scored on their string form rather than raising.
df["digit_ratio"] = df["text"].apply(func=digit_ratio)

In [20]:
# 1 when the first character of the row is alphabetic, else 0.
# Slicing with [:1] yields "" for empty text, and "".isalpha() is False,
# so empty or missing rows score 0 without a separate length check.
df["starts_alpha"] = (
    df["text"]
    .fillna("")
    .apply(lambda s: 1 if s[:1].isalpha() else 0)
)

### **Price Structure**

In [22]:
# Count price-like tokens: one or more digits, a dot, then two digits.
# The pattern is not anchored, so it matches anywhere inside the row text.
df["price_count"] = (
    df["text"]
    .fillna("")
    .str.findall(r"\d+\.\d{2}")
    .apply(lambda matches: len(matches))
)

In [23]:
# 1 when the row finishes with a price-like token (digits, dot, two digits),
# allowing trailing whitespace after it; 0 otherwise.
df["price_at_end"] = (
    df["text"]
    .fillna("")
    .apply(lambda s: 1 if re.search(r"\d+\.\d{2}\s*$", s) else 0)
)

In [24]:
# Same idea as a trailing-price check, but on whitespace-stripped text:
# str.match anchors at the start, ".*" consumes the leading part, and the
# price token must sit immediately before the end of the string.
_stripped_text = df["text"].fillna("").str.strip()
_ends_with_price = _stripped_text.str.match(r".*\d+\.\d{2}$")
df["ends_with_price_strict"] = _ends_with_price.astype(int)

In [25]:
# 1 when the row holds two or more price-like tokens (see price_count), else 0.
df["multi_price"] = df["price_count"].ge(2).astype("int64")

In [26]:
# Flag rows containing a quantity expression such as "2 x 3.50", "3X4" or
# "1 @ 11.32": digits, then x / X / @, then digits with an optional ".dd" tail.
# The optional tail is a non-capturing group (?:...). str.contains only tests
# for a match, and a capturing group there makes pandas emit a
# "pattern ... has match groups" UserWarning on every run. The set of matching
# rows is unchanged.
df["has_qty_pattern"] = (
    df["text"]
    .fillna("")
    .str.contains(r"\b\d+\s*[xX@]\s*\d+(?:\.\d{2})?", regex=True)
    .astype(int)
)

  .str.contains(r"\b\d+\s*[xX@]\s*\d+(\.\d{2})?", regex=True)


### **Semantic Control**

In [27]:
def _hits_blacklist(lowered):
    """Return 1 when any blacklist term is a substring of ``lowered``, else 0.

    Matching is plain substring containment on already-lowercased text, so a
    term also fires inside longer words.
    """
    for term in (
        "total", "subtotal", "cash", "change", "tax", "gst",
        "visa", "master", "paid", "balance",
    ):
        if term in lowered:
            return 1
    return 0


# Missing text is treated as "", which contains none of the terms.
df["has_blacklist"] = df["text"].fillna("").str.lower().apply(_hits_blacklist)

In [28]:
# Unit / quantity hints searched for anywhere in the lowercased row text.
# NOTE(review): matching is by plain substring, so the one-letter entries
# "g" and "x" fire on any text containing those letters (and make "kg"
# redundant). Left unchanged because the loaded model consumes this column
# as-is; confirm how the feature was computed at training time before
# tightening it.
ITEM_HINT_WORDS = [
    "pcs",
    "pack",
    "kg",
    "g",
    "ltr",
    "ml",
    "x",
]

df["has_item_hint"] = (
    df["text"].fillna("").str.lower()
    .apply(lambda row: 1 if any(hint in row for hint in ITEM_HINT_WORDS) else 0)
)

In [29]:
# Keywords that are looked up, as substrings, in the lowercased row text.
SUMMARY_KW = [
    "total",
    "subtotal",
    "tax",
    "gst",
    "cash",
    "change",
    "rounding",
    "amount",
    "balance",
    "paid",
]

# 1 when at least one keyword occurs in the row, else 0; missing text -> "".
df["has_summary_keyword"] = (
    df["text"].fillna("").str.lower()
    .apply(lambda row: 1 if any(kw in row for kw in SUMMARY_KW) else 0)
)

In [30]:
# Feed the model exactly the columns, in the order, stored with the artifact.
X = df.loc[:, FEATURES_FINAL]

# Column index 1 of predict_proba is used as the item score.
# NOTE(review): presumably index 1 is the "item" class; confirm against
# model.classes_.
_class_probs = model.predict_proba(X)
df["item_prob"] = _class_probs[:, 1]

# Hard decision: a row is predicted as an item when its score reaches THRESHOLD.
df["is_item_pred"] = df["item_prob"].ge(THRESHOLD).astype(int)

In [31]:
# Spot-check the first ten rows: raw text next to its score and decision.
df.head(10)[["text", "item_prob", "is_item_pred"]]

Unnamed: 0,text,item_prob,is_item_pred
0,tan woon yann,0.521603,1
1,BOOK TA.K (TAMAN DAYA) SDN BHD,0.523426,1
2,789417-W,0.115856,0
3,"NO.5: 55,57 & 59, JALAN SAGU 18,",0.482904,0
4,"TAMAN DAYA,",0.44926,0
5,"81100 JOHOR BAHRU,",0.327924,0
6,JOHOR.,0.413691,0
7,Document No :TD01167104,0.298171,0
8,Date : 25/12/2018 8:13:39 PM,0.244553,0
9,Cashier: MANIS,0.175366,0


In [36]:
# How many rows fall on each side of the threshold.
df.loc[:, "is_item_pred"].value_counts()

is_item_pred
0    14905
1     6684
Name: count, dtype: int64

In [38]:
# Work on a copy so the export frame is independent of df.
df_out = df.copy()

# Score the feature matrix X and derive the thresholded decision.
# NOTE(review): the scoring cell above writes the same two columns onto df,
# so this recomputation looks redundant when the cells run in order; kept so
# the export does not rely on that cell having populated them.
_export_scores = model.predict_proba(X)[:, 1]
df_out["item_prob"] = _export_scores
df_out["is_item_pred"] = df_out["item_prob"].ge(THRESHOLD).astype(int)

# Persist without the positional index column.
df_out.to_csv("rows_with_item_prediction.csv", index=False)