<a href="https://colab.research.google.com/github/ackmase/sandbox/blob/main/Fraudulent_GenAI_Receipt_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Get necessary packages.
# https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html
# Note that Google CoLab has the following system set up:
# - python version: 3.12.11
# - pip version: 24.1.2
# - platform: 64 bit, x86_64
# - CPU: Intel Xeon CPU @ 2.00GHz
# - Nvidia GPU: T4
# - cuda version: 12.5.82
#
# PyTorch stack, pinned and pulled from the CUDA 12.4 (cu124) wheel index.
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
# PaddlePaddle GPU build, pinned to 3.1.1.
# NOTE(review): this uses the cu118 wheel index, while the torch line above uses
# cu124 and the runtime listed above reports CUDA 12.5 — confirm the mix is intended.
!pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
# PaddleOCR itself. Unpinned, so the installed version may differ between runs.
!pip install paddleocr
# Remaining dependencies. Only bitsandbytes and accelerate are pinned here.
# NOTE(review): pytesseract, easyocr, keras_ocr and datasets are not imported in
# the cells visible in this notebook — confirm they are still needed.
!pip install -q requests bitsandbytes==0.46.0 transformers accelerate==1.3.0 openai pytesseract easyocr keras_ocr datasets
# gradio drives the UI cell; diffusers is imported by the import cell.
!pip install diffusers gradio

In [2]:
# Standard libraries
import io
import numpy as np
import os
from typing import List

# Third-party libraries
import gradio as gr
from diffusers import DiffusionPipeline
from google.colab import userdata
from huggingface_hub import login
from openai import OpenAI
from paddleocr import PaddleOCR
from PIL import Image
import requests
import torch
from transformers import pipeline, VisionEncoderDecoderModel

In [3]:
# Credentials are read from Colab's per-user secret store (`userdata`) so that
# no token is ever written into the notebook itself.
hf_token = userdata.get('HF_TOKEN')
# Authenticate against the Hugging Face Hub; add_to_git_credential=True asks
# huggingface_hub to also register the token with the git credential helper.
login(hf_token, add_to_git_credential=True)
# The OpenAI key is stored under the secret name 'openAI'.
openai_token = userdata.get('openAI')
# `openai` is the client instance (not the module); it is the object that
# DetermineVeracityOfReceiptText calls `chat.completions.create` on.
openai = OpenAI(api_key=openai_token)

In [7]:
# Constants
from datetime import date

# ISO-8601 date (YYYY-MM-DD) captured when this cell runs. It is embedded in the
# system prompt so the model can judge dates printed on a receipt.
today_str = date.today().isoformat()

# System prompt for the receipt-veracity model.
# This must be an f-string: as a plain string the model would receive the
# literal text "{today_str}" instead of the actual date.
SYSTEM_PROMPT = f"""
You are a helpful receipt checker for a company that gives out rewards based \
on receipts submitted by customers. You look at OCR text extracted from these \
receipts and determine whether you think it looks real or not. Remember that \
OCR extraction is still imperfect, and even real receipts can have \
imperfections. Also, watch out! Fraudsters are getting better at faking \
receipts these days.

Today's date is {today_str}.

Given OCR receipt text, please answer with one of the following options:
Real
Fake
Needs Human Validation

Then explain why you think so.
"""

# User-turn prompt prefix. DetermineVeracityOfReceiptText appends the OCR text
# directly after it, so the string deliberately ends with a blank line.
# The real/fake examples below are few-shot samples and are reproduced verbatim,
# OCR errors included.
USER_PROMPT = """
You are given imperfect OCR text extractions from photos snapped by a mobile \
phone. When determining the veracity of the receipt, please ignore the \
following:
- all formatting and spacing
- weird OCR artifacts (e.g.,  or odd character strings like 'arr   el)')
- odd characters

Your job is to try and figure out if the receipt text is real or fraudulent. \
Watch out! Fraudsters are getting better at faking receipts these days.

Here's an example of a real receipt:
CVS pharmacy2115 ARIESIA BLVD SUITE 100
REDONDO BEACH,CA 90278
310.214.3974
REG#18 TRN#6711 CSHR#0000098 STR#10795
ExtraCare Card H: ********3Y00
POST GRP NUTS CRL Z0.5 3.99F
ORIGINAL PRICE 6.49
3.49 EACH 3.00
Survey_ID #
6469 3692 6776 093 75
TOTAL 3.49
CHARGE 3.49
************2496 RF
VISA CREDIT *************2496
RPPHOVEDE 757900 REF# 187119
TRAN TYPE: SALE AID: A0000000031010
TC:_1999600P97E1FZ51 TERMINPL#
NO SIGNATURE REQUIRED CVM: 280000
TSI(98): 0000
TVRC95): 0000000000
.00
CHANGE
3510 7955 2276 7111 88
Returns with receipt, subJect to
cVS Return Po11cy, thru 10/14/2025
Refund anount is based on Price
after all coupons and discounts.
9:37 AM
AUGUST 15, 2025
L
14 no-cost vaccines
avallable with most insurance
Protect yourself against RSV,
shingles, Tdap & more.
Subject to availability.
Restrictions apply.
Scan the QR to schedule your vaccination.

Note that the real receipt has some jargon and abbreviations and spelling \
errors.

Here's an example of a fake receipt:
SEASIDE MARKET
8/6/2025
TRIX 3.29
OATMEAL CRISP 4.79
TOTAL 8.08

Note that the fake receipt is lacking important information typically found in \
a transaction like cash tendered or masked credit card information.

Please answer with one of the following options:
Real
Fake
Needs Human Validation

Then explain why you think so.

Here is OCR text extracted from a receipt I'd like you to review:

"""

# Hosted chat model; this is the identifier DetermineVeracityOfReceiptText
# passes to the OpenAI client.
GPT_MODEL = "gpt-4o-mini"

# Hugging Face model identifiers kept for experimentation.
# NOTE(review): none of these are referenced by the cells visible in this
# notebook — confirm whether they are still needed.
DOCLING_MODEL = "ds4sd/SmolDocling-256M-preview"
GEMMA_MODEL = "google/gemma-3-270m"
LFM2_MODEL = "LiquidAI/LFM2-VL-450M"
LIQUID_MODEL = "LiquidAI/LFM2-1.2B"
NVIDIA_MODEL = "nvidia/Nemotron-Research-Reasoning-Qwen-1.5B"
TR_OCR_MODEL = "microsoft/trocr-base-printed"


In [None]:
# Function for checking the veracity of OCR text extracted from a receipt image

def DetermineVeracityOfReceiptText(text):
    """
    Ask the chat model whether OCR text from a receipt looks genuine.

    Args:
        text (str): OCR text extracted from the receipt; it is appended to
            USER_PROMPT to form the user message.

    Returns:
        str: The model's full reply, assembled from the streamed chunks.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT + text},
    ]
    stream = openai.chat.completions.create(
        model=GPT_MODEL,
        messages=messages,
        stream=True,
    )

    pieces = []
    for chunk in stream:
        # A streamed chunk may carry an empty `choices` list; indexing it
        # unconditionally would raise IndexError and lose the whole reply.
        if not chunk.choices:
            continue
        # `delta.content` can be None (e.g. on the closing chunk).
        pieces.append(chunk.choices[0].delta.content or "")
    return "".join(pieces)

In [17]:
"""Helper functions for formatting PaddleOCR output into readable text.

Usage:

paddleocr_ppocrv5_to_text(
    res, # JSON result from PaddleOCR
    y_threshold=5) # Vertical proximity threshold to group words into lines

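Note that for receipts, where each line is very close to adjacent lines, it
seems that y_threshold=5 is the right balance to get the correct formatting.
Increase this number to 10 or 15 if words from the same printed line are being
split across separate output lines; decrease it if words from adjacent lines
are being merged into a single line. (The function's own default is
y_threshold=15.)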
"""

import json

def paddleocr_ppocrv5_to_text(ocr_result, y_threshold=15):
    """
    Rebuild line-oriented text from a PaddleOCR PP-OCRv5 recognition result.

    Each recognised fragment is assigned to a text line by comparing the
    vertical centre of its box with the running mean centre of the lines found
    so far; fragments on a line are then ordered left to right.

    Args:
        ocr_result: Mapping exposing "rec_boxes" (one
            (x_min, y_min, x_max, y_max) box per fragment) and "rec_texts"
            (the matching strings).
        y_threshold (int): Largest distance between a fragment's vertical
            centre and a line's mean centre for the fragment to join that line.

    Returns:
        str: Lines joined with newlines, fragments within a line joined with
        single spaces. Empty string when there are no fragments.
    """
    rec_boxes = ocr_result["rec_boxes"]
    rec_texts = ocr_result["rec_texts"]

    # One (vertical centre, left edge, text) record per recognised fragment.
    fragments = []
    for (left, top, _right, bottom), label in zip(rec_boxes, rec_texts):
        fragments.append(((top + bottom) / 2, left, label))

    # Walk the fragments top to bottom, attaching each to the first row whose
    # mean centre is within the threshold, or opening a new row otherwise.
    rows = []
    for fragment in sorted(fragments, key=lambda frag: frag[0]):
        centre = fragment[0]
        for row in rows:
            if abs(row["mean_y"] - centre) <= y_threshold:
                members = row["members"]
                members.append(fragment)
                row["mean_y"] = sum(m[0] for m in members) / len(members)
                break
        else:
            rows.append({"mean_y": centre, "members": [fragment]})

    # Within each row, read the fragments left to right.
    rendered_rows = (
        " ".join(m[2] for m in sorted(row["members"], key=lambda m: m[1]))
        for row in rows
    )
    return "\n".join(rendered_rows)

In [18]:
# Define functions for using PaddleOCR
# Lazily-created PaddleOCR engine shared by every PullOCRText call, so the
# models are loaded once instead of on every request.
# NOTE(review): the engine is now shared between calls; if Gradio callbacks can
# run concurrently, confirm `predict` is safe to share or serialise the calls.
_PADDLE_OCR_ENGINE = None


def _get_paddle_ocr():
    """Return the shared PaddleOCR engine, constructing it on first use."""
    global _PADDLE_OCR_ENGINE
    if _PADDLE_OCR_ENGINE is None:
        _PADDLE_OCR_ENGINE = PaddleOCR(
            lang="en",
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
        )
    return _PADDLE_OCR_ENGINE


def PullOCRText(image, downscale=4):
    """
    Run PaddleOCR on a receipt image and return the recognised text.

    Args:
        image: PIL image (it must provide `.size` and `.resize`).
        downscale (int): Integer factor each side is divided by before OCR to
            make inference cheaper. Values below 1 are treated as 1.

    Returns:
        str: Text rebuilt by paddleocr_ppocrv5_to_text with y_threshold=5; when
        the engine yields several results they are separated by a newline.
    """
    factor = max(1, int(downscale))

    # Resize image to make inference cheaper. Each side is clamped to at least
    # one pixel so very small images do not produce a zero-sized resize.
    print("logging: resizing image")
    w, h = image.size
    resized_image = image.resize(
        (max(1, w // factor), max(1, h // factor)), Image.LANCZOS
    )

    # Convert image into numpy.ndarray
    pixels = np.array(resized_image)

    print("logging: running OCR inference")
    result = _get_paddle_ocr().predict(pixels)
    print("logging: OCR inference complete")

    # Construct output and return
    page_texts = []
    for res in result:
        res.print()
        print("logging: constructing output")
        page_texts.append(paddleocr_ppocrv5_to_text(res, y_threshold=5))

    print("logging: returning output")
    # Joining with a newline keeps the last line of one result from being glued
    # to the first line of the next.
    return "\n".join(page_texts)

In [None]:
# ---------------------------------------------
# Note: plug in your real implementation
# ---------------------------------------------

def _fallback_PullOCRText(image):
    """
    Stand-in OCR used only when no real PullOCRText(image)->str is defined.

    The image is ignored; a fixed notice is returned instead of OCR text.
    """
    notice_parts = (
        "[Fallback OCR] (Replace with PullOCRText) — No OCR engine wired. ",
        "If you already have PullOCRText(image)->str, import it and remove ",
        "this fallback.",
    )
    return "".join(notice_parts)


def _fallback_DetermineVeracityOfReceiptText(text):
    """
    Stand-in explanation used only when no real
    DetermineVeracityOfReceiptText(text)->str is defined.

    The text is ignored; a fixed notice is returned. The final label is later
    parsed from whatever explanation string is returned.
    """
    notice_parts = (
        "[Fallback veracity explanation] Replace with your real ",
        "DetermineVeracityOfReceiptText",
    )
    return "".join(notice_parts)


# Keep any implementation defined by an earlier cell; bind the stand-ins only
# for names that do not exist yet.
if "PullOCRText" not in globals():
    PullOCRText = _fallback_PullOCRText

if "DetermineVeracityOfReceiptText" not in globals():
    DetermineVeracityOfReceiptText = _fallback_DetermineVeracityOfReceiptText


# ---------------------------------------------
# Helpers
# ---------------------------------------------
FINAL_LABELS = ["Real", "Fake", "Needs Human Validation"]

def extract_final_label(explanation_text: str) -> str:
    """
    Extract the final determination from the model's explanation text.

    Matching is case-insensitive and proceeds in this order:
      1. If the first whitespace-separated token contains the standalone word
         "real" or "fake" (surrounding punctuation such as "**Fake**" or
         "Real." is tolerated), that label is returned.
      2. If the text contains "needs human validation" or "needs review",
         "Needs Human Validation" is returned.
      3. Otherwise, if exactly one of the standalone words "real" / "fake"
         occurs anywhere in the text, that label is returned.
      4. In every other case (empty or whitespace-only input, neither word
         present, or both present so the verdict is ambiguous) the result is
         "Needs Human Validation".

    Args:
        explanation_text (str): Free-text explanation; may be empty or None.

    Returns:
        str: One of the entries of FINAL_LABELS.
    """
    import re  # local import so this cell does not depend on another cell's imports

    default_label = "Needs Human Validation"

    # Whitespace-only text has no first token, so guard before splitting.
    if not explanation_text or not explanation_text.strip():
        return default_label

    # Normalize whitespace for robust matching
    text_norm = " ".join(explanation_text.lower().split())
    first_token = text_norm.split(" ", 1)[0]

    def has_word(word: str, haystack: str) -> bool:
        # Letter-only boundaries: "unrealistic" must not count as "real", while
        # markdown wrappers such as "_real_" or "**real**" still must.
        return re.search(rf"(?<![a-z]){word}(?![a-z])", haystack) is not None

    # 1. Verdict given as the leading word of the answer.
    if has_word("real", first_token):
        return "Real"
    if has_word("fake", first_token):
        return "Fake"

    # 2. Explicit request for manual review anywhere in the text.
    if "needs human validation" in text_norm or "needs review" in text_norm:
        return default_label

    # 3. Heuristic fallback: accept a verdict word only when it is unambiguous.
    mentions_real = has_word("real", text_norm)
    mentions_fake = has_word("fake", text_norm)
    if mentions_real != mentions_fake:
        return "Real" if mentions_real else "Fake"

    return default_label


def pipeline(image):
    """
    Run the full receipt check for one uploaded image.

    Steps: OCR the image with PullOCRText, ask DetermineVeracityOfReceiptText
    for an explanation, then parse the final label from that explanation with
    extract_final_label. Failures at any step are reported in the explanation
    and resolve to "Needs Human Validation" rather than raising.

    NOTE: this function's name shadows `pipeline` imported from `transformers`
    in the notebook's import cell; after this cell runs, `pipeline` refers to
    this function.

    Args:
        image: The uploaded receipt image, or None when nothing was uploaded.

    Returns:
        tuple[str, str, str]: (ocr_text, explanation, decision), where decision
        is one of FINAL_LABELS.
    """
    needs_review = "Needs Human Validation"

    if image is None:
        explanation = ("No image provided. Please upload a receipt image. "
                       "Final: Needs Human Validation")
        return "", explanation, needs_review

    # Step 1: OCR
    try:
        ocr_text = PullOCRText(image)
    except Exception as e:
        return "", f"OCR error: {e}. Final: Needs Human Validation", needs_review

    # With no recognised text there is nothing for the model to judge, so skip
    # the model call and route the receipt to a human.
    if not ocr_text or not str(ocr_text).strip():
        explanation = ("No text could be extracted from the image. "
                       "Final: Needs Human Validation")
        return "", explanation, needs_review

    # Step 2: Explanation via DetermineVeracityOfReceiptText
    try:
        explanation = DetermineVeracityOfReceiptText(ocr_text)
        if not isinstance(explanation, str):
            explanation = (f"[Warning] DetermineVeracityOfReceiptText returned "
                           f"non-string type ({type(explanation)}). "
                           f"Final: Needs Human Validation")
    except Exception as e:
        explanation = (f"Error during veracity determination: {e}. "
                       f"Final: Needs Human Validation")

    # Step 3: Final label parsed from explanation text
    decision = extract_final_label(explanation)
    if decision not in FINAL_LABELS:
        decision = needs_review

    return ocr_text, explanation, decision


# ---------------------------------------------
# UI
# ---------------------------------------------
# Two-column layout: upload + trigger on the left, the three results on the right.
with gr.Blocks(title="Receipt Veracity Checker") as demo:
    gr.Markdown("## Receipt Veracity Checker")

    with gr.Row():
        # Left column: image upload and the explicit run button.
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", label="Upload Receipt Image here")
            run_button = gr.Button("Run Analysis", variant="primary")

        # Right column: read-only decision, OCR text and explanation, stacked.
        with gr.Column(scale=1):
            decision = gr.Radio(
                choices=FINAL_LABELS,
                value="Needs Human Validation",
                label="Decision",
                interactive=False,
            )
            ocr_output = gr.Textbox(
                label="Extracted OCR Text", lines=12, interactive=False
            )
            explanation_output = gr.Textbox(
                label="Full Explanation", lines=12, interactive=False
            )

    # The analysis runs both on an explicit click and whenever the image
    # changes; the output order matches the tuple returned by `pipeline`.
    for _register in (run_button.click, image_input.change):
        _register(
            fn=pipeline,
            inputs=image_input,
            outputs=[ocr_output, explanation_output, decision],
        )

# Start the app only when this file is executed as the main program.
if __name__ == "__main__":
    demo.launch(debug=True, show_error=True)