# Prepare finetuning dataset of receipts
The dataset comes from Kaggle (`https://www.kaggle.com/datasets/dhiaznaidi/receiptdatasetssd300v2`) and contains:
- Images under `dataset/images` of receipts
- Extracted data under `dataset/gdt` containing `company`, `total`, `date` and `address`
- Metadata about the extracted data under `dataset/info_data`, containing the important words and their coordinates

The expected data should contain `receipt_path`, `schema` (for now just the company, date, total and address), and the `output`

In [None]:
import json
import os
from typing import List

from utils import ReceiptData

# Collected fine-tuning samples, one ReceiptData per file in dataset/info_data.
data: List[ReceiptData] = []
# The same schema is attached to every sample: field name -> "type//description".
fixed_schema = {
    "total": "number//total amount of the invoice",
    "company": "string//the name of the company or person doing the supply",
    "date": "date//the date of the invoice",
    "address": "string//address of the person or company doing the supply",
}
# os.listdir returns entries in arbitrary order; sort so the dataset order
# (and the sample printed at the end) is reproducible between runs.
for receipt in sorted(os.listdir("dataset/info_data")):
    # JSON is read as UTF-8 explicitly rather than with the platform default.
    with open(f"dataset/info_data/{receipt}", "r", encoding="utf-8") as f:
        receipt_info = json.load(f)

    # A missing image path fails here with a KeyError that names the field.
    image_path = receipt_info["image_path"]
    # The ground-truth file is named after the image: ".../<id>.jpg" -> "<id>".
    # Using the basename avoids relying on the path having a fixed depth.
    receipt_id = os.path.splitext(os.path.basename(image_path))[0]

    with open(f"dataset/gdt/{receipt_id}.json", "r", encoding="utf-8") as df:
        ground_truth = json.load(df)

    # Build a new mapping instead of rewriting the dict while iterating over it.
    extracted_data = {}
    for k, v in ground_truth.items():
        if k in receipt_info:
            coordinates = receipt_info[k]
        else:
            # The info file has no entry for this field: use an all-zero box.
            coordinates = {"xmin": 0, "ymin": 0, "xmax": 0, "ymax": 0}
        extracted_data[k] = {"value": v, "coordinates": coordinates}

    receipt_data = ReceiptData(
        # Rewrite the path recorded in the annotation to the local dataset folder.
        receipt_path=image_path.replace("/content/Dataset/train/", "dataset/"),
        schema=fixed_schema,
        output=extracted_data,
    )
    data.append(receipt_data)

# Guard the preview so an empty dataset directory does not raise IndexError.
print(len(data), data[0] if data else None)