In [1]:
import importlib
import segment
importlib.reload(segment)
from segment import load_report_with_images, get_docs_with_ocr
from doctr.models import ocr_predictor
import os
import pickle

CACHE_PATH = "../data/cache/ocr_docs3.pkl"

if os.path.exists(CACHE_PATH):
    print("Loading docs from cache...")
    with open(CACHE_PATH, "rb") as f:
        docs = pickle.load(f)
    print("Loaded from cache:", len(docs))
else:
    ocr_model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True).cuda()
    items = load_report_with_images(limit=32)
    print("Loaded:", len(items))
    docs = get_docs_with_ocr(items, ocr_model)
    os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
    with open(CACHE_PATH, "wb") as f:
        pickle.dump(docs, f)
    print("Saved docs to cache:", CACHE_PATH)

docs = [doc for doc in docs if len(doc['segments']) < 120]
print("Processed Docs:", len(docs))

Loading docs from cache...
Loaded from cache: 995
Processed Docs: 995


In [10]:
import os
import json

def convert_segments_to_yolo(docs, out_dir="../data/yolo_segments"):
    os.makedirs(f"{out_dir}/images", exist_ok=True)
    os.makedirs(f"{out_dir}/labels", exist_ok=True)

    for idx, doc in enumerate(docs):
        img_path = doc["image_path"]
        segments = doc["segments"]
        H, W = doc["dimensions"]

        # Copy image
        img_out = f"{out_dir}/images/{idx}.jpg"
        import shutil
        shutil.copy2(img_path, img_out)

        # Write YOLO label file (all segments = class 0)
        label_out = f"{out_dir}/labels/{idx}.txt"
        with open(label_out, "w") as f:
            for seg in segments:
                box = seg["box"]
                x_min = box["x_min"]
                y_min = box["y_min"]
                x_max = box["x_max"]
                y_max = box["y_max"]

                # Normalize YOLO format
                x_center = (x_min + x_max) / 2 / W
                y_center = (y_min + y_max) / 2 / H
                w = (x_max - x_min) / W
                h = (y_max - y_min) / H

                # YOLO only needs class_id=0 for all
                f.write(f"0 {x_center} {y_center} {w} {h}\n")

    print("YOLO segment-only dataset created!", out_dir)

convert_segments_to_yolo(docs)

YOLO segment-only dataset created! ../data/yolo_segments


In [None]:
# train_yolo.py
from doclayout_yolo import YOLOv10

def train_yolo():
    model = YOLOv10("../models/yolo.pt")   # lightweight and fast

    model.train(
        data="../data/yolo_segments.yaml",
        epochs=50,
        imgsz=1024,
        batch=8,
        device="cuda",
        lr0=0.001,
        optimizer="AdamW",
        patience=20,
    )

    model.save("../models/yolo_segments_trained.pt")
    print("Training complete! Saved model as yolo_segments_trained.pt")

train_yolo()

Ultralytics YOLOv0.0.4 üöÄ Python-3.13.5 torch-2.9.1+cu128 CUDA:0 (NVIDIA A100-SXM4-80GB, 81038MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=../models/yolo.pt, data=../data/yolo_segments.yaml, epochs=50, time=None, patience=20, batch=8, imgsz=1024, save=True, save_period=10, val_period=1, cache=False, device=cuda, workers=8, project=None, name=train, exist_ok=True, pretrained=True, optimizer=AdamW, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop

[34m[1mAMP: [0mrunning Automatic Mixed Precision (AMP) checks with YOLOv8n...
Downloading https://github.com/doclayout_yolo/assets/releases/download/v8.1.0/yolov8n.pt to 'yolov8n.pt'...
[34m[1mAMP: [0mchecks skipped ‚ö†Ô∏è, offline and unable to download YOLOv8n. Setting 'amp=True'. If you experience zero-mAP or NaN losses you can disable AMP with amp=False.


  self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp)
[34m[1mtrain: [0mScanning /home/compiling-ganesh/24m0797/workspace/doctr-dit/data/yolo_segments/labels... 995 images, 0 backgrounds, 1 corrupt: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 995/995 [00:00<00:00, 1043.17it/s]

 1.1487547 1.1511207 1.132253  1.1215011 1.1261437 1.1661302 1.114101
 1.1600661 1.1063203 1.1306188]





[34m[1mtrain: [0mNew cache created: /home/compiling-ganesh/24m0797/workspace/doctr-dit/data/yolo_segments/labels.cache
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, method='weighted_average', num_output_channels=3), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


  data = fetch_version_info()
  A.ImageCompression(quality_lower=75, p=0.0),
  self._set_keys()
[34m[1mval: [0mScanning /home/compiling-ganesh/24m0797/workspace/doctr-dit/data/yolo_segments/labels.cache... 995 images, 0 backgrounds, 1 corrupt: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 995/995 [00:00<?, ?it/s]

 1.1487547 1.1511207 1.132253  1.1215011 1.1261437 1.1661302 1.114101
 1.1600661 1.1063203 1.1306188]





Plotting labels to /home/compiling-ganesh/24m0797/workspace/doctr-dit/runs/detect/train/labels.jpg... 
[34m[1moptimizer:[0m AdamW(lr=0.001, momentum=0.937) with parameter groups 171 weight(decay=0.0), 183 weight(decay=0.0005), 183 bias(decay=0.0)
Image sizes 1024 train, 1024 val
Using 8 dataloader workers
Logging results to [1m/home/compiling-ganesh/24m0797/workspace/doctr-dit/runs/detect/train[0m
Starting training for 50 epochs...

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


       1/50      17.8G      3.036      1.974          0       2.98      3.605          0        146       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [01:23<00:00,  1.50it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:15<00:00,  4.12it/s]


994
                   all        994      19299        0.5      0.557      0.463      0.202

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


       2/50      18.1G      1.766      1.002          0      1.965      1.289          0         46       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:40<00:00,  3.06it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.39it/s]


994
                   all        994      19299      0.699      0.668       0.68      0.347

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


       3/50      18.1G      1.434     0.8509          0      1.642       1.08          0        122       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:40<00:00,  3.12it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.44it/s]


994
                   all        994      19299      0.783      0.738      0.789      0.475

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


       4/50      18.6G      1.277     0.7711          0      1.497     0.9377          0         23       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:39<00:00,  3.14it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.44it/s]


994
                   all        994      19299      0.803      0.764      0.822      0.505

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


       5/50      18.2G      1.179     0.7273          0      1.422     0.8701          0        113       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:39<00:00,  3.13it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.41it/s]


994
                   all        994      19299      0.825      0.785      0.842      0.541

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


       6/50      18.7G      1.107      0.676          0      1.341     0.8041          0        131       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:39<00:00,  3.14it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.48it/s]


994
                   all        994      19299      0.848      0.798      0.862      0.577

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


       7/50      17.9G      1.069     0.6567          0      1.298     0.7676          0         70       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:39<00:00,  3.14it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.42it/s]


994
                   all        994      19299      0.858      0.823      0.873       0.59

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


       8/50      17.6G      1.041      0.641          0      1.281     0.7533          0         29       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:39<00:00,  3.16it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.47it/s]


994
                   all        994      19299      0.864      0.835      0.883      0.603

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


       9/50      18.3G      1.011     0.6115          0       1.25     0.7184          0         62       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:39<00:00,  3.15it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.39it/s]


994
                   all        994      19299      0.864      0.834      0.889      0.605

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


      10/50      18.3G      1.011     0.6153          0      1.237     0.7253          0         54       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:39<00:00,  3.15it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.42it/s]


994
                   all        994      19299      0.866      0.847      0.891      0.641

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


      11/50      18.6G     0.9966     0.6036          0      1.223      0.676          0         31       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:40<00:00,  3.12it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.50it/s]


994
                   all        994      19299      0.878      0.856      0.903      0.639

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


      12/50      17.8G      0.984     0.5998          0      1.192     0.6723          0         65       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:40<00:00,  3.08it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.36it/s]


994
                   all        994      19299      0.882      0.862      0.906      0.644

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


      13/50      17.5G     0.9618     0.5842          0      1.174     0.6529          0        146       1024: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 125/125 [00:39<00:00,  3.13it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 63/63 [00:14<00:00,  4.44it/s]


994
                   all        994      19299      0.892      0.864      0.913      0.649

      Epoch    GPU_mem     box_om     cls_om     dfl_om     box_oo     cls_oo     dfl_oo  Instances       Size


      14/50        18G     0.9224     0.5625          0      1.129     0.6167          0        301       1024:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 120/125 [00:38<00:01,  3.06it/s]