In [None]:
from IPython.display import display

from core.utils.logging import setup_logging
setup_logging()
import structlog
logger = structlog.get_logger(__name__)

from dotenv import load_dotenv
load_dotenv()
logger.info("Loaded environment variables.")

In [None]:
%load_ext autoreload
%autoreload 2

import os
import json
from tqdm import tqdm

from core.utils.shared import CONFIGS_DIR
from core.config.config import ConfigManager

from core.models.llm.model import LLMModel
from core.models.lmv3.model import LMv3Model

from core.data.utils import get_dataset
from core.data.stats import evaluate_json_response
from core.data.filters import filter_schematisms

from core.data.schematism_parser import SchematismParser
from core.data.translation_parser import Parser


In [None]:
csv_path = "/Volumes/T7/AI_Osrodek/data/csv/dane_hasla_with_filename.csv"
schematism_name = "wloclawek_1872"

In [None]:
schematism_parser = SchematismParser(csv_path=csv_path, schematism_name=schematism_name)

In [None]:
scanned_schematism = schematism_parser.schematisms_path / schematism_parser.schematism_name
scanned_schematism

In [None]:
for file in os.listdir(scanned_schematism):
    if file.endswith(".jpg"):
        results = schematism_parser.get_page_info(file)
        print("_"*15)
        print(json.dumps(results, indent=4, ensure_ascii=False))

        # with Image.open(scanned_schematism / file) as img:
        #     image = img.copy()


In [None]:
images_path = "/Volumes/T7/Data/images"

In [None]:
from PIL import Image
from tqdm import tqdm

In [None]:
def extract_parts(name: str):
    parts = name.split("_")

    if len(parts) == 2:
        return parts[0], parts[1]

    # Detect schematism name based on known formats
    if parts[1].isdigit():  # e.g. wloclawek_1872
        schematism_name = "_".join(parts[:2])
        filename = "_".join(parts[2:])
    else:  # e.g. liber_crac_1529
        schematism_name = "_".join(parts[:3])
        filename = "_".join(parts[3:])

    return schematism_name, filename



In [None]:
# pairs = []
# schematism_parser = None
# count = 0
# for file in tqdm(os.listdir(images_path), desc="Processing images"):
#     if not file.endswith(".jpg"):
#         continue
#     with Image.open(os.path.join(images_path, file)) as img:
#         image = img.copy()
#     schematism_name, filename = extract_parts(file)
#     # print(schematism_name,filename)
#
#     if schematism_parser is None:
#         schematism_parser = SchematismParser(csv_path=csv_path, schematism_name=schematism_name)
#     elif schematism_parser.schematism_name != schematism_name:
#         schematism_parser = SchematismParser(csv_path=csv_path, schematism_name=schematism_name)
#
#     results = schematism_parser.get_page_info(filename)
#
#     pairs.append((image, results))
#     if len(results["entries"]) == 0:
#         count += 1
#         # display(image.resize((200,400)))

In [None]:
def image_generator():
    schematism_parser = None
    for file in tqdm(os.listdir(images_path), desc="Processing images"):
        if not file.endswith(".jpg"):
            continue

        # Load image lazily
        image_path = os.path.join(images_path, file)
        schematism_name, filename = extract_parts(file)

        if schematism_parser is None:
            schematism_parser = SchematismParser(csv_path=csv_path, schematism_name=schematism_name)
        elif schematism_parser.schematism_name != schematism_name:
            schematism_parser = SchematismParser(csv_path=csv_path, schematism_name=schematism_name)

        results = schematism_parser.get_page_info(filename)

        # Yield dict instead of storing in memory
        yield {
            "image": image_path,  # Store path, not the image object
            "results": results,
            "schematism_name": schematism_name,
            "filename": filename
        }

# Create dataset from generator
from datasets import Dataset
dataset = Dataset.from_generator(image_generator)

In [None]:
names = [
    "liber_crac_1529_001.jgp",
    "lodz_1872_0176_11.jgp",
    "wloclawek_1872_001.jpg",
    ]





In [None]:
dataset

In [46]:
import gc

def load_and_push_streaming(dataset, repo_name, batch_size=100):
    """
    Load images in batches and push incrementally to avoid memory issues
    """
    from datasets import Dataset, concatenate_datasets

    # Process and push in chunks
    total_samples = len(dataset)

    for i in range(0, total_samples, batch_size):
        print(f"Processing batch {i//batch_size + 1}/{(total_samples-1)//batch_size + 1}")

        # Get batch
        batch_dataset = dataset.select(range(i, min(i + batch_size, total_samples)))

        # Load images for this batch
        def load_batch_images(example):
            with Image.open(example["image"]) as img:
                image = img.copy()
            del example["image"]
            example["image"] = image
            return example

        batch_dataset = batch_dataset.map(load_batch_images)

        # Push this batch

        batch_dataset.push_to_hub(repo_name, split="train")

        # Clear memory
        del batch_dataset
        gc.collect()

# Use streaming approach
load_and_push_streaming(dataset, "artpods56/sample_repo", batch_size=50)

Processing batch 1/327


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/720 [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Processing batch 2/327


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing batch 3/327


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/720 [00:00<?, ?B/s]

Processing batch 4/327


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/720 [00:00<?, ?B/s]

Processing batch 5/327


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

KeyboardInterrupt: 