context:
- 已经下载了很多图
- 需要tar成dataset

In [27]:
import os
import tarfile
from pathlib import Path
from typing import List
import unibox as ub

import os
import tarfile
from pathlib import Path
from typing import List

def generate_tar_from_images(image_files: List[str], output_dir: str, id: int, modulo: int = 10000) -> List[str]:
    """
    Create a single tar file holding the images whose post ID is in one range.

    The range covered is ``[id * modulo, (id + 1) * modulo)``. The post ID of
    each file is parsed from its base name, which must look like
    ``<prefix>_<post_id>[.<ext>]`` (e.g. "sankaku_16465.png"). Files whose
    name does not match are reported on stdout and skipped.

    The archive is first written to ``<output_dir>/<id>.tar.tmp`` and only
    moved to ``<output_dir>/<id>.tar`` once every file has been added, so a
    failure midway never leaves a truncated archive under the final name
    (and never destroys a previously generated one).

    Args:
        image_files (List[str]): List of image file paths.
        output_dir (str): Directory to store the tar file (created if missing).
        id (int): Index of the ID range to archive.
        modulo (int): Constant range of IDs per tar file (default: 10,000).

    Returns:
        List[str]: A one-element list with the created tar path, or an empty
        list when no file falls in the requested range.

    Raises:
        OSError: If a selected file cannot be read or the archive cannot be
            written; no ``<id>.tar`` is created or modified in that case.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Half-open ID interval handled by this tar
    range_start = id * modulo
    range_end = (id + 1) * modulo

    # Keep only the files whose parsed post ID falls inside the interval
    files_in_range = []
    for image_path in image_files:
        filename = os.path.basename(image_path)
        # Extract ID from filename (e.g., "sankaku_16465")
        try:
            post_id = int(filename.split('_')[1].split('.')[0])
        except (IndexError, ValueError):
            print(f"Skipping file with invalid format: {filename}")
            continue

        if range_start <= post_id < range_end:
            files_in_range.append(image_path)

    if not files_in_range:
        print(f"No files found in range {range_start}-{range_end}.")
        return []

    tar_file_path = output_dir / f"{id}.tar"
    tmp_file_path = output_dir / f"{id}.tar.tmp"
    try:
        with tarfile.open(tmp_file_path, "w") as tar:
            for file in files_in_range:
                # Store members flat, under their base name only
                tar.add(file, arcname=os.path.basename(file))
        # Publish the archive only after it has been completely written
        os.replace(tmp_file_path, tar_file_path)
    finally:
        # Still present only if something failed before the rename
        if tmp_file_path.exists():
            tmp_file_path.unlink()

    print(f"Created tar: {tar_file_path} with {len(files_in_range)} files.")
    return [str(tar_file_path)]

def determine_tar_info(image_files: List[str], modulo: int = 10000):
    """
    Determine how many full tars can be created and the current last post number.

    Args:
        image_files (List[str]): List of image file paths.
        modulo (int): Constant range of IDs per tar file (default: 10,000).

    Returns:
        tuple: (number_of_full_tars, last_post_id)
    """
    post_ids = []
    for image_path in image_files:
        filename = os.path.basename(image_path)
        try:
            post_id = int(filename.split('_')[1].split('.')[0])
            post_ids.append(post_id)
        except (IndexError, ValueError):
            print(f"Skipping file with invalid format: {filename}")
            continue

    if not post_ids:
        return 0, None

    last_post_id = max(post_ids)
    number_of_full_tars = (last_post_id + 1) // modulo

    return number_of_full_tars, last_post_id

In [None]:
# Example usage: where the downloaded media lives and where the tars go
data_dir = "/rmt/yada/dev/sakuga-scraper/data"
output_dir = "/rmt/yada/dev/sakuga-scraper/tars"

# Collect all files under data_dir (reuses the variable instead of repeating
# the literal path; presumably ub.traverses lists files recursively - confirm)
all_files = ub.traverses(data_dir)

In [22]:
# Extension sets used for bucketing; '.gif' is counted as video, not image
video_extensions = {'.webm', '.mp4', '.gif'}
image_extensions = set(ub.IMG_FILES) - {'.gif'}

# Bucket every collected file by its suffix in a single pass
image_files, video_files, json_files = [], [], []
for path in all_files:
    ext = Path(path).suffix
    if ext in image_extensions:
        image_files.append(path)
    if ext in video_extensions:
        video_files.append(path)
    if ext == '.json':
        json_files.append(path)

# Sanity check: number of distinct categorized files vs. everything collected
total_unique_files = len(set(image_files) | set(video_files) | set(json_files))
print(len(all_files), total_unique_files)

67748 67748


In [30]:
# Show (number_of_full_tars, last_post_id) for the image files and for the video files
determine_tar_info(image_files), determine_tar_info(video_files)

((6, 65902), (6, 65911))

In [26]:
# Generate the tar for range index 0 from the image files and show the resulting paths
generated_tars = generate_tar_from_images(image_files, output_dir, id=0)
print("Generated tar files:", generated_tars)

Created tar: /rmt/yada/dev/sakuga-scraper/tars/0.tar with 593 files.
Generated tar files: ['/rmt/yada/dev/sakuga-scraper/tars/0.tar']


In [3]:
# Copy one sample video from the data directory into the current working directory
!cp '/rmt/yada/dev/sakuga-scraper/data/post_12273/sankaku_12273.webm' .