Processing part:

This step is run on another machine.

In [None]:
# Passing split=None to load_dataset avoids auto-generating a split — it returns a DatasetDict
import os
import json
import re
import random
from datasets import load_dataset, Dataset

# --- Configuration ---
# Load the source dataset only when it is not already present in the kernel.
# Previously the load call was commented out, so this cell raised NameError on
# a fresh kernel ("Restart & Run All"). The guard keeps re-runs of the cell
# from reloading the dataset.
# split=None makes load_dataset return a DatasetDict; its "train" split is used.
if "dataset" not in globals():
    dataset = load_dataset(
        "incantor/danbooru_mongodb-quail_dump-feb-cap-only",
        split=None,
        num_proc=16,
    )["train"]

sample_size = 500_000
seed = 42  # for reproducibility

# Every column whose name starts with "auto-caption__" is treated as one
# captioning model's output and is exported to its own JSONL file.
caption_columns = [col for col in dataset.column_names if col.startswith("auto-caption__")]
print(f"Caption columns: {caption_columns}")
output_directory = "qa_style_jsons"
prompt_source_name = "auto_caption_prompts"  # written to the "source" field of every record
os.makedirs(output_directory, exist_ok=True)

# --- Helper: Sanitize filename ---
def sanitize_filename(name: str) -> str:
    """Convert an arbitrary name into a string usable as a file name.

    Forward slashes become double underscores; each of the characters
    ``< > : " | ? * \\`` and the space becomes a hyphen; leading and trailing
    hyphens/underscores are then removed.

    Args:
        name: The raw name (e.g. a dataset column or model identifier).

    Returns:
        The cleaned name, or ``"invalid_name"`` when nothing is left after
        cleaning.
    """
    without_slashes = name.replace("/", "__")
    cleaned = re.sub(r'[<>:"|?*\\ ]', '-', without_slashes).strip('-_')
    if cleaned:
        return cleaned
    return "invalid_name"

# --- Sample up to 0.5M examples ---
# Cap the request at the dataset size: select(range(n)) raises when n exceeds
# the number of available rows, so a smaller dataset is sampled in full instead.
effective_sample_size = min(sample_size, len(dataset))
print(f"Sampling {effective_sample_size:,} examples...")
# Seeded shuffle followed by taking the first rows gives a reproducible random subset.
sampled_dataset = dataset.shuffle(seed=seed).select(range(effective_sample_size))

# --- Process each caption column ---
# One JSONL file is written per caption column; each line is a QA-style record
# pairing a fixed prompt (mentioning the example's filepath) with the caption.
for caption_column in caption_columns:
    print(f"Processing caption column: {caption_column}")
    sanitized_model_name = sanitize_filename(caption_column)
    output_filename = os.path.join(output_directory, f"generated_{sanitized_model_name}.jsonl")

    with open(output_filename, 'w', encoding='utf-8') as jsonl_file:
        # row_index counts every sampled row, including skipped ones, so a given
        # example gets the same "id" in every caption column's file.
        for row_index, example in enumerate(sampled_dataset):
            caption_value = example[caption_column]

            # Non-string values (e.g. a missing caption) are skipped.
            if not isinstance(caption_value, str):
                continue

            caption_text = caption_value.strip()
            if not caption_text:
                continue  # whitespace-only or empty caption

            serialized = json.dumps(
                {
                    "source": prompt_source_name,
                    "id": str(row_index),
                    "prompt": f"provide description for the image: {example['filepath']}",
                    "model": caption_column,
                    "output": caption_text,
                },
                ensure_ascii=False,
            )
            jsonl_file.write(serialized + '\n')

    print(f"  Wrote JSONL with sampled data to: {output_filename}")

print("\nDone creating QA-style JSONL files from sampled subset.")


In [None]:
# Upload the local qa_style_jsons directory (the generated JSONL files) to the shared S3 prefix.
!aws s3 sync qa_style_jsons s3://dataset-ingested/temp/captions-analyze-samples/

Since the whole file is too big to handle on this machine, it is processed on another machine, and only the analysis is done here:

In [2]:
# Download the sampled JSONL files from the shared S3 prefix into data/captions-analyze-samples.
!aws s3 sync s3://dataset-ingested/temp/captions-analyze-samples/ data/captions-analyze-samples

download: s3://dataset-ingested/temp/captions-analyze-samples/generated_auto-caption__captioner-1.4__200words.jsonl to data/captions-analyze-samples/generated_auto-caption__captioner-1.4__200words.jsonl
download: s3://dataset-ingested/temp/captions-analyze-samples/generated_auto-caption__captioner-1.45__200words.jsonl to data/captions-analyze-samples/generated_auto-caption__captioner-1.45__200words.jsonl
download: s3://dataset-ingested/temp/captions-analyze-samples/generated_auto-caption__captioner-1.45__full.jsonl to data/captions-analyze-samples/generated_auto-caption__captioner-1.45__full.jsonl


In [3]:
# Run slop_profile.py over every JSONL file in the downloaded samples directory.
# Per the captured log, it reads at most 10000 items per model and writes the
# combined metrics to results/slop_profile_results.json.
!python ../scripts/slop_profile.py --input-dir data/captions-analyze-samples

2025-04-14 14:36:44,140 - INFO - slop_profile - Starting analysis of datasets in: data/captions-analyze-samples
2025-04-14 14:36:44,140 - INFO - slop_profile - Analysis output directory: /home/ubuntu/dev/slop-forensics-a/results/analysis
2025-04-14 14:36:44,140 - INFO - slop_profile - Combined metrics output file: /home/ubuntu/dev/slop-forensics-a/results/slop_profile_results.json
2025-04-14 14:36:44,140 - INFO - slop_profile - Max items per model: 10000
2025-04-14 14:36:44,140 - INFO - slop_profile - Will log top 5 patterns per model
2025-04-14 14:36:44,140 - INFO - slop_profile - Found 3 dataset files to analyze.
Analyzing Models:   0%|                                   | 0/3 [00:00<?, ?it/s]2025-04-14 14:36:44,142 - INFO - slop_profile - Processing file: generated_auto-caption__captioner-1.4__200words.jsonl
2025-04-14 14:36:44,189 - INFO - utils - Reached max_items limit (10000) for data/captions-analyze-samples/generated_auto-caption__captioner-1.4__200words.jsonl.
2025-04-14 14:36

In [7]:
# Build the slop lists from the analysis files under results/analysis (see the
# captured log) and write them to data/captions-analyze-samples-outputs.
!python ../scripts/create_slop_lists.py --output-dir data/captions-analyze-samples-outputs

2025-04-14 14:42:36,838 - INFO - create_slop_lists - Starting slop list creation from analysis files in: /home/ubuntu/dev/slop-forensics-a/results/analysis
2025-04-14 14:42:36,838 - INFO - create_slop_lists - Output directory: data/captions-analyze-samples-outputs
2025-04-14 14:42:36,838 - INFO - slop_lists - Starting combined slop list generation...
2025-04-14 14:42:36,838 - INFO - slop_lists - Found 3 analysis files. Loading data...
Loading analysis files:   0%|                             | 0/3 [00:00<?, ?it/s]2025-04-14 14:42:36,964 - INFO - utils - Reached max_items limit (10000) for /home/ubuntu/dev/slop-forensics-a/results/datasets/generated_auto-caption__captioner-1.45__full.jsonl.
Loading analysis files:  33%|███████              | 1/3 [00:00<00:00,  7.78it/s]2025-04-14 14:42:37,016 - INFO - utils - Reached max_items limit (10000) for /home/ubuntu/dev/slop-forensics-a/results/datasets/generated_auto-caption__captioner-1.4__200words.jsonl.
2025-04-14 14:42:37,078 - INFO - utils

In [8]:
# Generate phylogenetic trees from results/slop_profile_results.json into
# data/captions-analyze-samples-outputs.
# NOTE(review): the captured log ends with an error about a missing PHYLIP
# 'pars' executable — the parsimony step presumably needs PHYLIP installed; confirm.
!python ../scripts/generate_phylo_trees.py --output-dir data/captions-analyze-samples-outputs

2025-04-14 14:44:38,224 - INFO - generate_phylo_trees - Starting phylogenetic tree generation using data from: /home/ubuntu/dev/slop-forensics-a/results/slop_profile_results.json
2025-04-14 14:44:38,224 - INFO - generate_phylo_trees - Output directory: data/captions-analyze-samples-outputs
2025-04-14 14:44:38,224 - INFO - generate_phylo_trees - Top N features per model: 1500
2025-04-14 14:44:38,224 - INFO - phylogeny - Starting phylogenetic tree generation...
2025-04-14 14:44:38,224 - INFO - phylogeny - Loading combined metrics data from: /home/ubuntu/dev/slop-forensics-a/results/slop_profile_results.json
2025-04-14 14:44:38,229 - INFO - phylogeny - Extracting features (top words/ngrams) for tree building...
2025-04-14 14:44:38,230 - INFO - phylogeny - Attempting parsimony tree construction using PHYLIP...
2025-04-14 14:44:38,231 - INFO - phylogeny - Parsimony analysis: 3 models, 2419 features.
2025-04-14 14:44:38,233 - ERROR - phylogeny - Could not find PHYLIP 'pars' executable in sys