## Setup

In [2]:
# pip install -r requirements.txt

In [3]:
# Run this in a Python interpreter or script
import nltk
# NLTK resources this notebook fetches up front.
nltk_resources = ('punkt', 'punkt_tab', 'stopwords', 'cmudict')

# nltk.download() returns False when a resource could not be fetched; with
# quiet=True that failure is otherwise invisible, so collect and report it
# instead of unconditionally claiming success.
failed_resources = [name for name in nltk_resources if not nltk.download(name, quiet=True)]

if failed_resources:
    print(f"WARNING: could not download NLTK data: {', '.join(failed_resources)}")
else:
    print("NLTK data downloaded.")

NLTK data downloaded.


In [4]:
!cp .env.example .env
# You can leave the default values or add your PHYLIP_PATH if needed later

cp: cannot stat '.env.example': No such file or directory


## Prepare Data

- **Goal:** Convert your DataFrame into .jsonl files, one file per language model. Each line in the file should be a JSON object representing one input-output pair.

- **Format:** The analysis script (slop_profile.py) expects each JSON object to have at least an "output" key. For best results and compatibility with all features (like multi-prompt n-gram analysis), include:

    - "output": (Required) The raw text generated by the LLM.
    - "model": (Highly Recommended) The identifier for the model (e.g., "meta-llama/Llama-3-8b-chat-hf"). If this is missing, the script *must* infer it from the filename.
    - "prompt": (Recommended) The original input prompt.
    - "source": (Recommended) A category or source name for the prompt (e.g., "my_dataset").
    - "id": (Recommended) A unique identifier for the prompt within its source (e.g., row index or a specific ID).

- **Filename Convention:** Name your files like `generated_{provider}__{model_name}.jsonl`. Use the `utils.sanitize_filename` logic (replace `/` with `__`, remove invalid chars). Example: `generated_meta-llama__Llama-3-8b-chat-hf.jsonl`. This is crucial if the "model" key isn't in your JSON objects.

- **Create an Input Directory:** Make a directory to hold these files, for example: `mkdir my_analysis_input`

In [1]:
import unibox as ub


# Load the caption dump through unibox (the hf:// scheme presumably points at
# the Hugging Face Hub — confirm). `dset` is inspected by the cells below.
dataset_uri = "hf://incantor/danbooru_mongodb-quail_dump-feb-cap-only"
dset = ub.loads(dataset_uri)

Using global temporary directory: /tmp/unibox_temp
[37m2025-04-14 14:12:07 [INFO] loads: Loading from hf://incantor/danbooru_mongodb-quail_dump-feb-cap-only[0m


Resolving data files:   0%|          | 0/75 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/75 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/75 [00:00<?, ?it/s]

In [None]:
# Columns of interest: the file path plus the three auto-caption variants.
# NOTE(review): despite the name, these are column (feature) names, not rows.
keep_rows = ["filepath"]
keep_rows += [
    "auto-caption__captioner-1.45__full",
    "auto-caption__captioner-1.45__200words",
    "auto-caption__captioner-1.4__200words",
]

In [None]:
"""
Dataset({
    features: ['filepath', 'source', 'auto-caption__captioner-1.45__full', 'auto-caption__captioner-1.45__200words', 'auto-caption__captioner-1.4__200words'],
    num_rows: 7865207
})
"""

dset["auto-caption__captioner-1.45__full"][0]


In [8]:
import pandas as pd
import json
import os
import sys
import re

# --- Configuration ---
# Assume df is your loaded DataFrame with columns 'input_prompt', 'llm_output', 'model_name'
# Example DataFrame creation (replace with your actual data loading):
data = {
    'input_prompt': ["Write a story about a cat.", "Describe a sunset.", "Write a story about a dog.", "Explain gravity."],
    'llm_output': ["The cat sat lazily. It dreamed of chasing mice.", "The sun dipped low, painting the sky orange and purple.", "Buddy barked happily, wagging his tail.", "Gravity is the force pulling objects together."],
    'model_name': ["model_A", "model_A", "model_B", "model_B"] # Example model names
}
df = pd.DataFrame(data)

output_directory = "my_analysis_input"
prompt_source_name = "my_custom_prompts" # Name your prompt source
# --- End Configuration ---

# Make sure the slop_forensics library can be imported for sanitize_filename.
# The package is looked up one directory above the notebook's working directory.
script_dir = "../"
# Use an absolute path so the entry stays valid if the working directory changes
# later, and guard the insert so re-running this cell does not pile up duplicate
# sys.path entries.
project_root = os.path.abspath(os.path.dirname(script_dir))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

try:
    from slop_forensics.utils import sanitize_filename
except ImportError:
    print("Error: Could not import sanitize_filename. Make sure you are running this script")
    print("from a location where the 'slop_forensics' package is accessible (e.g., repo root),")
    print(f"or add the project root ('{project_root}') to your PYTHONPATH.")

    # Basic fallback sanitizer if import fails
    def sanitize_filename(name: str) -> str:
        """Return a filename-friendly version of ``name``.

        '/' becomes '__'; the characters < > : " | ? *, backslash and space
        become '-'; leading/trailing '-' and '_' are stripped. Returns
        "invalid_name" when nothing is left.
        """
        sanitized = name.replace("/", "__")
        sanitized = re.sub(r'[<>:"|?*\\ ]', '-', sanitized)
        sanitized = sanitized.strip('-_')
        return sanitized if sanitized else "invalid_name"


def build_jsonl_records(group, model_name, source_name):
    """Turn one model's rows into the list of dicts written to its .jsonl file.

    Args:
        group: DataFrame holding the rows of a single model. 'llm_output' and
            'input_prompt' are read with ``row.get`` so a missing column
            falls back to ''.
        model_name: Value stored under the "model" key of every record.
        source_name: Value stored under the "source" key of every record.

    Returns:
        A list of dicts with the keys source, id, prompt, model and output.
        Rows whose output is not a non-blank string are skipped and reported
        on stdout. The "id" is the stringified DataFrame index of the row.
    """
    records = []
    for index, row in group.iterrows():
        output = row.get('llm_output', '')
        # Ensure output is a non-empty string
        if not isinstance(output, str) or not output.strip():
            print(f"  Skipping row {index} for model {model_name} due to empty or invalid output.")
            continue

        prompt = row.get('input_prompt', '')
        # A missing prompt arrives as None / NaN / pd.NA. NaN in particular would be
        # serialized as the bare token NaN, which is not valid JSON, so blank it out.
        if prompt is None or prompt is pd.NA or (isinstance(prompt, float) and prompt != prompt):
            prompt = ''

        records.append({
            "source": source_name,
            "id": str(index), # Use DataFrame index as unique ID within the source
            "prompt": prompt,
            "model": model_name,
            "output": output,
        })
    return records


def serialize_records(records):
    """Return one JSON document (no trailing newline) per record.

    Raises:
        TypeError: if a record holds a value json cannot serialize.
        ValueError: if a record holds NaN/Infinity (``allow_nan=False``), so a
            line that strict JSON parsers reject is never produced.
    """
    return [json.dumps(record, ensure_ascii=False, allow_nan=False) for record in records]


os.makedirs(output_directory, exist_ok=True)

# Group data by model
grouped = df.groupby('model_name')

for model_name, group in grouped:
    # Group keys are not guaranteed to be str (e.g. numeric model ids); both the
    # sanitizer and the JSON encoder need a plain string.
    model_label = str(model_name)
    print(f"Processing model: {model_label}")
    sanitized_model_name = sanitize_filename(model_label)
    output_filename = os.path.join(output_directory, f"generated_{sanitized_model_name}.jsonl")

    records = build_jsonl_records(group, model_label, prompt_source_name)
    if not records:
        print(f"  No valid records found for model {model_label}. Skipping file creation.")
        continue

    # Serialize everything before touching the file so a bad record cannot leave
    # a truncated .jsonl behind for the analysis script to pick up.
    try:
        json_lines = serialize_records(records)
    except (TypeError, ValueError) as e:
        print(f"  Error serializing record to JSON for {output_filename}: {e}")
        continue

    # Write to JSONL file
    try:
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in json_lines)
        print(f"  Successfully wrote {len(records)} records to {output_filename}")
    except OSError as e:
        print(f"  Error writing file {output_filename}: {e}")

print("\nData preparation finished.")

Processing model: model_A
  Successfully wrote 2 records to my_analysis_input/generated_model_A.jsonl
Processing model: model_B
  Successfully wrote 2 records to my_analysis_input/generated_model_B.jsonl

Data preparation finished.


In [12]:
# Run the profiling script on the .jsonl files in my_analysis_input (the directory
# the data-preparation cell writes to). Per the recorded log, results go to the
# repo's results/analysis directory and results/slop_profile_results.json.
!python ../scripts/slop_profile.py --input-dir my_analysis_input

2025-04-14 13:53:58,155 - INFO - slop_profile - Starting analysis of datasets in: my_analysis_input
2025-04-14 13:53:58,155 - INFO - slop_profile - Analysis output directory: /home/ubuntu/dev/slop-forensics-a/results/analysis
2025-04-14 13:53:58,155 - INFO - slop_profile - Combined metrics output file: /home/ubuntu/dev/slop-forensics-a/results/slop_profile_results.json
2025-04-14 13:53:58,155 - INFO - slop_profile - Max items per model: 10000
2025-04-14 13:53:58,155 - INFO - slop_profile - Will log top 5 patterns per model
2025-04-14 13:53:58,155 - INFO - slop_profile - Found 2 dataset files to analyze.
Analyzing Models:   0%|                                   | 0/2 [00:00<?, ?it/s]2025-04-14 13:53:58,156 - INFO - slop_profile - Processing file: generated_model_A.jsonl
2025-04-14 13:53:58,156 - INFO - slop_profile - Analyzing model: model_A (2 items)
2025-04-14 13:53:58,156 - INFO - analysis - Starting analysis for model: model_A
2025-04-14 13:53:59,312 - INFO - analysis - Analysis com

In [None]:
# Run the slop-list generation script with its default arguments.
# NOTE(review): presumably consumes the results written by slop_profile.py — confirm.
!python ../scripts/create_slop_lists.py