In [1]:
# Switch the kernel's working directory to the parent folder (the captured output below
# shows the resulting path). Presumably this is so `slop_forensics` is importable and the
# relative `./analysis_results/...` paths used later resolve from the project root -- TODO confirm.
# NOTE(review): this is not idempotent; re-running the cell moves up one more level.
%cd ..

/local/yada/dev/slop-forensics-a


In [None]:
# One-time setup, run from the project root: %pip install -r requirements.txt


# import nltk
# nltk.download('stopwords')
# nltk.download('cmudict')

## Sample Demo:

In [None]:
#!/usr/bin/env python3
"""
Example usage of the slop-forensics shortcuts module.
This demonstrates how to use the unified analyze_sentences function.
"""

import sys
import os

# Add project root to path
# project_root = os.path.dirname(os.path.abspath(__file__))
# sys.path.insert(0, project_root)

from slop_forensics.shortcuts import analyze_sentences, analyze_multiple_models

def main(sentences=None):
    """Run the two demo analyses and print a short report for each.

    Example 1 sends one list of sentences to ``analyze_sentences``; Example 2
    sends three small hand-written corpora to ``analyze_multiple_models``.

    Args:
        sentences: Optional list of strings to analyze in Example 1. When
            omitted (``None``), the built-in list of ten sample sentences is
            used. Previously the sample list was unconditionally overwritten
            with ``df["detailed"].tolist()``, which raised ``NameError`` unless
            a later cell had already created ``df``. To analyze real data,
            pass it explicitly, e.g. ``main(df["detailed"].tolist())``.

    Returns:
        None. Everything is reported via ``print``; any files are produced by
        the two analysis helpers (presumably under the ``output_dir`` values
        passed to them -- see their documentation).
    """
    single_output_dir = "./analysis_results/example_1"
    multi_output_dir = "./analysis_results/example_2"

    # Example 1: Analyze a single set of sentences
    print("=== Example 1: Single Model Analysis ===")

    if sentences is None:
        # Sample sentences that might contain "slop" (repetitive AI-generated text patterns)
        sentences = [
            "The sun was setting over the horizon, casting a warm glow across the landscape.",
            "In conclusion, it's important to note that this matter requires careful consideration.",
            "The atmosphere was filled with palpable tension as the characters navigated their journey.",
            "It's worth mentioning that the situation demanded immediate attention and careful analysis.",
            "The protagonist found themselves in a precarious situation that would test their resolve.",
            "Furthermore, the implications of this decision would reverberate throughout the narrative.",
            "The complex web of relationships added layers of depth to the unfolding drama.",
            "As the story progressed, it became increasingly clear that the stakes were higher than anticipated.",
            "The intricate plot threads began to weave together in unexpected ways.",
            "Ultimately, the resolution provided a satisfying conclusion to the elaborate tale.",
        ]

    # Analyze the sentences
    results = analyze_sentences(
        sentences=sentences,
        output_dir=single_output_dir,
        model_name="sample_text",
        generate_phylogeny=False,  # Skip phylogeny for single model
    )

    stats = results['statistics']
    print(f"Analysis complete! Results saved to: {single_output_dir}")
    print("Summary statistics:")
    print(f"  - Average length: {stats.get('avg_length', 0)}")
    print(f"  - Slop score: {stats.get('slop_score', 0)}")
    print(f"  - Repetitive words found: {stats.get('num_repetitive_words', 0)}")
    print(f"  - Output files: {len(results['output_paths'])} files generated")
    print()

    # Example 2: Analyze multiple models/sources
    print("=== Example 2: Multi-Model Analysis ===")

    # Sample data from different "models" or sources
    model_data = {
        "chatgpt_style": [
            "I'd be happy to help you with that! Here's what you need to know about this topic.",
            "It's important to note that there are several key considerations to keep in mind.",
            "In summary, the best approach would be to carefully evaluate your options.",
            "I hope this information helps! Let me know if you have any other questions.",
        ],
        "academic_style": [
            "This research demonstrates significant implications for future studies in the field.",
            "The methodology employed in this investigation follows established protocols.",
            "Furthermore, the results indicate a strong correlation between the variables.",
            "In conclusion, these findings contribute to our understanding of the phenomenon.",
        ],
        "creative_writing": [
            "The moonlight danced across the rippling water, creating patterns of silver and shadow.",
            "Her heart pounded with anticipation as she approached the mysterious door.",
            "The ancient forest whispered secrets that only the wind could understand.",
            "Time seemed to stand still in that magical moment of discovery.",
        ],
    }

    # Analyze multiple models
    multi_results = analyze_multiple_models(
        model_sentences=model_data,
        output_dir=multi_output_dir,
        generate_phylogeny=True,  # Generate phylogeny for multiple models
    )

    print("Multi-model analysis complete!")
    print(f"Models analyzed: {multi_results['models_analyzed']}")
    print(f"Total sentences: {multi_results['total_sentences']}")
    print(f"Results saved to: {multi_output_dir}")
    print()

    print("=== Analysis Complete ===")
    print("Check the output directories for detailed results:")
    print("  - analysis_summary.json: Human-readable summary")
    print("  - slop_lists/: Generated slop word lists")
    print("  - phylogeny/: Phylogenetic tree visualizations (if generated)")
    print("  - analysis/: Detailed analysis metrics")


def quick_analyze(sentences, output_dir="./quick_analysis"):
    """Analyze one list of sentences and print a compact summary.

    Args:
        sentences: Sized collection of strings. It is forwarded unchanged to
            ``analyze_sentences`` and its ``len`` is reported.
        output_dir: Passed through as ``output_dir`` to ``analyze_sentences``.

    Returns:
        Whatever ``analyze_sentences`` returned. This function only reads its
        ``"statistics"`` and ``"output_paths"`` entries.
    """
    results = analyze_sentences(
        sentences=sentences,
        output_dir=output_dir,
        model_name="quick_analysis",
        generate_phylogeny=False,
    )

    # Print quick summary (missing statistics fall back to 0).
    stats = results['statistics']
    print("Quick Analysis Results:")
    print(f"  Sentences analyzed: {len(sentences)}")
    print(f"  Average length: {stats.get('avg_length', 0):.1f} characters")
    print(f"  Slop score: {stats.get('slop_score', 0):.3f}")
    print(f"  Repetitive words: {stats.get('num_repetitive_words', 0)}")

    # Show top repetitive words if an analysis file was reported and exists.
    analysis_file = results["output_paths"].get("analysis")
    if analysis_file and os.path.exists(analysis_file):
        import json

        # Read explicitly as UTF-8 instead of the platform-default encoding, so
        # non-ASCII words decode the same on every OS.
        # NOTE(review): assumes the writer emits UTF-8 (or pure ASCII) JSON -- confirm.
        with open(analysis_file, 'r', encoding="utf-8") as f:
            analysis_data = json.load(f)

        top_words = analysis_data.get("top_repetitive_words", [])[:5]
        if top_words:
            print("  Top repetitive words:")
            for word_data in top_words:
                word = word_data.get("word", "")
                score = word_data.get("score", 0)
                print(f"    - '{word}' (score: {score:.2f})")

    return results


# Entry point: run the demo when this code is executed directly rather than imported.
if __name__ == "__main__":
    main()

## Running the analysis on actual data:

In [12]:
import unibox as ub


# Load the caption table from S3 into the notebook-level `df`. Per the preview output below,
# its columns include `detailed`, `short1`, `short1_1`, `short2` and `fallback`; the analysis
# cell that follows reads the first four.
# NOTE(review): a later cell rebinds `df` to a metrics summary table, so re-run this cell
# before re-running the analysis.
df = ub.loads("s3://quail-tmp/dive-vis-rubric-gemini-v7/image_list.parquet")
# Preview the first rows as the cell output.
df.head()

2025-07-22 02:34:44,901 - INFO - logger - Loading from s3://quail-tmp/dive-vis-rubric-gemini-v7/image_list.parquet
2025-07-22 02:34:44,909 - INFO - credentials - Found credentials in shared credentials file: ~/.aws/credentials


Unnamed: 0,s3key,image_id,index,detailed,short1,short1_1,short2,fallback
0,s3://bucket-public-access-uw2/labelling/dive-v...,danbooru-4178000,0,This is a full-body digital illustration of a ...,This is a full-body character illustration in ...,"A full-body character illustration, featuring ...",A cheerful anime girl with blue twin-drill hai...,NONE
1,s3://bucket-public-access-uw2/labelling/dive-v...,danbooru-6173968,1,"In the upper right corner of the image, there ...",This is a digital illustration in a Japanese a...,A digital illustration presenting a portrait o...,"In a suggestive pose, Seraphina from Disgaea 5...",NONE
2,s3://bucket-public-access-uw2/labelling/dive-v...,danbooru-1410765,2,"The image is a full-body, low-angle shot of a ...",This is a high-quality illustration in a Japan...,A high-quality illustration featuring the char...,"A blushing, green-haired Sanae Kochiya in a st...",NONE
3,s3://bucket-public-access-uw2/labelling/dive-v...,danbooru-3560258,3,This is a high-detail Japanese anime-style ill...,This image is a Japanese anime-style illustrat...,"An illustration of Hikari from Arcaea, present...",An ethereal illustration of Hikari from Arcaea...,NONE
4,s3://bucket-public-access-uw2/labelling/dive-v...,danbooru-2615484,4,"This is a high-quality, Japanese anime-style d...",This is a dramatic and emotional Japanese anim...,A dramatic and emotional illustration of the c...,A distressed anime rabbit-girl with purple hai...,NONE


In [None]:
#!/usr/bin/env python3
"""
Example usage of the slop-forensics shortcuts module.
This demonstrates how to use the unified analyze_sentences function.
"""

import sys
import os

# Add project root to path
# project_root = os.path.dirname(os.path.abspath(__file__))
# sys.path.insert(0, project_root)

from slop_forensics.shortcuts import analyze_sentences, analyze_multiple_models

def main():
    """Compare four caption columns of the notebook-level ``df`` as separate "models".

    Reads the ``detailed``, ``short1``, ``short1_1`` and ``short2`` columns of
    the global ``df`` (loaded in the preceding cell), hands them to
    ``analyze_multiple_models`` with phylogeny generation enabled, and prints
    a short report. Returns None.
    """
    print("=== Example 2: Multi-Model Analysis ===")

    # One "model" per caption column; insertion order follows this tuple.
    caption_columns = ("detailed", "short1", "short1_1", "short2")
    model_data = {column: df[column].tolist() for column in caption_columns}

    multi_results = analyze_multiple_models(
        model_sentences=model_data,
        output_dir="./analysis_results/example_2",
        generate_phylogeny=True,
    )

    print("Multi-model analysis complete!")
    print(f"Models analyzed: {multi_results['models_analyzed']}")
    print(f"Total sentences: {multi_results['total_sentences']}")
    print("Results saved to: ./analysis_results/example_2")
    print()

    # Static closing notes describing the output layout.
    closing_notes = (
        "=== Analysis Complete ===",
        "Check the output directories for detailed results:",
        "  - analysis_summary.json: Human-readable summary",
        "  - slop_lists/: Generated slop word lists",
        "  - phylogeny/: Phylogenetic tree visualizations (if generated)",
        "  - analysis/: Detailed analysis metrics",
    )
    for note in closing_notes:
        print(note)


def quick_analyze(sentences, output_dir="./quick_analysis"):
    """Run ``analyze_sentences`` on one list of sentences and print a brief report.

    NOTE(review): an identically named function is also defined in the
    sample-demo cell earlier in this notebook; whichever cell ran last wins.

    Args:
        sentences: Sized collection of strings, forwarded as-is to
            ``analyze_sentences``; its ``len`` is printed.
        output_dir: Forwarded as ``output_dir`` to ``analyze_sentences``.

    Returns:
        The object returned by ``analyze_sentences``. Only its
        ``"statistics"`` and ``"output_paths"`` entries are read here.
    """
    results = analyze_sentences(
        sentences=sentences,
        output_dir=output_dir,
        model_name="quick_analysis",
        generate_phylogeny=False,
    )

    # Headline numbers; a missing statistic is reported as 0.
    statistics = results['statistics']
    print("Quick Analysis Results:")
    print(f"  Sentences analyzed: {len(sentences)}")
    print(f"  Average length: {statistics.get('avg_length', 0):.1f} characters")
    print(f"  Slop score: {statistics.get('slop_score', 0):.3f}")
    print(f"  Repetitive words: {statistics.get('num_repetitive_words', 0)}")

    # The word list is optional: skip it when no analysis file was reported or it is absent.
    analysis_file = results["output_paths"].get("analysis")
    if not (analysis_file and os.path.exists(analysis_file)):
        return results

    import json

    # Decode explicitly as UTF-8 rather than with the platform-default encoding.
    # NOTE(review): assumes the writer emits UTF-8 (or pure ASCII) JSON -- confirm.
    with open(analysis_file, 'r', encoding="utf-8") as f:
        analysis_data = json.load(f)

    top_words = analysis_data.get("top_repetitive_words", [])[:5]
    if top_words:
        print("  Top repetitive words:")
        for word_data in top_words:
            word = word_data.get("word", "")
            score = word_data.get("score", 0)
            print(f"    - '{word}' (score: {score:.2f})")

    return results


# Entry point: run the multi-model analysis defined in this cell when executed directly.
# `main()` reads the notebook-level `df`, so the data-loading cell above must have run first.
if __name__ == "__main__":
    main()

In [None]:

# `pd` was used below without being imported in any visible cell, so this cell
# failed on a fresh kernel; import it here, next to its only use.
import pandas as pd

# Raw data provided: per-model summary metrics, keyed by model (caption column) name.
data = {
    "short2": {
        "avg_length": 119.06,
        "vocab_complexity": 64.2949,
        "slop_score": 9.5271,
        "total_unique_words_after_filters": 45,
        "avg_corpus_rarity": 1.7422,
        "avg_wordfreq_rarity": 4.2106,
        "rarity_correlation": 0.213,
        "repetition_score": 100.0
    },
    "short1_1": {
        "avg_length": 1024.01,
        "vocab_complexity": 60.5155,
        "slop_score": 13.4961,
        "total_unique_words_after_filters": 494,
        "avg_corpus_rarity": 2.8872,
        "avg_wordfreq_rarity": 4.5503,
        "rarity_correlation": 0.2935,
        "repetition_score": 16.0574
    },
    "short1": {
        "avg_length": 1101.9,
        "vocab_complexity": 64.4827,
        "slop_score": 13.1828,
        "total_unique_words_after_filters": 524,
        "avg_corpus_rarity": 2.9259,
        "avg_wordfreq_rarity": 4.5546,
        "rarity_correlation": 0.2708,
        "repetition_score": 15.4452
    },
    "detailed": {
        "avg_length": 4332.79,
        "vocab_complexity": 56.1503,
        "slop_score": 11.9596,
        "total_unique_words_after_filters": 1606,
        "avg_corpus_rarity": 3.5071,
        "avg_wordfreq_rarity": 4.8591,
        "rarity_correlation": 0.3325,
        "repetition_score": 4.9497
    }
}

# Convert to DataFrame: one row per model, metrics as columns, and the model
# name moved out of the index into a `model_name` column.
# NOTE(review): this rebinds `df`, which the earlier cells use for the caption
# table loaded from S3; re-run the loading cell before re-running the analysis.
df = (
    pd.DataFrame(data)
    .T
    .reset_index()
    .rename(columns={"index": "model_name"})
)

In [None]:
# NOTE(review): `combined_df` is not defined in any cell visible in this notebook, and the
# captured output below (bbox / score / dataset fields) does not match the caption data
# analyzed above. This looks like a leftover from another notebook -- confirm, then either
# remove the cell or add the missing load step.
# Take the first row as a plain dict for inspection.
sample_dict = combined_df.iloc[0].to_dict()
sample_dict

{'bbox': array([  4, 330, 319, 669]),
 'bbox_score': 0.8553559184074402,
 'height': 1024.0,
 'width': 768.0,
 'crop_id': '3y1ge2srvq9',
 'source_id': '4yokxqvwmrg',
 'label': 'GOOD',
 'score': array([9.99912024e-01, 8.79269865e-05]),
 'dataset_from': 'incantor/hands-scorer-pixai-v12',
 'image_path_csway': '/data/zhizhuo/datasets/pixai_50k_apr25/adea1901-a2f8-4be5-b93d-404b10df49ed.webp'}

In [None]:
# NOTE(review): exact duplicate of the preceding cell (same code, same captured output), and
# `combined_df` is not defined in any cell visible in this notebook -- confirm and remove.
# Take the first row as a plain dict for inspection.
sample_dict = combined_df.iloc[0].to_dict()
sample_dict

{'bbox': array([  4, 330, 319, 669]),
 'bbox_score': 0.8553559184074402,
 'height': 1024.0,
 'width': 768.0,
 'crop_id': '3y1ge2srvq9',
 'source_id': '4yokxqvwmrg',
 'label': 'GOOD',
 'score': array([9.99912024e-01, 8.79269865e-05]),
 'dataset_from': 'incantor/hands-scorer-pixai-v12',
 'image_path_csway': '/data/zhizhuo/datasets/pixai_50k_apr25/adea1901-a2f8-4be5-b93d-404b10df49ed.webp'}