In [1]:
%cd ..

/local/yada/dev/slop-forensics-a


In [None]:
# %pip install -r requirements.txt  # run after the `%cd ..` cell so the path resolves from the project root


# import nltk
# nltk.download('stopwords')
# nltk.download('cmudict')

In [8]:
#!/usr/bin/env python3
"""
Example usage of the slop-forensics shortcuts module.
This demonstrates how to use analyze_sentences (one set of sentences) and
analyze_multiple_models (several named sets compared together).
"""

import sys
import os

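# Add project root to path (disabled: __file__ is not defined in a notebook
# kernel; the `%cd ..` cell at the top moves into the project root instead)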
# project_root = os.path.dirname(os.path.abspath(__file__))
# sys.path.insert(0, project_root)

from slop_forensics.shortcuts import analyze_sentences, analyze_multiple_models

def _run_single_model_example(output_dir="./analysis_results/example_1"):
    """Run Example 1: analyze one list of sentences and print a summary.

    Args:
        output_dir: Directory handed to ``analyze_sentences`` and echoed in the
            "saved to" message, so the two can never disagree.

    Returns:
        Whatever ``analyze_sentences`` returned. This function reads its
        ``'statistics'`` and ``'output_paths'`` entries.
    """
    print("=== Example 1: Single Model Analysis ===")

    # Sample sentences that might contain "slop" (repetitive AI-generated text patterns)
    sample_sentences = [
        "The sun was setting over the horizon, casting a warm glow across the landscape.",
        "In conclusion, it's important to note that this matter requires careful consideration.",
        "The atmosphere was filled with palpable tension as the characters navigated their journey.",
        "It's worth mentioning that the situation demanded immediate attention and careful analysis.",
        "The protagonist found themselves in a precarious situation that would test their resolve.",
        "Furthermore, the implications of this decision would reverberate throughout the narrative.",
        "The complex web of relationships added layers of depth to the unfolding drama.",
        "As the story progressed, it became increasingly clear that the stakes were higher than anticipated.",
        "The intricate plot threads began to weave together in unexpected ways.",
        "Ultimately, the resolution provided a satisfying conclusion to the elaborate tale.",
    ]

    results = analyze_sentences(
        sentences=sample_sentences,
        output_dir=output_dir,
        model_name="sample_text",
        generate_phylogeny=False,  # Skip phylogeny for single model
    )

    print(f"Analysis complete! Results saved to: {output_dir}")
    print("Summary statistics:")
    stats = results['statistics']
    print(f"  - Average length: {stats.get('avg_length', 0)}")
    print(f"  - Slop score: {stats.get('slop_score', 0)}")
    print(f"  - Repetitive words found: {stats.get('num_repetitive_words', 0)}")
    print(f"  - Output files: {len(results['output_paths'])} files generated")
    print()
    return results


def _run_multi_model_example(output_dir="./analysis_results/example_2"):
    """Run Example 2: analyze several named sentence sets and print a summary.

    Args:
        output_dir: Directory handed to ``analyze_multiple_models`` and echoed
            in the "saved to" message.

    Returns:
        Whatever ``analyze_multiple_models`` returned. This function reads its
        ``'models_analyzed'`` and ``'total_sentences'`` entries.
    """
    print("=== Example 2: Multi-Model Analysis ===")

    # Sample data from different "models" or sources
    model_data = {
        "chatgpt_style": [
            "I'd be happy to help you with that! Here's what you need to know about this topic.",
            "It's important to note that there are several key considerations to keep in mind.",
            "In summary, the best approach would be to carefully evaluate your options.",
            "I hope this information helps! Let me know if you have any other questions.",
        ],
        "academic_style": [
            "This research demonstrates significant implications for future studies in the field.",
            "The methodology employed in this investigation follows established protocols.",
            "Furthermore, the results indicate a strong correlation between the variables.",
            "In conclusion, these findings contribute to our understanding of the phenomenon.",
        ],
        "creative_writing": [
            "The moonlight danced across the rippling water, creating patterns of silver and shadow.",
            "Her heart pounded with anticipation as she approached the mysterious door.",
            "The ancient forest whispered secrets that only the wind could understand.",
            "Time seemed to stand still in that magical moment of discovery.",
        ],
    }

    multi_results = analyze_multiple_models(
        model_sentences=model_data,
        output_dir=output_dir,
        generate_phylogeny=True,  # Generate phylogeny for multiple models
    )

    print("Multi-model analysis complete!")
    print(f"Models analyzed: {multi_results['models_analyzed']}")
    print(f"Total sentences: {multi_results['total_sentences']}")
    print(f"Results saved to: {output_dir}")
    print()
    return multi_results


def main():
    """Run both demo analyses, then list the artifacts to look for on disk.

    Example 1 analyzes a single list of sentences; Example 2 compares three
    named sentence sets. Results are written under ``./analysis_results/``
    relative to the current working directory. Returns ``None``.
    """
    _run_single_model_example()
    _run_multi_model_example()

    print("=== Analysis Complete ===")
    print("Check the output directories for detailed results:")
    print("  - analysis_summary.json: Human-readable summary")
    print("  - slop_lists/: Generated slop word lists")
    print("  - phylogeny/: Phylogenetic tree visualizations (if generated)")
    print("  - analysis/: Detailed analysis metrics")


def _print_top_repetitive_words(analysis_file, limit=5):
    """Best-effort display of the first ``limit`` entries of the analysis file.

    Reads ``analysis_file`` as UTF-8 JSON and prints the ``"word"`` and
    ``"score"`` of each entry under ``"top_repetitive_words"``. This display is
    optional: an unreadable or unexpectedly shaped file is reported or skipped
    rather than raised, so the caller still gets its results back.

    NOTE(review): the key names are taken from this notebook, not from a
    documented schema; confirm against what slop_forensics actually writes.
    """
    import json  # kept local so the cell's import block does not change

    try:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(analysis_file, "r", encoding="utf-8") as f:
            analysis_data = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        print(f"  (could not read analysis file {analysis_file}: {exc})")
        return

    if not isinstance(analysis_data, dict):
        return
    top_words = analysis_data.get("top_repetitive_words") or []
    if not isinstance(top_words, list):
        return

    entries = [entry for entry in top_words[:limit] if isinstance(entry, dict)]
    if not entries:
        return

    print("  Top repetitive words:")
    for entry in entries:
        word = entry.get("word", "")
        score = entry.get("score", 0)
        try:
            score_text = f"{score:.2f}"
        except (TypeError, ValueError):
            # Non-numeric score: show it verbatim instead of failing.
            score_text = str(score)
        print(f"    - '{word}' (score: {score_text})")


def quick_analyze(sentences, output_dir="./quick_analysis"):
    """Analyze ``sentences`` with phylogeny disabled and print a short summary.

    Args:
        sentences: Sized collection of sentences, forwarded unchanged to
            ``analyze_sentences``; ``len(sentences)`` is printed.
        output_dir: Directory forwarded to ``analyze_sentences``.

    Returns:
        The object returned by ``analyze_sentences``, unchanged. Its
        ``'statistics'`` and ``'output_paths'`` entries are read here.
    """
    results = analyze_sentences(
        sentences=sentences,
        output_dir=output_dir,
        model_name="quick_analysis",
        generate_phylogeny=False
    )

    # Print quick summary
    stats = results['statistics']
    print("Quick Analysis Results:")
    print(f"  Sentences analyzed: {len(sentences)}")
    print(f"  Average length: {stats.get('avg_length', 0):.1f} characters")
    print(f"  Slop score: {stats.get('slop_score', 0):.3f}")
    print(f"  Repetitive words: {stats.get('num_repetitive_words', 0)}")

    # Show top repetitive words if the analysis file was produced
    analysis_file = results["output_paths"].get("analysis")
    if analysis_file and os.path.exists(analysis_file):
        _print_top_repetitive_words(analysis_file)

    return results


# Run the demo when this code is executed as a script. A notebook kernel also
# takes this branch, because cells execute with __name__ set to "__main__".
if __name__ == "__main__":
    main()

2025-07-22 02:32:05,624 - INFO - shortcuts - Starting comprehensive slop analysis for 10 sentences
2025-07-22 02:32:05,625 - INFO - shortcuts - Creating dataset file...
2025-07-22 02:32:05,626 - INFO - shortcuts - Saved dataset to: ./analysis_results/example_1/datasets/generated_sample_text.jsonl
2025-07-22 02:32:05,626 - INFO - shortcuts - Performing text analysis...
2025-07-22 02:32:05,627 - INFO - analysis - Starting analysis for model: sample_text


=== Example 1: Single Model Analysis ===


2025-07-22 02:32:06,220 - INFO - metrics - Loaded 1000 word items from data/slop_list.json
2025-07-22 02:32:06,221 - INFO - metrics - Loaded 200 bigram items from data/slop_list_bigrams.json
2025-07-22 02:32:06,222 - INFO - metrics - Loaded 200 trigram items from data/slop_list_trigrams.json
2025-07-22 02:32:06,236 - INFO - analysis - Analysis complete for model: sample_text
2025-07-22 02:32:06,236 - INFO - shortcuts - Saved analysis results to: ./analysis_results/example_1/analysis/slop_profile__sample_text.json
2025-07-22 02:32:06,237 - INFO - shortcuts - Generating slop lists...
2025-07-22 02:32:06,237 - INFO - slop_lists - Starting combined slop list generation...
2025-07-22 02:32:06,237 - INFO - slop_lists - Found 1 analysis files. Loading data...
Loading analysis files: 100%|██████████| 1/1 [00:00<00:00, 4951.95it/s]
2025-07-22 02:32:06,239 - INFO - slop_lists - Processing combined text data from 1 models...
2025-07-22 02:32:06,240 - INFO - slop_lists - Counting combined words...

Analysis complete! Results saved to: ./analysis_results/example_1
Summary statistics:
  - Average length: 85.4
  - Slop score: 55.1181
  - Repetitive words found: 0
  - Output files: 6 files generated

=== Example 2: Multi-Model Analysis ===


Loading analysis files: 100%|██████████| 1/1 [00:00<00:00, 6374.32it/s]
2025-07-22 02:32:06,361 - INFO - slop_lists - Processing combined text data from 1 models...
2025-07-22 02:32:06,362 - INFO - slop_lists - Counting combined words...
Counting words: 100%|██████████| 4/4 [00:00<00:00, 37957.50it/s]
2025-07-22 02:32:06,363 - INFO - slop_lists - Filtering combined counts...
2025-07-22 02:32:06,364 - INFO - slop_lists - Analyzing combined word rarity...
2025-07-22 02:32:06,365 - INFO - slop_lists - Filtering common words (wordfreq > 1.2e-05)...
2025-07-22 02:32:06,365 - INFO - slop_lists - Finding over-represented and zero-frequency words...
2025-07-22 02:32:06,366 - INFO - slop_lists - Creating final word slop lists...
2025-07-22 02:32:06,366 - INFO - utils - Saved list with one item per line to: ./analysis_results/example_2/chatgpt_style/slop_lists/slop_list.json
2025-07-22 02:32:06,367 - INFO - slop_lists - Saved standard word slop list (2 words).
2025-07-22 02:32:06,367 - INFO - sl

Multi-model analysis complete!
Models analyzed: ['chatgpt_style', 'academic_style', 'creative_writing']
Total sentences: 12
Results saved to: ./analysis_results/example_2

=== Analysis Complete ===
Check the output directories for detailed results:
  - analysis_summary.json: Human-readable summary
  - slop_lists/: Generated slop word lists
  - phylogeny/: Phylogenetic tree visualizations (if generated)
  - analysis/: Detailed analysis metrics


[nltk_data] Downloading package cmudict to /root/nltk_data...


[nltk_data]   Unzipping corpora/cmudict.zip.


True