In [None]:
import os
import re
from datasets import Dataset
from dprune.scorers import PerplexityScorer
from dprune.pruners import TopKPruner

# For demonstration purposes, we'll create a sample dataset
# In practice, you would load your own dataset


In [None]:
# Toy corpus: fluent sentences interleaved with scrambled and nonsense variants.
# Keep the order stable -- each row's id is simply its position in this list.
sample_texts = [
    "The quick brown fox jumps over the lazy dog.",  # fluent
    "This is a well-written sentence with proper grammar.",  # fluent
    "Machine learning is transforming how we analyze data.",  # fluent
    "fox quick brown the jumps lazy over dog the.",  # word order scrambled
    "asdf qwerty keyboard random text here nonsense.",  # keyboard nonsense
    "The weather today is beautiful and sunny.",  # fluent
    "beautiful sunny weather today the is and.",  # word order scrambled
    "Python is a versatile programming language.",  # fluent
    "programming language versatile Python is a.",  # word order scrambled
    "Natural language processing enables computers to understand human language.",  # fluent
]

# Wrap the corpus in a Hugging Face dataset with a positional id per row
row_ids = [position for position, _ in enumerate(sample_texts)]
dataset = Dataset.from_dict({'text': sample_texts, 'id': row_ids})

print("Dataset size:", len(dataset))
print("\nSample texts:")
# Preview only the first five rows
for position, sentence in enumerate(dataset['text'][:5]):
    print(f"{position}: {sentence}")


In [None]:
# Point this at your own KenLM model file:
# MODEL_PATH = "/path/to/your/kenlm/model.bin"

# With a model available, uncomment to build the real scorer:
# scorer = PerplexityScorer(
#     model_path=MODEL_PATH,
#     text_column='text',
#     batch_size=50
# )

# This cell does not build a scorer; it only prints what the
# constructor call looks like, one output line per entry below.
init_example_lines = (
    "PerplexityScorer initialization example:",
    "scorer = PerplexityScorer(",
    "    model_path='/path/to/kenlm/model.bin',",
    "    text_column='text',",
    "    batch_size=50",
    ")",
)
print(*init_example_lines, sep="\n")


In [None]:
# With a real scorer, uncomment this instead of using the mock values below:
# scored_dataset = scorer.score(dataset)

# Hand-written stand-in perplexities, one per row of `dataset`, in row order.
# Convention used throughout: the lower the perplexity, the more fluent the text.
mock_scores = [
    15.2,   # row 0: fluent       ("The quick brown fox...")
    18.7,   # row 1: fluent       ("This is a well-written...")
    22.1,   # row 2: fluent       ("Machine learning is...")
    157.3,  # row 3: scrambled    ("fox quick brown the...")
    289.5,  # row 4: nonsense     ("asdf qwerty keyboard...")
    16.8,   # row 5: fluent       ("The weather today...")
    145.2,  # row 6: scrambled    ("beautiful sunny weather...")
    19.4,   # row 7: fluent       ("Python is a versatile...")
    132.7,  # row 8: scrambled    ("programming language versatile...")
    25.6,   # row 9: fluent       ("Natural language processing...")
]

# Attach the mock values as a new 'score' column; `dataset` itself is left untouched
scored_dataset = dataset.add_column('score', mock_scores)

print("Scored dataset size:", len(scored_dataset))
print("\nPerplexity scores (lower = more fluent):")
for row_number, row in enumerate(scored_dataset):
    sentence = row['text']
    # Show at most 50 characters, marking any cut with an ellipsis
    preview = sentence if len(sentence) <= 50 else sentence[:50] + '...'
    print(f"{row_number}: {row['score']:.1f} - {preview}")


In [None]:
# Keep half of the scored examples (k=0.5), aiming for the most fluent ones.
# ascending=True is meant to rank the lowest perplexities first --
# NOTE(review): confirm this flag's meaning against the TopKPruner API.
pruner = TopKPruner(k=0.5, ascending=True)

# Apply the pruner to the scored dataset
pruned_dataset = pruner.prune(scored_dataset)

size_before = len(scored_dataset)
size_after = len(pruned_dataset)
print(f"Original dataset size: {size_before}")
print(f"Pruned dataset size: {size_after}")
print(f"Reduction: {size_before - size_after} examples removed")

print("\nPruned dataset (most fluent examples):")
kept_texts = pruned_dataset['text']
kept_scores = pruned_dataset['score']
for rank, (sentence, perplexity) in enumerate(zip(kept_texts, kept_scores)):
    print(f"{rank}: {perplexity:.1f} - {sentence}")
