In [5]:
import difflib
import random
import re

In [6]:
# Sample GSM8K questions for demonstration
SAMPLE_QUESTIONS = [
    {
        "question": "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?",
        "answer": "Natalia sold 48/2 = 24 clips in May. Natalia sold 48+24 = 72 clips altogether in April and May. #### 72"
    },
    {
        "question": "Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?",
        "answer": "Weng earns 12/60 = $0.2 per minute. Working 50 minutes, she earned 0.2 x 50 = $10. #### 10"
    },
    {
        "question": "Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?",
        "answer": "In the beginning, Betty has only 100 / 2 = $50. Betty's grandparents gave her 15 * 2 = $30. This means, Betty needs 100 - 50 - 30 - 15 = $5 more. #### 5"
    }
]

In [None]:
def test_partial_completion(question):

    words = question.split()
    # Take 60% of the question
    split_point = int(len(words) * 0.6)
    partial = " ".join(words[:split_point])
    expected = " ".join(words[split_point:])
    
    print(f"Partial Question: {partial}")
    print(f"Expected Completion: {expected}")
    
    # In real implementation, you'd query your model here
    # model_completion = query_model(f"Complete this question: {partial}")
    
    # For demo, simulate different contamination levels
    contamination_level = random.choice(['high', 'medium', 'low'])
    
    if contamination_level == 'high':
        # Model completes almost perfectly - suspicious!
        simulated_completion = expected
        similarity_score = 1.0
    elif contamination_level == 'medium':
        # Partial match
        simulated_completion = " ".join(expected.split()[:len(expected.split())//2])
        similarity_score = 0.5
    else:
        # Poor completion
        simulated_completion = "books are there in the library?"
        similarity_score = 0.1
    
    print(f"Model Completion: {simulated_completion}")
    print(f"Similarity Score: {similarity_score:.2f}")
    print(f"Contamination Level: {contamination_level.upper()}")
    
    return similarity_score

In [None]:
# Standalone integer or decimal tokens (e.g. "48", "0.2"). The word boundaries
# keep digits glued to letters (e.g. "2nd") from matching.
_NUMBER_PATTERN = re.compile(r'\b\d+(?:\.\d+)?\b')


def _perturb_number(num_str):
    """Return ``num_str`` scaled up by a random 20-50% as a string that differs from it."""
    factor = random.uniform(1.2, 1.5)
    _, _, fraction = num_str.partition(".")

    if fraction:
        # Keep the original number of decimal places so "0.2" is not
        # truncated to "0".
        places = len(fraction)
        value = float(num_str)
        if value == 0:
            return num_str  # scaling zero cannot change it
        scaled = round(value * factor, places)
        if scaled == value:
            # Rounding swallowed the change; move by one unit in the last place.
            scaled = round(value + 10 ** -places, places)
        return f"{scaled:.{places}f}"

    value = int(num_str)
    if value == 0:
        return num_str  # scaling zero cannot change it
    scaled = int(value * factor)
    if scaled == value:
        # Truncation swallowed the change for small integers (e.g. 1 or 2).
        scaled = value + 1
    return str(scaled)


def modify_numbers(question, answer):
    """Perturb the numbers in a question and simulate the resulting accuracy gap.

    Every standalone number in ``question`` is replaced by a value 20-50%
    larger. All replacements happen in one left-to-right pass over the
    original text, so a replacement value is never rewritten again and digits
    inside a longer number are never touched. Repeated occurrences of the same
    number receive the same replacement.

    Args:
        question: Problem text whose numbers are perturbed.
        answer: Reference solution text. It is not used by the simulated
            scoring below; a real evaluation would compare the model output
            with the value after the "####" marker.

    Returns:
        ``original_correct - modified_correct`` as an int in {-1, 0, 1};
        higher means accuracy dropped on the modified problem.
    """
    print(f"\nOriginal Question: {question}")

    # Original token -> replacement, filled in order of first appearance.
    number_changes = {}

    def _replacement(match):
        token = match.group(0)
        if token not in number_changes:
            number_changes[token] = _perturb_number(token)
        return number_changes[token]

    modified_question = _NUMBER_PATTERN.sub(_replacement, question)

    print(f"Modified Question: {modified_question}")
    print(f"Number Changes: {number_changes}")

    # Simulate model performance
    # Contaminated models often fail on modified versions
    original_correct = random.random() < 0.8  # 80% accuracy on original
    modified_correct = random.random() < 0.4  # 40% accuracy on modified (performance drop)

    print(f"Original Problem Correct: {original_correct}")
    print(f"Modified Problem Correct: {modified_correct}")

    # bool - bool yields an int, so the gap is -1, 0 or 1.
    performance_gap = original_correct - modified_correct
    print(f"Performance Gap: {performance_gap} (higher = more suspicious)")

    return performance_gap

In [None]:
def calculate_contamination_risk(completion_score, performance_gap):
    """Combine the two test signals into a human-readable risk label.

    Args:
        completion_score: Similarity score from the partial-completion test.
        performance_gap: Accuracy drop between original and modified problems.

    Returns:
        One of the "HIGH RISK", "MEDIUM RISK" or "LOW RISK" label strings.
    """
    # Contribution of the completion signal: near-verbatim completions weigh most.
    completion_weight = (
        0.5 if completion_score > 0.8
        else 0.2 if completion_score > 0.5
        else 0.0
    )

    # Contribution of the accuracy drop on the modified problem.
    gap_weight = (
        0.4 if performance_gap > 0.5
        else 0.2 if performance_gap > 0.2
        else 0.0
    )

    risk_score = completion_weight + gap_weight

    # Both cut-offs are strict: a total of exactly 0.4 or 0.7 stays in the lower band.
    if risk_score <= 0.4:
        return "LOW RISK - Appears clean"
    if risk_score <= 0.7:
        return "MEDIUM RISK - Possible contamination"
    return "HIGH RISK - Likely contaminated"

In [None]:
def run_simple_contamination_test():
    """Run both demo checks on every sample question and print a risk report.

    For each entry of ``SAMPLE_QUESTIONS`` the partial-completion test and the
    number-modification test are run and a per-question risk label is printed.
    The scores are then averaged, an overall label is derived from the
    averages, and a matching list of recommendations is printed.
    """
    print("=== SIMPLE GSM8K CONTAMINATION TEST ===\n")

    # One (completion_score, performance_gap) pair per question.
    results = []

    for number, item in enumerate(SAMPLE_QUESTIONS, start=1):
        print(f"--- Testing Question {number} ---")

        # Test 1: partial completion, then Test 2: number modification.
        score = test_partial_completion(item['question'])
        gap = modify_numbers(item['question'], item['answer'])
        results.append((score, gap))

        verdict = calculate_contamination_risk(score, gap)
        print(f"Question Risk Assessment: {verdict}\n")

    # Aggregate across all questions.
    question_count = len(results)
    avg_completion = sum(score for score, _ in results) / question_count
    avg_gap = sum(gap for _, gap in results) / question_count

    print("=== OVERALL ANALYSIS ===")
    print(f"Average Completion Score: {avg_completion:.2f}")
    print(f"Average Performance Gap: {avg_gap:.2f}")

    overall_risk = calculate_contamination_risk(avg_completion, avg_gap)
    print(f"Overall Contamination Risk: {overall_risk}")

    # Choose the advice that matches the overall label.
    if "HIGH" in overall_risk:
        advice = (
            "🚨 Model likely trained on contaminated data",
            "• Verify training data sources",
            "• Test on alternative clean benchmarks",
            "• Consider retraining with clean data",
        )
    elif "MEDIUM" in overall_risk:
        advice = (
            "⚠️ Possible contamination detected",
            "• Run more extensive testing",
            "• Check for indirect data leakage",
            "• Monitor performance on new problems",
        )
    else:
        advice = (
            "✅ Model appears to have good generalization",
            "• Low contamination risk",
            "• Continue monitoring with new benchmarks",
        )

    print("\n=== RECOMMENDATIONS ===")
    for line in advice:
        print(line)


In [None]:
def explain_contamination_detection():
    """Print a short primer on the contamination checks shown in this notebook.

    The text is organised as four numbered sections, each a heading followed
    by indented bullet lines, with a blank line between sections.
    """
    sections = (
        (
            "1. PARTIAL COMPLETION TEST:",
            (
                "   - Give model only part of a question",
                "   - If it completes perfectly, it might have memorized the dataset",
                "   - Example: 'Natalia sold clips to 48 friends...' → model completes exactly",
            ),
        ),
        (
            "2. NUMBER MODIFICATION TEST:",
            (
                "   - Change numbers in the original problem",
                "   - If performance drops significantly, model is overfitting to specific values",
                "   - Example: Change '48 friends' to '52 friends' and see if model fails",
            ),
        ),
        (
            "3. STATISTICAL ANALYSIS:",
            (
                "   - Look for patterns suggesting memorization vs. reasoning",
                "   - Compare performance on original vs. modified problems",
                "   - Measure how often model generates exact dataset text",
            ),
        ),
        (
            "4. CONTAMINATION INDICATORS:",
            (
                "   - High completion accuracy (>80%)",
                "   - Large performance gap on modified problems (>20%)",
                "   - Perfect recall of specific numbers or phrases",
                "   - Generating exact training examples",
            ),
        ),
    )

    print("\n=== HOW CONTAMINATION DETECTION WORKS ===\n")

    for position, (heading, bullets) in enumerate(sections):
        # Blank separator goes between sections only, never after the last one.
        if position:
            print()
        print(heading)
        for bullet in bullets:
            print(bullet)


In [13]:
if __name__ == "__main__":
    # The demo's "model results" are random draws. Seed the global RNG so the
    # sequence of draws is the same on every run; change the value to see
    # other simulated outcomes.
    DEMO_SEED = 42
    random.seed(DEMO_SEED)

    # Run the simple test
    run_simple_contamination_test()

    # Explain the concepts
    explain_contamination_detection()

    print("\n=== NEXT STEPS ===")
    print("1. Install dependencies: pip install datasets requests numpy")
    print("2. Use the full contamination detector tool")
    print("3. Test with your specific model endpoint")
    print("4. Analyze results and adjust training accordingly")

=== SIMPLE GSM8K CONTAMINATION TEST ===

--- Testing Question 1 ---
Partial Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips
Expected Completion: in May. How many clips did Natalia sell altogether in April and May?
Model Completion: in May. How many clips did Natalia sell altogether in April and May?
Similarity Score: 1.00
Contamination Level: HIGH

Original Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Modified Question: Natalia sold clips to 65 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Number Changes: {'48': '65'}
Original Problem Correct: True
Modified Problem Correct: True
Performance Gap: 0 (higher = more suspicious)
Question Risk Assessment: MEDIUM RISK - Possible contamination

--- Testing Question 2 ---
Partial Question: Weng