In [1]:
import random
import string
import hashlib
from collections import Counter


In [2]:
def generate_exact_dup_dataset():
    """Build a shuffled practice dataset containing exact duplicate texts.

    Five base sentences are each emitted three times (15 records, each with
    its own id and a randomly chosen source), followed by five one-off
    "generated" records. Ids are assigned in creation order as ``doc_0`` ...
    ``doc_19`` before the final shuffle.

    Returns:
        list[dict]: 20 records with keys ``'id'``, ``'text'`` and ``'source'``.
    """
    base_texts = [
        "The quick brown fox jumps over the lazy dog",
        "Python is a great programming language",
        "Machine learning is transforming industries",
        "Data science requires statistics knowledge",
        "Natural language processing is fascinating"
    ]

    records = []

    # Three full passes over the base sentences: same text, different ids.
    for sentence in base_texts * 3:
        record = {
            'id': f'doc_{len(records)}',
            'text': sentence,
            'source': random.choice(['web', 'book', 'paper'])
        }
        records.append(record)

    # Five texts that occur exactly once.
    first_unique_index = len(records)
    for i in range(5):
        record = {
            'id': f'doc_{first_unique_index + i}',
            'text': f"Unique text number {i} with random content",
            'source': 'generated'
        }
        records.append(record)

    # Shuffle so duplicates are not adjacent and ids are out of order.
    random.shuffle(records)
    return records

def generate_near_dup_dataset():
    """Build a small fixed dataset of near-duplicates of one base sentence.

    Record 0 is the base sentence; the remaining records are variants of it.
    The similarity figures in the comments are the rough labels attached to
    each variant, not measured values.

    Returns:
        list[dict]: records with keys ``"id"`` (0-7, in list order) and ``"text"``.
    """
    base = "The quick brown fox jumps over the lazy dog"

    variants = [
        base,
        "The quick brown fox jumps over a lazy dog",     # ~90% similar
        "A quick brown fox jumped over the lazy dog",    # ~80% similar
        "The fast brown fox jumps over the lazy cat",    # ~70% similar
        "The quick red fox runs over the sleepy dog",    # ~50% similar
        "Completely different text about programming",   # 0% similar
        "The quick brown fox jumps over the lazy dog!",  # differs only by punctuation
        "THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG",   # differs only by case
    ]
    return [{"id": position, "text": sentence}
            for position, sentence in enumerate(variants)]

def generate_spam_dataset():
    """Generate a shuffled, labelled spam/ham dataset for repetition analysis.

    The dataset holds 12 records shaped ``{"id": int, "text": str, "label": str}``:

    * 3 "word repetition" spam texts - 50 words drawn at random from a
      five-word vocabulary;
    * 1 "phrase repetition" spam text - one phrase repeated 10 times;
    * 3 "mixed repetition" spam texts - a phone-number template repeated
      5 times;
    * 5 legitimate ("ham") sentences.

    Ids 0-11 are assigned in the creation order above, then the list is
    shuffled. Randomness comes from the module-level ``random`` generator, so
    call ``random.seed(...)`` beforehand for a reproducible dataset.

    Returns:
        list[dict]: the 12 shuffled records (7 labelled "spam", 5 "ham").
    """
    dataset = []

    # Type 1: Word repetition spam
    spam_words = ['buy', 'now', 'click', 'free', 'offer']
    for _ in range(3):
        text = ' '.join(random.choice(spam_words) for _ in range(50))
        dataset.append({"id": len(dataset), "text": text, "label": "spam"})

    # Type 2: Phrase repetition spam
    phrase = "amazing deal click here"
    dataset.append({"id": len(dataset), "text": ' '.join([phrase] * 10), "label": "spam"})

    # Type 3: Mixed repetition
    template = "Call now {} best price {} limited time {}"
    nums = ['555-1234', '555-5678', '555-9999']
    for num in nums:
        # Join the copies with a space (as the phrase spam does). Plain string
        # multiplication glued the last token of one copy to the first token
        # of the next ("555-1234Call"), which skews word and bigram counts.
        text = ' '.join([template.format(num, num, num)] * 5)
        dataset.append({"id": len(dataset), "text": text, "label": "spam"})

    # Add legitimate texts
    legit_texts = [
        "Machine learning models require careful validation to avoid overfitting on training data",
        "The conference will feature speakers from industry and academia discussing AI ethics",
        "Python's simplicity makes it an excellent choice for data science applications",
        "Understanding statistics is fundamental to interpreting machine learning results correctly",
        "Cloud computing has revolutionized how we deploy and scale applications"
    ]

    for text in legit_texts:
        dataset.append({"id": len(dataset), "text": text, "label": "ham"})

    random.shuffle(dataset)
    return dataset

def generate_shingle_dataset():
    """Generate a small fixed set of news snippets for shingle/MinHash practice.

    The first three articles report the same Apple earnings story in
    different words; the last two are about Microsoft and AWS.

    Each article is written as an indented triple-quoted string, so every run
    of whitespace (newline plus source indentation) is collapsed to a single
    space. Replacing only the newline would leave the indentation inside the
    text and inject long space runs into character-level shingles.

    Returns:
        list[dict]: records with keys ``"id"`` (0-4, in list order) and
        ``"text"`` (single-line, single-spaced).
    """
    # Similar news articles with overlapping content
    articles = [
        """Apple announced record quarterly earnings driven by strong iPhone sales.
        The company reported revenue of $90 billion, exceeding analyst expectations.
        CEO Tim Cook attributed success to innovation and customer loyalty.""",

        """Apple reported record earnings this quarter with strong iPhone performance.
        Revenue reached $90 billion, surpassing Wall Street expectations significantly.
        Tim Cook cited innovation as key driver of the company's success.""",

        """Tech giant Apple posted impressive quarterly results led by iPhone sales.
        The firm announced $90 billion in revenue, beating market forecasts.
        Chief executive Cook highlighted product innovation and loyal customers.""",

        """Microsoft announced cloud growth driving quarterly earnings higher.
        Azure revenue increased 40% as enterprises accelerate digital transformation.
        CEO Satya Nadella emphasized AI integration across product lines.""",

        """Amazon Web Services continues dominating cloud infrastructure market.
        AWS revenue grew 35% year-over-year reaching new milestone.
        The company plans significant AI and machine learning investments."""
    ]

    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so re-joining yields single-spaced text.
    dataset = [{"id": i, "text": ' '.join(article.split())}
               for i, article in enumerate(articles)]
    return dataset


In [3]:
# Problem 1 header and task statement, emitted in a single call
# (one argument per output line).
print(
    "=" * 60,
    "PROBLEM 1: EXACT DEDUPLICATION WITH MD5",
    "=" * 60,
    "\nDataset has 20 documents, some are exact duplicates.",
    "Task: Find and remove exact duplicates using MD5 hashing",
    "Expected: 5 unique documents + 5 generated = 10 unique total\n",
    sep="\n",
)


PROBLEM 1: EXACT DEDUPLICATION WITH MD5

Dataset has 20 documents, some are exact duplicates.
Task: Find and remove exact duplicates using MD5 hashing
Expected: 5 unique documents + 5 generated = 10 unique total



In [4]:
# Build the Problem 1 dataset and preview its first three records
# (text truncated to 50 characters).
exact_data = generate_exact_dup_dataset()
print(f"Dataset size: {len(exact_data)} documents", "First 3 documents:", sep="\n")
for doc in exact_data[:3]:
    doc_id, snippet = doc['id'], doc['text'][:50]
    print(f"  ID: {doc_id}, Text: {snippet}...")


Dataset size: 20 documents
First 3 documents:
  ID: doc_12, Text: Machine learning is transforming industries...
  ID: doc_1, Text: Python is a great programming language...
  ID: doc_14, Text: Natural language processing is fascinating...


In [43]:

print("\nYOUR TASK: Write a function to find exact duplicates using MD5")
print("Return: dict with keys: 'unique_count', 'duplicate_ids', 'hash_collisions'")

from hashlib import md5


def find_exact_duplicates(docs):
    """Find exact-duplicate documents by hashing their text with MD5.

    Documents are scanned in order. The first document carrying a given text
    counts as unique; every later document with the same text is reported as
    a duplicate.

    A *hash collision* is two different texts producing the same digest.
    Equal digests therefore do not strictly prove equal texts, so on a digest
    hit the texts themselves are compared: a match is a duplicate, a mismatch
    is counted as a collision and the document is still treated as unique.

    Args:
        docs: iterable of dicts with at least the keys ``'id'`` and ``'text'``.

    Returns:
        dict with keys:
            ``'unique_count'``   - number of distinct texts seen,
            ``'duplicate_ids'``  - ids of the repeated documents, in scan order,
            ``'hash_collisions'`` - number of documents whose digest matched an
                                   earlier document with a different text.
    """
    # digest -> distinct texts seen with that digest (normally exactly one).
    texts_by_digest = {}
    duplicate_ids = []
    unique_count = 0
    hash_collisions = 0

    for doc in docs:
        text = doc['text']
        digest = md5(text.encode()).digest()
        seen_texts = texts_by_digest.setdefault(digest, [])

        if text in seen_texts:
            duplicate_ids.append(doc['id'])
            continue

        if seen_texts:
            # Same digest, different text: a genuine MD5 collision.
            hash_collisions += 1
        seen_texts.append(text)
        unique_count += 1

    return {
        "unique_count": unique_count,
        "duplicate_ids": duplicate_ids,
        "hash_collisions": hash_collisions,
    }


res = find_exact_duplicates(exact_data)
res


YOUR TASK: Write a function to find exact duplicates using MD5
Return: dict with keys: 'unique_count', 'duplicate_ids', 'hash_collisions'


{'unique_count': 10,
 'duplicate_ids': ['doc_11',
  'doc_7',
  'doc_6',
  'doc_4',
  'doc_0',
  'doc_5',
  'doc_2',
  'doc_9',
  'doc_13',
  'doc_8'],
 'hash_collisions': 0}

In [58]:
import re
# Problem 2 header and task statement.
# NOTE(review): with the word-set Jaccard implemented later in this notebook,
# doc 3 scores 0.6 against doc 0 (see the recorded spot-check output) and
# doc 2 appears to score exactly 0.7, so neither passes a strict "> 0.7" cut.
# The "Expected" line may assume a different similarity measure - confirm.
print(
    "\n" + "=" * 60,
    "PROBLEM 2: NEAR-DUPLICATE DETECTION WITH JACCARD",
    "=" * 60,
    "\nDataset has texts with varying similarity to the first document",
    "Task: Find all documents with Jaccard similarity > 0.7 to doc 0",
    "Expected: IDs 0, 1, 2, 3, 6, 7 (after normalization)\n",
    sep="\n",
)

# Build the near-duplicate dataset and list every record.
near_data = generate_near_dup_dataset()
for doc in near_data:
    doc_id, doc_text = doc['id'], doc['text']
    print(f"  ID: {doc_id}, Text: {doc_text}")

print(
    "\nYOUR TASK: Write jaccard_similarity(text1, text2) function",
    "Normalize: lowercase, remove punctuation",
    "Return: list of IDs with similarity > 0.7 to document 0",
    sep="\n",
)


def normalize(text):
    """Turn `text` into a list of lowercase word tokens.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is replaced by a space, so punctuation
    inside a word splits it (``"don't"`` -> ``['don', 't']``). The result is
    lowercased and split on whitespace runs, which also discards empty tokens.
    """
    punctuation_blanked = re.sub(r'[^\w\s]', ' ', text)
    lowered = punctuation_blanked.lower()
    return lowered.split()


def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity of the normalized word sets of two texts.

    Jaccard similarity is ``|A & B| / |A | B|`` over the sets of tokens
    produced by ``normalize``. Two texts with no tokens at all are treated as
    identical (returns 1); if exactly one has no tokens the result is 0.
    """
    words_a, words_b = (set(normalize(t)) for t in (text1, text2))

    # Both empty: define the similarity as 1 rather than dividing by zero.
    if not (words_a or words_b):
        return 1
    # Exactly one empty: nothing can be shared.
    if not (words_a and words_b):
        return 0

    shared = words_a.intersection(words_b)
    combined = words_a.union(words_b)
    return len(shared) / len(combined)
# Spot check: similarity between doc 0 and doc 3, which differ in two words.
jaccard_similarity(near_data[0]['text'], near_data[3]['text'])


PROBLEM 2: NEAR-DUPLICATE DETECTION WITH JACCARD

Dataset has texts with varying similarity to the first document
Task: Find all documents with Jaccard similarity > 0.7 to doc 0
Expected: IDs 0, 1, 2, 3, 6, 7 (after normalization)

  ID: 0, Text: The quick brown fox jumps over the lazy dog
  ID: 1, Text: The quick brown fox jumps over a lazy dog
  ID: 2, Text: A quick brown fox jumped over the lazy dog
  ID: 3, Text: The fast brown fox jumps over the lazy cat
  ID: 4, Text: The quick red fox runs over the sleepy dog
  ID: 5, Text: Completely different text about programming
  ID: 6, Text: The quick brown fox jumps over the lazy dog!
  ID: 7, Text: THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG

YOUR TASK: Write jaccard_similarity(text1, text2) function
Normalize: lowercase, remove punctuation
Return: list of IDs with similarity > 0.7 to document 0


0.6

In [59]:

# Problem 3 header and task statement, emitted in a single call
# (one argument per output line).
print(
    "\n" + "=" * 60,
    "PROBLEM 3: SPAM DETECTION WITH REPETITION ANALYSIS",
    "=" * 60,
    "\nDataset has spam and legitimate messages",
    "Task: Detect spam using word/phrase repetition patterns",
    "Expected: 7 spam messages\n",
    sep="\n",
)



PROBLEM 3: SPAM DETECTION WITH REPETITION ANALYSIS

Dataset has spam and legitimate messages
Task: Detect spam using word/phrase repetition patterns
Expected: 7 spam messages



In [69]:
# Scratch check: the `&` operator and set.intersection() give the same result.
a = {'a', 'b', 'c'}
b = {'b', 'a', 'c', 'd'}
# The cell's value is a tuple holding both intersections, shown side by side.
a & b, a.intersection(b)

({'a', 'b', 'c'}, {'a', 'b', 'c'})

In [60]:

# Build the Problem 3 dataset and show one example of each label.
spam_data = generate_spam_dataset()
print(f"Dataset size: {len(spam_data)} documents")

# The dataset is shuffled, so the record at index 0 may be ham. Select both
# samples by label so the "Sample spam" line always shows an actual spam text.
sample_spam = next(d for d in spam_data if d['label'] == 'spam')
sample_ham = next(d for d in spam_data if d['label'] == 'ham')
print("Sample spam:", sample_spam['text'][:100] + "...")
print("Sample ham:", sample_ham['text'][:100])


Dataset size: 12 documents
Sample spam: amazing deal click here amazing deal click here amazing deal click here amazing deal click here amaz...
Sample ham: The conference will feature speakers from industry and academia discussing AI ethics


In [121]:
from collections import Counter

# Task statement for the Problem 3 detector.
print(
    "\nYOUR TASK: Write is_spam(text) using repetition metrics",
    "Consider: unique word ratio, bigram repetition, phrase patterns",
    "Return: accuracy on the labeled dataset",
    sep="\n",
)


# phrase patterns

# Small sample sentence (with one repeated bigram) for trying out the metrics.
text = "the quick brown fox jumped over quick brown"
def is_spam(text):
    """Compute two repetition metrics for `text`.

    Despite the name, this does not return a boolean verdict; it returns the
    raw metrics a spam decision can be based on.

    Args:
        text: the message to analyse; it is split on whitespace into tokens.

    Returns:
        tuple[float, float]: ``(most_common, repeat)`` where

        * ``most_common`` is the share of tokens taken by the single most
          frequent token (0.0 for text with no tokens);
        * ``repeat`` is the fraction of adjacent-token bigrams that repeat an
          earlier bigram, i.e. ``(total - distinct) / total`` (0.0 when the
          text has fewer than two tokens and therefore no bigrams).
    """
    tokens = text.split()
    if not tokens:
        # Empty or whitespace-only text: nothing to measure.
        return 0.0, 0.0

    # Share of the text occupied by its most frequent token.
    top_count = Counter(tokens).most_common(1)[0][1]
    most_common = top_count / len(tokens)

    # Bigram repetition; a single token yields no bigrams, so guard the division.
    bigrams = list(zip(tokens, tokens[1:]))
    if bigrams:
        repeat = (len(bigrams) - len(set(bigrams))) / len(bigrams)
    else:
        repeat = 0.0

    return most_common, repeat

# Try the metrics on the sample sentence; the cell displays the returned tuple.
is_spam(text)



YOUR TASK: Write is_spam(text) using repetition metrics
Consider: unique word ratio, bigram repetition, phrase patterns
Return: accuracy on the labeled dataset


(0.25, 0.14285714285714285)