In [2]:
from collections import defaultdict

# Mapper Function
def mapper(chunk):
    """
    Map step: emit one (word, 1) pair for every token in a chunk.

    Tokens are separated on whitespace only and are lowercased so that
    counting is case-insensitive. Punctuation attached to a token is kept
    as part of it (for example "framework." stays "framework.").

    Args:
        chunk (str): A chunk of text.
    Returns:
        list of tuples: (lowercased_token, 1) pairs in order of appearance;
        an empty list when the chunk contains no tokens.
    """
    emitted = []
    for token in chunk.split():
        # Every occurrence contributes a count of 1; summing happens later.
        emitted.append((token.lower(), 1))
    return emitted


In [4]:
# Reducer Function
def reducer(pairs):
    """
    Reduce step: add up the counts belonging to each word.

    Args:
        pairs (list of tuples): (word, count) pairs; a word may appear
            more than once.
    Returns:
        dict: Plain dictionary mapping each word to the sum of its counts,
        keyed in the order each word was first seen.
    """
    totals = {}
    for key, value in pairs:
        # Unseen words start from 0, so the first pair just stores its count.
        totals[key] = totals.get(key, 0) + value
    return totals

In [5]:
# MapReduce Simulation Function
def mapreduce_simulation(text):
    """
    Run a single-process imitation of a MapReduce word count.

    The text is split on newlines, each line is passed to ``mapper``, the
    emitted pairs are grouped by word, each group is summed, and the
    resulting (word, total) pairs are handed to ``reducer``.

    Args:
        text (str): Input text data.
    Returns:
        dict: Word count results, as returned by ``reducer``.
    """
    # Each line stands in for the input split that one node would receive.
    splits = text.split('\n')

    # Map phase: collect every pair emitted by every split, in order.
    mapped = [pair for split in splits for pair in mapper(split)]

    # Shuffle phase (simulated): gather all counts emitted for the same word.
    counts_by_word = {}
    for key, value in mapped:
        counts_by_word.setdefault(key, []).append(value)

    # Collapse each group to one (word, total) pair, so the reducer below
    # receives exactly one pair per distinct word.
    per_word_totals = []
    for key, values in counts_by_word.items():
        per_word_totals.append((key, sum(values)))

    # Reduce phase.
    return reducer(per_word_totals)

In [6]:
# Main Function
if __name__ == "__main__":
    # Demo driver: run the word count on a small three-line sample and print
    # one "word: count" line per distinct token.

    # Sample input. The second and third lines of the literal carry four
    # leading spaces; mapper() tokenizes with str.split(), which discards
    # leading whitespace, so they do not affect the counts.
    input_data = """Hadoop is an open source framework.
    Hadoop supports MapReduce programming.
    Python is also used for MapReduce simulation."""

    # Run the full map -> shuffle -> reduce pipeline on the sample.
    word_counts = mapreduce_simulation(input_data)

    # Print the results in the dictionary's iteration order. Tokens keep any
    # attached punctuation (e.g. "framework."), because only whitespace is
    # used to split words.
    print("Word Counts:")
    for word, count in word_counts.items():
        print(f"{word}: {count}")


Word Counts:
hadoop: 2
is: 2
an: 1
open: 1
source: 1
framework.: 1
supports: 1
mapreduce: 2
programming.: 1
python: 1
also: 1
used: 1
for: 1
simulation.: 1
