In [None]:
import random
from collections import defaultdict

# Simulating a large dataset.
# The sentence keeps its trailing space while it is repeated so that adjacent
# copies stay separated ("... cost big ..."); stripping the literal *before*
# repeating it would fuse them into the bogus token "costbig". Only the final
# trailing space is stripped.
large_dataset = ("big data optimization is crucial big data minimizes cost " * 1000).strip()

# Mapper function
def mapper(input_chunk):
    """Emit one ``(word, 1)`` pair per whitespace-separated token.

    Args:
        input_chunk: Text to tokenize with ``str.split()``.

    Returns:
        A list of ``(lower-cased token, 1)`` tuples in input order.
    """
    pairs = []
    for token in input_chunk.split():
        pairs.append((token.lower(), 1))
    return pairs

# Combiner function
def combiner(mapped_data):
    """Sum the values of pairs that share a key (local, per-chunk aggregation).

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs.

    Returns:
        A list of ``(key, total)`` tuples, ordered by each key's first
        appearance in ``mapped_data``.
    """
    totals = {}
    for word, count in mapped_data:
        # Missing keys start from 0, mirroring an int-defaulting accumulator.
        totals[word] = totals.get(word, 0) + count
    return [(word, total) for word, total in totals.items()]

# Partition function
def partition_data(mapped_data, num_partitions):
    """Distribute ``(key, value)`` pairs across ``num_partitions`` buckets.

    Every pair with the same key lands in the same bucket, and the
    key-to-bucket assignment is identical across runs and processes.

    Args:
        mapped_data: Iterable of ``(key, value)`` pairs.
        num_partitions: Number of buckets to create; must be at least 1.

    Returns:
        A list of ``num_partitions`` lists of ``(key, value)`` tuples. Pairs
        keep their relative input order inside each bucket.

    Raises:
        ValueError: If ``num_partitions`` is smaller than 1.
    """
    import zlib  # local import keeps the file's import header untouched

    if num_partitions < 1:
        raise ValueError("num_partitions must be a positive integer")

    partitions = [[] for _ in range(num_partitions)]
    for key, value in mapped_data:
        # The built-in hash() is salted per interpreter process for str keys,
        # so it would route the same word to different partitions on every
        # run. CRC32 of the key's text is stable everywhere.
        stable_hash = zlib.crc32(str(key).encode("utf-8"))
        partitions[stable_hash % num_partitions].append((key, value))
    return partitions

# Reducer function
def reducer(partition):
    """Aggregate one partition by summing the values of each key.

    Args:
        partition: Iterable of ``(key, value)`` pairs.

    Returns:
        A ``defaultdict(int)`` mapping each key to the sum of its values.
    """
    totals = defaultdict(int)
    for pair in partition:
        word, count = pair
        totals[word] += count
    return totals

# Simulating optimization with tuned parameters
chunk_size = 512  # maximum number of characters per chunk
num_partitions = 4


def _chunk_on_word_boundaries(text, max_chars):
    """Yield chunks of whole words from ``text``, each at most ``max_chars`` long.

    Slicing the text at fixed character offsets cuts words in half at chunk
    edges and produces bogus tokens, so chunks are assembled word by word
    instead. A single word longer than ``max_chars`` is emitted as its own
    chunk rather than being split.
    """
    current_words = []
    current_length = 0
    for word in text.split():
        # One extra character for the joining space once the chunk has content.
        separator = 1 if current_words else 0
        if current_words and current_length + separator + len(word) > max_chars:
            yield " ".join(current_words)
            current_words, current_length, separator = [], 0, 0
        current_words.append(word)
        current_length += separator + len(word)
    if current_words:
        yield " ".join(current_words)


# Step 1: Divide dataset into chunks (never splitting a word across chunks)
chunks = list(_chunk_on_word_boundaries(large_dataset, chunk_size))

# Step 2: Map every chunk, then combine locally before collecting the pairs
mapped_data = []
for chunk in chunks:
    # Local combining collapses duplicate keys within the chunk so fewer
    # pairs are handed to the partitioning step.
    mapped_data += combiner(mapper(chunk))

# Step 3: Route the combined pairs to their partitions
partitions = partition_data(mapped_data, num_partitions)

# Step 4: Reduce each partition and merge the per-partition totals
final_result = defaultdict(int)
for bucket in partitions:
    for word, total in reducer(bucket).items():
        final_result[word] += total

# Display optimized results: highest counts first (ties keep insertion order)
sorted_results = list(final_result.items())
sorted_results.sort(key=lambda pair: pair[1], reverse=True)

print("Top 10 Words by Frequency:")
for word, count in sorted_results[:10]:
    print(f"{word}: {count}")

Top 10 Words by Frequency:
data: 1985
big: 1016
is: 1000
crucial: 1000
minimizes: 985
optimization: 984
costbig: 984
optimiz: 16
ation: 16
cost: 16
