# Week 3 Exercises: MapReduce


## Exercise 1: map and reduce functions


In [3]:
from functools import reduce

# from collections import defaultdict

numbers = [1, 2, 3, 4, 5]
numbers_2 = [0.99, 1.99, 2.99, 3.99, 4.99]

double_number = lambda x: x * 2

print("Doubling function:", list(map(double_number, numbers)))
print("Rounding function", list(map(round, numbers)))
print("Sum using reduce:", reduce(lambda x, y: x + y, numbers))

Doubling function: [2, 4, 6, 8, 10]
Rounding function [1, 2, 3, 4, 5]
Sum using reduce: 15


## Exercise 2: Word Frequency


In [7]:
from functools import reduce
from collections import defaultdict

words = []


def word_counter(filename):
    file = open(filename)
    for line in file:
        if line.strip() != " ":
            words.extend(line.strip().split())
    return words


word_counter("word_frequency_test.txt")

# Alternatively for a string (not text from a file)
"""
# String
# text = "the quick brown fox jumps over the lazy dog the fox was quick"

# Splitting
# words = text.split()
"""

# Mapping step
mapped_words = list(map(lambda word: (word, 1), words))


# Reducing step
def reducer(acc, pair):
    word, count = pair
    acc[word] += count
    return acc


word_freq = reduce(reducer, mapped_words, defaultdict(int))

# Converting to normal dictionary
word_freq = dict(sorted(word_freq.items(), key=lambda item: item[1], reverse=True))
word_freq

{'apple': 6,
 'banana': 4,
 'repeat': 4,
 'cat': 3,
 'python': 3,
 'code': 3,
 'test': 3,
 'the': 3,
 'orange': 2,
 'hello': 2,
 'word': 2,
 'frequency': 2,
 'openai': 2,
 'gpt': 2,
 'and': 2,
 'once': 2,
 'dog': 1,
 'bird': 1,
 'world': 1,
 'this': 1,
 'is': 1,
 'a': 1,
 'simple': 1,
 'text': 1,
 'file': 1,
 'for': 1,
 'testing': 1,
 'counting': 1,
 'but': 1,
 'or': 1,
 'not': 1,
 'yes': 1,
 'no': 1,
 'maybe': 1,
 'twice': 1}

## Exercise 3: Inverted Index

In [14]:
from functools import reduce
from collections import defaultdict
import os

# filenames = ["inverted_index_test_1.txt", "inverted_index_test_2.txt"]

documents = {"doc1": "dog cat", "doc2": "dog fox", "doc3": "cat mouse"}


"""
def read_file_content(docs):
    words = {}
    for doc in docs:
        with open(doc) as d:
            for line in d:
                (key, val) = line.split()
                words[int(key)] = val
    for i in files:
        file = open(i)
        words.append(file.read().split())
    return words
"""


# Mapping
"""
is_in = lambda doc_id_content: [
    (word, doc_id_content[0]) for word in doc_id_content[1]
].split()
mapping = list(map(is_in, file_contents))
print(mapping)
"""


mapped_words = list(
    map(
        lambda doc_id_content: [
            (word, doc_id_content[0]) for word in doc_id_content[1].split()
        ],
        documents.items(),
    )
)
mapped_words = [
    item for sublist in mapped_words for item in sublist
]  # Flatten the list of lists


# Step 2: Reduce phase - Use reduce to group document_ids for each word
def reducer(acc, pair):
    word, doc_id = pair
    acc[word].add(doc_id)
    return acc


# Reduce to aggregate the document lists for each word
inverted_index = reduce(reducer, mapped_words, defaultdict(set))

# Convert sets to lists for better readability
inverted_index = {word: list(docs) for word, docs in inverted_index.items()}

# Display the result
inverted_index

{'dog': ['doc2', 'doc1'],
 'cat': ['doc3', 'doc1'],
 'fox': ['doc2'],
 'mouse': ['doc3']}