In [1]:
import os, sys
from urllib.parse import urlparse
import requests
from random import randint


In [2]:
# Settings
# Directory that holds the downloaded book files.
tmp_dir = 'tmp'
# Books to process. These point at a course server; presumably they mirror
# the Project Gutenberg originals (verify before relying on it):
#   https://www.gutenberg.org/files/1661/1661-0.txt
#   https://www.gutenberg.org/files/174/174-0.txt
#   https://www.gutenberg.org/files/4300/4300-0.txt
urls = [
    'https://tschiatschek.net/courses/MMD/SS2021/PP1/1661-0.txt',
    'https://tschiatschek.net/courses/MMD/SS2021/PP1/174-0.txt',
    'https://tschiatschek.net/courses/MMD/SS2021/PP1/4300-0.txt',
]


In [3]:
def mapper(key, value, n=5):
    """
    Count the word n-grams that occur in a single document.

    key ... url/filename (not used for counting)
    value ... contents of the file
    n ... length of sequences of consecutive words (must be >= 1)

    yields tuples (n_gram, count), where n_gram is n consecutive words joined
    by single spaces and count is the number of times it occurs in this
    document; n-grams are yielded in order of first appearance.
    Nothing is yielded if the document has fewer than n words.

    raises ValueError (when the generator is first advanced) if n < 1
    """
    if n < 1:
        # n == 0 would count empty strings and n < 0 would slice nonsense.
        raise ValueError("n must be a positive integer, got %r" % (n,))

    # Remove all punctuation characters in a single pass over the text.
    text = value.translate(str.maketrans('', '', ',;:.*?![]'))
    tokens = text.split()

    counts = {}
    # range() is empty when there are fewer than n tokens.
    for start in range(len(tokens) - n + 1):
        n_gram = ' '.join(tokens[start:start + n])
        counts[n_gram] = counts.get(n_gram, 0) + 1

    yield from counts.items()

def reducer(key, values):
    """
    Add up all partial counts collected for one n-gram.

    key ... n-gram
    values ... iterable of counts for the n-gram

    yields a single tuple (key, total of all counts)
    """
    total = 0
    for count in values:
        total += count
    yield key, total


In [4]:
if __name__ == "__main__":
    print("Running MapReduce for creating a language model...")

    # (1) Download the books (unless already cached in tmp_dir) and read their content
    files = {}
    try:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(tmp_dir, exist_ok=True)
        for url in urls:
            filename = os.path.basename(urlparse(url).path)
            target_filename = os.path.join(tmp_dir, filename)
            print("- Downloading '%s' to '%s'." % (url, target_filename))
            if os.path.exists(target_filename):
                print("  File already exists, not downloading.")
            else:
                # Without a timeout a stalled server would hang the notebook.
                r = requests.get(url, allow_redirects=True, timeout=30)
                # Fail before writing so an HTTP error page is never cached
                # on disk and silently reused as a "book" on later runs.
                r.raise_for_status()
                with open(target_filename, 'wb') as book_file:
                    book_file.write(r.content)

            with open(target_filename, 'rt', encoding='utf-8') as book_file:
                files[url] = book_file.read()
    except Exception as exc:
        # Still a RuntimeError, but chained so the real cause and traceback
        # survive; KeyboardInterrupt/SystemExit are no longer swallowed.
        raise RuntimeError(f"Failed to download books: {exc!r}") from exc

    # From here on `files` is a list of (url, content) pairs.
    files = list(files.items())

    for n in [1, 2, 3]:
        print(f'\nRunning the functions for n={n}:')
        # (2) Run mappers: one generator of (n-gram, count) pairs per document
        mapper_results = [mapper(url, content, n) for url, content in files]

        # (3) Gather results from mappers, group the counts by n-gram and run reducers
        mapper_results_dict = {}
        for mapper_result in mapper_results:
            for n_gram, count in mapper_result:
                mapper_results_dict.setdefault(n_gram, []).append(count)
        mapper_results_dict = mapper_results_dict.items()

        # (4) Gather results from reducers and output them
        # Each reducer yields exactly one (n-gram, total) pair, so every entry
        # of reducer_results is a one-element list.
        reducer_results = [list(reducer(n_gram, counts)) for n_gram, counts in mapper_results_dict]

        if not reducer_results:
            # randint(0, -1) would raise ValueError when there is nothing to sample.
            print(f"No {n}-grams found in the documents.")
            continue

        # Print a few randomly chosen n-grams (drawn with replacement).
        num_of_examples = 5
        for _ in range(num_of_examples):
            i = randint(0, len(reducer_results) - 1)
            n_gram, num_of_apps = reducer_results[i][0]
            print(f"Total number of appearances of the {n}-gram '{n_gram}' in the documents:")
            print(num_of_apps)


Running MapReduce for creating a language model...
- Downloading 'https://tschiatschek.net/courses/MMD/SS2021/PP1/1661-0.txt' to 'tmp\1661-0.txt'.
- Downloading 'https://tschiatschek.net/courses/MMD/SS2021/PP1/174-0.txt' to 'tmp\174-0.txt'.
- Downloading 'https://tschiatschek.net/courses/MMD/SS2021/PP1/4300-0.txt' to 'tmp\4300-0.txt'.

Running the functions for n=1:
Total number of appearances of the 1-gram 'Riario' in the documents:
1
Total number of appearances of the 1-gram 'clockwork' in the documents:
1
Total number of appearances of the 1-gram '“After' in the documents:
1
Total number of appearances of the 1-gram 'Rinbad' in the documents:
1
Total number of appearances of the 1-gram 'manslaughter' in the documents:
1

Running the functions for n=2:
Total number of appearances of the 2-gram 'with running' in the documents:
1
Total number of appearances of the 2-gram 'neglected and' in the documents:
1
Total number of appearances of the 2-gram 'widest array' in the documents:
3
Tot