# Wordcount

## Complete version

In [None]:
import os

In [None]:
import pycompss.interactive as ipycompss

In [None]:
from pycompss.api.task import task

In [None]:
from pycompss.api.parameter import *

In [None]:
# Launch the COMPSs runtime for this notebook session; one option per line
# so each setting is easy to toggle.
ipycompss.start(
    debug=False,
    monitor=1000,
    graph=True,
    trace=True,
)

In [None]:
@task(returns=list)
def read_file(file_path):
    """Load the whitespace-separated words of a text file.

    :param file_path: path of the file to read
    :return: list with every word of the file, in reading order
    """
    with open(file_path, 'r') as handle:
        # Flatten the per-line word lists into a single list.
        return [word for line in handle for word in line.split()]

In [None]:
@task(returns=dict)
def wordCount(data):
    """Build a word-frequency dictionary from a list of words.

    :param data: a list of words
    :return: a dictionary where key=word and value=#appearances
    """
    # Imported locally so the function carries its own dependency.
    from collections import Counter

    # Counter tallies the words in one pass; converting back to a plain dict
    # keeps the returned type exactly what callers received before.
    return dict(Counter(data))

In [None]:
@task(returns=dict, priority=True)
def merge_two_dicts(dic1, dic2):
    """Fold the entries of one dictionary into another.

    dic1 is modified in place and is also the object returned.

    :param dic1: dictionary that receives the accumulated values
    :param dic2: dictionary whose values are added to dic1
    :return: dic1 after adding every entry of dic2
    """
    for key, value in dic2.items():
        if key not in dic1:
            # Key not present yet: take the value as is.
            dic1[key] = value
            continue
        dic1[key] += value
    return dic1

In [None]:
from pycompss.api.api import compss_wait_on

# Directory holding the input text files
pathDataset = '/home/compss/tutorial_apps/python/wordcount/data'

# For every entry in the dataset directory, read its content and run a
# wordcount on it
partialResult = [
    wordCount(read_file(os.path.join(pathDataset, fileName)))
    for fileName in os.listdir(pathDataset)
]

# Fold the partial dictionaries, one after another, into a single result
result = {}
for partial in partialResult:
    result = merge_two_dicts(result, partial)

# Wait for the final result
result = compss_wait_on(result)

In [None]:
from pprint import pprint

# Show the full word -> count dictionary, then the total number of words.
print("Result:")
pprint(result)
total_words = sum(result.values())
print("Words: {}".format(total_words))

In [None]:
# Shut down the runtime that was started earlier in this notebook.
ipycompss.stop()