# Command line wordcount

## WordCount using a single thread  

Write a program called alice_words.py that creates a text file named __alice_words.txt__ containing an alphabetical tab separated listing of all the words, and the number of times each occurs, in the text version of Alice’s Adventures in Wonderland. (http://www.gutenberg.org/cache/epub/11/pg11.txt) 

In [5]:
# !curl 'http://www.gutenberg.org/cache/epub/11/pg11.txt' -o alicesTExtFilename.txt
# sometimes the above link produces junk characters. However, the direct link works as expected:
!curl 'http://www.gutenberg.org/files/11/11-0.txt' -o alicesTextFilename.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  169k  100  169k    0     0   133k      0  0:00:01  0:00:01 --:--:--  149k


In [6]:
#display the first few lines
!head alicesTextFilename.txt

﻿Project Gutenberg’s Alice’s Adventures in Wonderland, by Lewis Carroll

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org


Title: Alice’s Adventures in Wonderland



In [9]:
%%writefile alice_words.py
import re
import sys
from collections import defaultdict

pathToFile = sys.argv[1]
wordCounts = defaultdict(int)
wordList = []

def wordcount(pathToFile):
    """Count the words of a text file and print them in alphabetical order.

    A word is any maximal run of the letters a-z in the lowercased text, so
    digits, apostrophes and punctuation act as separators.

    Every word is printed to stdout as the word, a space, a tab and its count
    (the same bytes the Python 2 statement ``print word,'\\t',count`` writes).

    Parameters
    ----------
    pathToFile : str
        Path of the text file to read.

    Returns
    -------
    list of (str, int)
        Alphabetically sorted (word, count) pairs,
        e.g. [('a', 690), ('abide', 2), ...].
    """
    # Keep the accumulator local: counting into module-level containers made
    # every additional call add on top of the results of the previous one.
    counts = defaultdict(int)

    with open(pathToFile, "r") as my_text:
        # read the whole text file and count word frequencies
        for word in re.findall(r'[a-z]+', my_text.read().lower()):
            counts[word] += 1

    # words are unique keys, so sorting the items sorts by word
    pairs = sorted(counts.items())
    for word, count in pairs:
        # parenthesised single-argument form runs on Python 2 and Python 3
        print('{0} \t{1}'.format(word, count))

    return pairs

wordcount(pathToFile)

Overwriting alice_words.py


In [11]:
!python alice_words.py 'alicesTextFilename.txt' > alice_words.txt

### Pretty print top 10 results from alice_words.txt

In [49]:
def top_n_printer(text,n):
    """Print the first n rows of a word-count file as a two-column table.

    Rows are printed in file order (the function does not sort by count),
    below a ``Word``/``Count`` header and a ruler line.

    Parameters
    ----------
    text : str
        Path of a file whose non-blank lines have the form
        ``word<TAB>count``.
    n : int
        Maximum number of rows to print.
    """
    print('{:15}{}'.format('Word', 'Count'))
    print('='*20)

    printed = 0
    with open(text) as f:
        # iterate lazily and stop after n rows instead of loading and
        # walking through the whole file
        for line in f:
            if printed >= n:
                break
            line = line.strip()
            if not line:
                # a blank line has no tab to split on; skip it
                continue
            word, count = line.split('\t')
            print('{:17}{:3d}'.format(word, int(count)))
            printed += 1
    
top_n_printer("alice_words.txt",10)    

Word           Count
a                690
abide              2
able               1
about            102
above              3
absence            1
absurd             2
accept             1
acceptance         1
accepted           2


### How many times does the word alice occur in the book?

In [13]:
%%writefile return_word.py
import re
import sys
import subprocess
from collections import defaultdict

word = sys.argv[1]
pathToFile = sys.argv[2]

def return_word(word,pathToFile):
    """Return the line(s) of a word-count file whose word equals `word`.

    The file is expected to hold one ``word<TAB>count`` record per line.
    The comparison is case-insensitive and covers the whole word field, so
    searching for 'a' does not also return 'abide', 'about', ...

    Parameters
    ----------
    word : str
        The word to look up.
    pathToFile : str
        Path of the word-count file.

    Returns
    -------
    str
        The matching line(s) including their line breaks, or '' when the
        word does not occur.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    """
    # START STUDENT CODE HW111
    target = word.strip().lower()
    matches = []
    with open(pathToFile) as counts_file:
        for line in counts_file:
            # the first tab-separated field is the word; it may carry
            # padding spaces, which are ignored for the comparison
            if line.split('\t', 1)[0].strip().lower() == target:
                matches.append(line)
    return ''.join(matches)
    # END STUDENT CODE HW111
    
print return_word(word,pathToFile)


Overwriting hw111.py


In [14]:
!python return_word.py 'alice' 'alice_words.txt'

alice 	403



##  Command Line Map Reduce Framework  

In [42]:
%%writefile pWordCount.sh
#!/bin/bash
## pWordCount.sh
## Author: James G. Shanahan
## Usage: pWordCount.sh m word inputFile
## Input:
##       m = number of processes (maps), e.g., 4
##       word = a word in quotes, e.g., "alice"
##       inputFile = a text input file
##
## Instructions: Read this script and its comments closely.
##               Do your best to understand the purpose of each command,
##               and focus on how arguments are supplied to mapper.py/reducer.py,
##               as this will determine how the python scripts take input.



## print the expected command line (called when arguments are missing)
usage()
{
    echo ERROR: Missing arguments
    echo
    echo To run use
    echo "pWordCount.sh m word inputFile"
    echo Input:
    echo "number of processes/maps, EG, 4"
    echo "word = a word in quotes, e.g., 'alice'"
    echo "inputFile = a text input file"
}

## all three positional arguments (m, word, inputFile) are required;
## with fewer, the commands below would run with an empty file name
if [ $# -lt 3 ]
  then
    usage
    exit 1
fi
    
## collect user input
m=$1 ## the number of parallel processes (maps) to run

word=$2 ## if set to "*", then all words are used

## a text file
data=$3

## remove leftovers of an earlier, interrupted run so that stale
## .counts/.sorted files are not picked up by the chunk globs below
\rm -f "$data".chunk.*

## 'wc -l' reading from STDIN prints only the number of lines (no file name);
## 'tr' strips the padding blanks some wc implementations add
linesindata=`wc -l < "$data" | tr -d ' '`

## determine the lines per chunk for the desired number of processes
linesinchunk=`echo "$linesindata/$m+1" | bc`

## split the original file into chunks by line
split -l "$linesinchunk" "$data" "$data.chunk."

## assign python mappers (mapper.py) to the chunks of data
## and emit their output to temporary files
for datachunk in "$data".chunk.*; do
    ## feed the word to the python mapper and redirect STDOUT to a temporary file on disk;
    ## '&' runs each mapper in the background so that the chunks are processed in parallel
    ./mapper.py "$word" < "$datachunk" > "$datachunk.counts" &
done
## wait for the mappers to finish their work
wait

###----------------------------------------------------------------------------------------
## sort every mapper output by its first field (the word) so that records
## with the same key are adjacent within that file
for wordcount in "$data".chunk.*.counts; do
    sort -k1,1 "$wordcount" > "$wordcount.sorted"
done
## NOTE(review): each chunk is sorted on its own and the sorted files are then
## concatenated, so equal keys are adjacent only within one chunk. With m > 1,
## a reducer that merges adjacent records emits per-chunk partial counts.
## 'sort -m -k1,1' instead of 'cat' below would collate across chunks; this is
## left unchanged because the notebook discusses exactly this behaviour.
###----------------------------------------------------------------------------------------

## feed the sorted count files (in glob order) to the python reducer
## and redirect STDOUT to disk
cat "$data".chunk.*.sorted | ./reducer.py > "$data.output"

## clean up the data chunks and temporary count files
\rm "$data".chunk.*

## display the content of the output file:
cat "$data.output"

Overwriting pWordCount.sh


In [26]:
!head pWordCount.sh

#!/bin/bash
## pWordCount.sh
## Author: James G. Shanahan
## Usage: pWordCount.sh m wordlist testFile.txt
## Input:
##       m = number of processes (maps), e.g., 4
##       word = a word in quotes, e.g., "alice"
##       inputFile = a text input file
##
## Instructions: Read this script and its comments closely.


In [27]:
# Change the execution privileges to make the shell script executable by all
!chmod a+x pWordCount.sh

### Test the framework without parameters:

In [28]:
! ./pWordCount.sh

ERROR: No arguments supplied

To run use
pWordCount.sh m word inputFile
Input:
number of processes/maps, EG, 4
word = a word in quotes, e.g., 'alice'
inputFile = a text input file


### Run the following two cells to generate mapper and reducer files, then run the shell script again with arguments.

In [16]:
%%writefile mapper.py
#!/usr/bin/python
import sys
import re
# The text is lowercased before counting, so the search word has to be
# lowercased as well; otherwise an argument such as 'Alice' never matches.
findword = sys.argv[1].lower()
# Count all occurrences of the word over the lines read from STDIN.
# Note: str.count() counts substrings, so 'alice' is also found inside 'malice'.
count = sum(line.lower().count(findword) for line in sys.stdin)
print(count)

Overwriting mapper.py


In [17]:
%%writefile reducer.py
#!/usr/bin/python
## Description: reducer code for HW1.2
import sys
import re
# Every input line carries one partial count emitted by a mapper;
# convert them to integers, add them up and emit the grand total.
partial_counts = [int(line) for line in sys.stdin]
total = sum(partial_counts)

print(total)

Overwriting reducer.py


### Make the files executable:

In [18]:
!chmod a+x mapper.py
!chmod a+x reducer.py

### Test the framework with parameters:

In [8]:
!./pWordCount.sh 4 'alice' 'alicesTextFilename.txt'

403


## WordCount via Command Line Map Reduce Framework 

* mapper.py counts all occurrences of a single word.
* reducer.py sums the count value from the collated records for each  word.
* pWordCount.sh, runs mapper.py and reducer.py and sorts the key-value pair records by key from the mappers.



Here, mapper.py reads in a portion of the Alice in Wonderland data (each row being a single record),
counts the number of occurrences of the word in question, and prints/emits a count to the output stream. The reducer is responsible for reading in the counts of the word from the input stream and summing them before printing that summary to the output stream.
See the example [notebook](http://nbviewer.jupyter.org/urls/dl.dropbox.com/s/5zq0faibmvtjlbr/DivideAndConquer2-python-Plus-CmdLine.ipynb).
See video section 1.12.1, Poor Man's MapReduce Using Command Line (Part 2), located at:
https://learn.datascience.berkeley.edu/mod/page/view.php?id=10961

In [2]:
%%writefile mapper.py
#!/usr/bin/python

# START STUDENT CODE HW12MAPPER

from collections import defaultdict
from itertools import repeat
import re
import sys

# --------------------- function definitions --------------------------- #

def dictionary_builder(text,findword):
    '''Build a default dictionary of words and corresponding counts.

    Matching ignores case, but every match is counted under the spelling
    found in the text, so 'Alice' and 'alice' end up as separate keys.

    Parameters:
    -----------
    text: a text file object (any iterable of lines)
    findword: a string, if it is '*', the function looks for all of the words in text;
              otherwise it is matched literally as a whole word

    Returns:
    -------
    A defaultdict with the matched words as keys and their counts as values.
    '''
    # define regular expression to look for
    if findword == '*':
        pattern = re.compile(r'[a-z]+', re.IGNORECASE)
    else:
        # re.escape() makes regex metacharacters in the search word literal
        # (e.g. '.' or '+'); the lookarounds reject matches that are part of
        # a longer word
        pattern = re.compile(
            r'(?<![\w]){0}(?![\w])'.format(re.escape(findword)),
            re.IGNORECASE)

    # count word frequencies line by line, without loading the whole text
    wordCounts = defaultdict(int)
    for line in text:
        for this_word in pattern.findall(line):
            wordCounts[this_word] += 1
    return wordCounts
    
def word_emitter(word):
    '''Write one record to stdout: the word, a space, a tab and its count.

    The count is looked up in the module-level wordCounts dictionary.
    '''
    record = '{0} \t{1}\n'.format(word, wordCounts[word])
    sys.stdout.write(record)

# ---------------------- run -------------------- #    
    
findword = sys.argv[1]
# construct word count dictionary
wordCounts = dictionary_builder(sys.stdin,findword)
# emit the words and counts to stdout; an explicit loop is used because
# map() is lazy on Python 3 and would never call the emitter there
for emitted_word in wordCounts:
    word_emitter(emitted_word)
    
# END STUDENT CODE HW12MAPPER    

Overwriting mapper.py


In [120]:
%%writefile reducer.py
#!/usr/bin/python

# START STUDENT CODE HW12REDUCER
import sys

# Sums the counts of adjacent records that share the same (lowercased) word.
# The input is expected to be sorted so that equal words follow each other;
# records of the same word that are not adjacent are emitted separately.
this_word = None   # word of the group currently being summed
total = 0          # running count of that group
for word_count in sys.stdin:
    if not word_count.strip():
        # a blank line has no tab to split on; skip it
        continue
    word, count = word_count.split('\t')
    # cast count to integer from string
    count = int(count)
    # turn all words lowercase so that upper and lower case instances are summed together
    word = word.lower()

    if this_word == word:
        # same word as the previous record: extend the current group
        total += count
    else:
        # a new word starts: emit the finished group (if any) and start over
        if this_word is not None:
            print('{0}\t{1}'.format(this_word, total))
        total = count
        this_word = word

# emit the last group; nothing is printed when the input was empty
if this_word is not None:
    print('{0}\t{1}'.format(this_word, total))
# END STUDENT CODE HW12REDUCER


Overwriting reducer.py


In the next cell use the Unix chmod command to change the permissions of the mapper/reducer using the following commands:

In [None]:
!chmod +x mapper.py; 
!chmod +x reducer.py

### Run the command below with 1 mapper; you should get the same result as before. With more than one mapper, the result will differ.


In [3]:
!./pWordCount.sh 1 "alice" 'alicesTextFilename.txt'

alice 	403


In [None]:
!./pWordCount.sh 1 "*" 'alicesTextFilename.txt'

In [53]:
top_n_printer('alicesTextFilename.txt.output',10)

Word           Count
a                690
abide              2
able               1
about            102
above              3
absence            1
absurd             2
accept             1
acceptance         1
accepted           2


In [None]:
!./pWordCount.sh 2 "*" 'alicesTextFilename.txt'

In [109]:
top_n_printer('alicesTextFilename.txt.output',10)

Word           Count
a                391
abide              1
able               1
about             49
above              2
absurd             2
acceptance         1
accounts           1
accustomed         1
across             2


If the number of mappers is higher than one and the word counter script takes only a single word as its argument, the script returns the same result as with only one mapper, because every temporary file contains only rows for that one word, so the partial sums can simply be added together.

However, when the number of mappers is higher than one and the word counter script takes all of the words as its argument, the script emits the partial results sorted within each temporary file that the mappers produced: the sorted temporary files are concatenated rather than merged, so rows for the same word coming from different mappers are not adjacent and the reducer does not add them together.

##  Count words starting with uppercase and words starting with lowercase

In [None]:
%%writefile reducer.py
#!/usr/bin/python

import sys

# Tallies how many word occurrences start with an uppercase letter and how
# many do not, from 'word<TAB>count' records read from STDIN.
total = 0       # occurrences of all words
upper_sum = 0   # occurrences of words whose first character is uppercase
for word_count in sys.stdin:
    if not word_count.strip():
        # a blank line has no tab to split on; skip it
        continue
    word, count = word_count.split('\t')
    count = int(count)
    # word[:1] is '' for an empty key, so it cannot raise an IndexError
    if word[:1].isupper():
        upper_sum += count
    total += count

# everything that does not start with an uppercase letter is reported as lowercase
print('Number of words starting with\n\n\tlowercase letters: {lower}\n\tuppercase letters: {upper}'.format(lower=total-upper_sum,upper=upper_sum))


In [None]:
!./pWordCount.sh 1 "*" 'alicesTextFilename.txt'