In [1]:
""" Return a list of lines of charaters in filename
    filename: a string giving the path (directory and name) of the file to read
    if the file cannot be opened, an error message is printed and None is returned
"""
def read_file(filename):
    """ Return a list of the lines of text in filename (line endings kept).
        filename: a string giving the path of the file to read
        Returns None, after printing an error message, if the file cannot be
        opened, read or decoded.
    """
    try:
        with open(filename, 'r') as reader:
            return reader.readlines()
    # Only I/O and decoding problems mean "file unavailable" (UnicodeDecodeError
    # is a ValueError).  A bare ``except`` would also swallow KeyboardInterrupt
    # and SystemExit and hide genuine programming errors.
    except (OSError, ValueError):
        print('*** Error! fail to open ', filename)
        return None

""" Return a string free of non alpha-numeric characters, each replaced by a space
    string: a series of characters of which punctuation characters need removing
"""
def remove_punctuation(string):
    """ Return a copy of string in which every character that is not
        alpha-numeric (punctuation, whitespace, newlines, ...) is replaced by
        a single space.
        string: the text to clean
    """
    # One pass over the text: each character is kept or swapped for a space,
    # so the cost stays linear however many distinct symbols the text holds.
    return ''.join(char if char.isalnum() else ' ' for char in string)

""" Return a string free of capitalized characters, each replaced by lower case
    string: a series of characters of which characters need lowering case
"""
def lower_case(string):
    """ Return a copy of string with every cased character in lower case.
        string: the text to convert
    """
    lowered = string.lower()
    return lowered

""" Return a defaultdict with key = word, value = count of the word in a string
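    string: the text whose whitespace-separated words are counted
    word_counts: the defaultdict(int) that receives the counts (run
                 `from collections import defaultdict` before calling the function)
    stop_words: if True, common English stop words are left out of the count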
"""
# English stop words skipped by bag_of_words(..., stop_words=True).  Kept in a
# module-level frozenset so it is built once and each lookup is a hash probe
# instead of a scan through a list.
_ENGLISH_STOP_WORDS = frozenset([
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
    "your", "yours", "yourself", "yourselves", "he", "him", "his",
    "himself", "she", "her", "hers", "herself", "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves", "what", "which",
    "who", "whom", "this", "that", "these", "those", "am", "is", "are",
    "was", "were", "be", "been", "being", "have", "has", "had", "having",
    "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
    "or", "because", "as", "until", "while", "of", "at", "by", "for",
    "with", "about", "against", "between", "into", "through", "during",
    "before", "after", "above", "below", "to", "from", "up", "down", "in",
    "out", "on", "off", "over", "under", "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how", "all", "any", "both",
    "each", "few", "more", "most", "other", "some", "such", "no", "nor",
    "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can",
    "will", "just", "don", "should", "now",
])


def bag_of_words(string, word_counts, stop_words=False):
    """ Add the count of every word in string to word_counts and return it.
        string: the text whose whitespace-separated words are counted
        word_counts: mapping word -> count, updated in place; a missing word
                     must default to 0 (e.g. collections.defaultdict(int))
        stop_words: if True, words found in the English stop-word list are
                    not counted
        Words are compared exactly and the stop-word list is lower case, so
        lower-case the text first if capitalised stop words should be skipped.
    """
    for word in string.split():
        if stop_words and word in _ENGLISH_STOP_WORDS:
            continue
        word_counts[word] += 1
    # Returned as well as updated in place, as the function has always been
    # documented to return the counts.
    return word_counts

""" Return a defaultdict with key = character, value = count of the character in a string
    string: the text whose alphabetic characters are counted
    char_counts: the defaultdict(int) that receives the counts (run
                 `from collections import defaultdict` before calling the function)
"""
def bag_of_characters(string, char_counts):
    """ Add the count of every alphabetic character in string to char_counts
        and return it.
        string: the text whose characters are counted; digits, spaces and
                punctuation are skipped
        char_counts: mapping character -> count, updated in place; a missing
                     character must default to 0 (e.g. collections.defaultdict(int))
        Upper- and lower-case letters are counted separately.
    """
    for char in string:
        if char.isalpha():
            char_counts[char] += 1
    # Returned as well as updated in place, as the function has always been
    # documented to return the counts.
    return char_counts
            
""" Print all pairs key-value of a dict in sorted order by key
    (copied from Dr. Shapiro's lecture notes dated July 15, 2020)
    dictionary: a dict of key-value pairs
"""
def print_dict(dictionary, word_gap='15s', count_gap='2d', columns=5):
    """ Print all key-value pairs of a dict as a table, sorted by key.
        (adapted from Dr. Shapiro's lecture notes dated July 15, 2020)
        dictionary: a dict of key-value pairs
        word_gap: format spec applied to each key (default '15s')
        count_gap: format spec applied to each value (default '2d')
        columns: number of key-value pairs printed per row (default 5)
        Raises ValueError if columns is smaller than 1.
    """
    if columns < 1:
        raise ValueError('columns must be at least 1')

    printed = 0
    for printed, (word, count) in enumerate(sorted(dictionary.items()), start=1):
        print(format(word, word_gap), format(count, count_gap) + ' | ', end='')
        if printed % columns == 0:
            print()
    # Close an incomplete last row so later output starts on a fresh line.
    if printed % columns != 0:
        print()

In [2]:
# Load the text to analyse.  read_file returns None (after printing an error)
# when the file cannot be read.
document_path = 'independence.txt'
document_lines = read_file(document_path)
if document_lines is None:
    # Stop here with a clear message instead of a TypeError from len(None).
    raise RuntimeError('cannot continue without {}'.format(document_path))
print('The document has total {} lines.'.format(len(document_lines)))

The document has total 63 lines.


In [3]:
from collections import defaultdict

# Words in the document: tally every word and every letter.
word_counts = defaultdict(int)
char_counts = defaultdict(int)

for line in document_lines:
    # Normalise first so that "The" and "the," are counted as the same word.
    line = lower_case(remove_punctuation(line))
    bag_of_characters(line, char_counts)
    bag_of_words(line, word_counts)

print_dict(word_counts)

a               16 | abdicated        1 | abolish          1 | abolishing       3 | absolute         3 | 
absolved         1 | abuses           1 | accommodation    1 | accordingly      1 | accustomed       1 | 
acquiesce        1 | act              1 | acts             2 | administration   1 | affected         1 | 
after            1 | against          2 | ages             2 | all             10 | allegiance       1 | 
alliances        1 | alone            1 | already          1 | alter            2 | altering         1 | 
america          2 | among            5 | amongst          1 | amount           1 | an               5 | 
and             56 | annihilation     1 | another          1 | answered         1 | any              2 | 
appealed         1 | appealing        1 | appropriations   1 | arbitrary        1 | are              9 | 
armed            1 | armies           2 | arms             1 | as               4 | assembled        1 | 
assent           4 | assume           1 | at  

In [4]:
# Words in the document without the English stop-words
from collections import defaultdict

# Start from empty tallies so the counts of the previous cell are not mixed in.
word_counts = defaultdict(int)
char_counts = defaultdict(int)

for line in document_lines:
    line = lower_case(remove_punctuation(line))
    bag_of_characters(line, char_counts)
    # stop_words=True makes bag_of_words leave out its English stop-word list.
    bag_of_words(line, word_counts, stop_words=True)

print_dict(word_counts)

abdicated        1 | abolish          1 | abolishing       3 | absolute         3 | absolved         1 | 
abuses           1 | accommodation    1 | accordingly      1 | accustomed       1 | acquiesce        1 | 
act              1 | acts             2 | administration   1 | affected         1 | ages             2 | 
allegiance       1 | alliances        1 | alone            1 | already          1 | alter            2 | 
altering         1 | america          2 | among            5 | amongst          1 | amount           1 | 
annihilation     1 | another          1 | answered         1 | appealed         1 | appealing        1 | 
appropriations   1 | arbitrary        1 | armed            1 | armies           2 | arms             1 | 
assembled        1 | assent           4 | assume           1 | attempts         1 | attend           1 | 
attentions       1 | authority        1 | away             1 | bands            1 | barbarous        1 | 
bear             1 | become           1 | beco

In [5]:
# Show the letter counts gathered in char_counts, using a 5-character key
# column and a 4-character count column instead of print_dict's defaults.
print_dict(char_counts, word_gap='5s', count_gap='4d')

a      483 | b       95 | c      186 | d      254 | e      867 | 
f      182 | g      130 | h      352 | i      454 | j       16 | 
k       14 | l      229 | m      146 | n      488 | o      517 | 
p      138 | q        6 | r      428 | s      481 | t      648 | 
u      212 | v       74 | w       97 | x        9 | y       81 | 
z        4 | 