In [20]:
'''urllib.request - To download the KJV Bible from a URL'''

import urllib.request

'''Step 1: Fetching the KJV Bible from Project Gutenberg and saving it locally as pg10.txt'''
source_url = "https://www.gutenberg.org/cache/epub/10/pg10.txt"
local_path = "pg10.txt"
urllib.request.urlretrieve(source_url, local_path)

'''
Step 2: Loading the entire contents of pg10.txt into the string variable pg10
    (the file is decoded as UTF-8 and closed automatically by the 'with' block)
'''
with open(local_path, mode="r", encoding='utf-8') as bible_file:
    pg10 = bible_file.read()

'''
Step 3: Counting the lines of pg10 by breaking the string at its line boundaries
    and tallying the resulting pieces
'''
num_lines = sum(1 for _ in pg10.splitlines())
print(f"The string has {num_lines} lines.")

The string has 99968 lines.


In [25]:
'''
Step 4: Counting the number of words in pg10.
    A word is a run of directly-adjacent alphabetic characters (as judged by 'isalpha'); any other
    character ends the current word.
    Each word is converted to lower-case so that later comparisons are not case-sensitive.
    The total is then the length of the resulting 'words' list.
'''
# Replace every non-letter with a space; splitting on whitespace then yields
# exactly the runs of letters, with no empty entries.
letters_only = "".join(ch if ch.isalpha() else " " for ch in pg10)
words = [run.lower() for run in letters_only.split()]

num_words = len(words)
print(f"The string has {num_words} words.")

The string has 795227 words.


In [27]:
'''
Step 5: Counting how many tokens in pg10 contain 'apostle'.
    pg10 is tokenized again here (the 'words' list from Step 4 is not reused) so that apostrophes and
    hyphens stay inside a token, eg. "apostle's" is kept as one token instead of being split.
    Each token is lower-cased, so the match is not case-sensitive.
    Any token containing 'apostle' counts once, eg. 'apostle', 'apostles', "apostle's".
    Punctuation such as '.', ',' or '!' is never part of a token because it ends the token, so a word
    beside a punctuation mark eg. 'apostles!' is matched without any extra stripping.
'''

# Characters other than letters, apostrophes and hyphens become spaces; splitting
# on whitespace then yields the tokens, including the last one in the text.
token_text = "".join(ch if (ch.isalpha() or ch in ("'", "-")) else " " for ch in pg10)
apostle = [token.lower() for token in token_text.split()]

apostle_count = sum(1 for token in apostle if "apostle" in token)
print("The word 'apostle' appears " + str(apostle_count) + " times.")

The word 'apostle' appears 113 times.


In [28]:
'''
Step 6: Identifying the most common word:
        First, an empty dictionary is created to hold the number of occurrences of each word.
        Second, every entry of 'words' is visited once and its tally is raised by 1; a word seen for
        the first time starts from 0, so its tally becomes 1.
        Lastly, 'max' picks the dictionary key whose tally is the highest.
'''
word_count = {}
for token in words:
    # dict.get supplies 0 for a word that has not been recorded yet
    word_count[token] = word_count.get(token, 0) + 1

most_common_word = max(word_count, key=lambda candidate: word_count[candidate])
print(f"The most common word is '{most_common_word}'.")

The most common word is 'the'.
