# Assignment: Word Counts

Perform an advanced frequency analysis on words in a given text source.
1. Convert a text file into a string.
2. Split a string into words, excluding punctuation marks.
3. Remove stop words from the string.
4. Lemmatize the words so that each word is reduced to its base form (lemma).
5. Count the frequency of each lemma and store the results in a dictionary.
6. Convert the dictionary to a JSON file.

In [3]:
# Kresda Rattanasudsai
import json
import os
import re
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def read_text_file(file_path):
    """Return the entire contents of the text file at ``file_path``.

    The file is opened in text mode with the platform's default encoding.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises.
    with open(file_path, 'r') as source:
        return source.read()
def split_text(text):
    """Split ``text`` into a list of lowercase words.

    The text is lowercased and then split on hyphens, semicolons, commas,
    periods and whitespace; any whitespace directly after a delimiter is
    consumed together with it. Empty strings produced by adjacent or
    leading/trailing delimiters are dropped.

    Note: other punctuation (for example ``!``, ``?`` or quotation marks) is
    not a delimiter here and stays attached to the neighbouring word.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty, lowercased tokens in their original order.
    """
    tokens = re.split(r"[-;,.\s]\s*", text.lower())
    # Filter in one pass instead of repeatedly calling list.remove(''),
    # which rescans the list for every empty string.
    return [token for token in tokens if token]
def remove_stop_words(words,stop_words):
    """Return the words that are not stop words.

    Args:
        words: Iterable of words to filter.
        stop_words: Container of words to exclude; membership is tested
            with ``in``.

    Returns:
        A new list holding, in their original order, the items of ``words``
        that are absent from ``stop_words``.
    """
    return [token for token in words if token not in stop_words]
def lemmatize_words(words_clean):
    """Lemmatize every word with NLTK's ``WordNetLemmatizer``.

    Args:
        words_clean: Iterable of words to lemmatize.

    Returns:
        A new list containing the result of ``lemmatize`` for each input
        word, in the same order as the input.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token) for token in words_clean]
def compute_frequency_words(words_lemmatized):
    """Count how many times each word occurs.

    The input is consumed in a single pass, so one-shot iterators
    (generators, ``iter(...)``) are counted correctly as well as lists.

    Args:
        words_lemmatized: Iterable of hashable words.

    Returns:
        A plain ``dict`` mapping each distinct word to its number of
        occurrences, with keys in order of first appearance.
    """
    # Convert back to a plain dict so callers get the same type as before.
    return dict(Counter(words_lemmatized))
def save_words_frequency(words_frequency,file_path="data/words_frequency.json"):
    """Write the word-frequency dictionary to ``file_path`` as JSON.

    Any existing file at ``file_path`` is overwritten.

    Args:
        words_frequency: JSON-serializable mapping of word to count.
        file_path: Destination path of the JSON file.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    with open(file_path, 'w') as outfile:
        json.dump(words_frequency, outfile)

# Run the whole pipeline on data/text.txt: read the text, split it into words,
# drop English stop words, lemmatize, count, and save the counts as JSON.
stop_words = set(stopwords.words('english'))
text = read_text_file("data/text.txt")
words = split_text(text)
words_clean = remove_stop_words(words,stop_words)
words_lemmatized = lemmatize_words(words_clean)
words_frequency = compute_frequency_words(words_lemmatized)
save_words_frequency(words_frequency,file_path="data/words_frequency.json")

# Sort the (word, count) pairs from the largest count to the smallest.
sorted_frequency = sorted(words_frequency.items(), key=lambda x: x[1], reverse=True)
print("Words' Frequency Sorted by Most-Least Frequent: ", sorted_frequency)
print("\n")

# Sort the words (keys) alphabetically, ignoring case. The key compares the
# whole word; comparing only the first character would leave words that share
# an initial letter in arbitrary (insertion) order.
sorted_words = sorted(words_frequency, key=str.lower)
print("Words Sorted Alphabetically: ", sorted_words)

Words' Frequency Sorted by Most-Least Frequent:  [('little', 2), ('holmes', 2), ('week', 2), ('clearing', 2), ('time', 2), ('seen', 1), ('lately', 1), ('marriage', 1), ('drifted', 1), ('u', 1), ('away', 1), ('complete', 1), ('happiness', 1), ('home', 1), ('centred', 1), ('interest', 1), ('rise', 1), ('around', 1), ('man', 1), ('first', 1), ('find', 1), ('master', 1), ('establishment', 1), ('sufficient', 1), ('absorb', 1), ('attention', 1), ('loathed', 1), ('every', 1), ('form', 1), ('society', 1), ('whole', 1), ('bohemian', 1), ('soul', 1), ('remained', 1), ('lodging', 1), ('baker', 1), ('street', 1), ('buried', 1), ('among', 1), ('old', 1), ('book', 1), ('alternating', 1), ('cocaine', 1), ('ambition', 1), ('drowsiness', 1), ('drug', 1), ('fierce', 1), ('energy', 1), ('keen', 1), ('nature', 1), ('still', 1), ('ever', 1), ('deeply', 1), ('attracted', 1), ('study', 1), ('crime', 1), ('occupied', 1), ('immense', 1), ('faculty', 1), ('extraordinary', 1), ('power', 1), ('observation', 1), (