In [1]:
import re

In [2]:
# English stop words, copied from NLTK's stop-word list (original order kept).
# Written as whitespace-separated text and split into a list of strings.
stop_words = """
i me my myself we our ours ourselves you your yours yourself
yourselves he him his himself she her hers herself it its itself
they them their theirs themselves what which who whom this that these
those am is are was were be been being have has had having do
does did doing a an the and but if or because as until while
of at by for with about against between into through during before
after above below to from up down in out on off over under
again further then once here there when where why how all any both
each few more most other some such no nor not only own same so
than too very s t can will just don should now
""".split()

In [3]:
# Punctuation marks: twelve single-character marks, followed by the
# three-character ellipsis and the forward slash.
punctuations = list(".?\"',!:;()[]") + ["...", "/"]

In [4]:
# Sample passage that serves as the input for the sentence splitting and
# word-frequency steps. The literal begins and ends with a newline.
text = """
My name is Clark, and I will tell you about my city.

I live in an apartment. In my city, there is a post office where people mail letters. On Monday, I go to work. I work at the post office. Everyone shops for food at the grocery store. They also eat at the restaurant. The restaurant serves pizza and ice cream.

My friends and I go to the park. We like to play soccer at the park. On Fridays, we go to the cinema to see a movie. Children don't go to school on the weekend. Each day, people go to the hospital when they are sick. The doctors and nurses take care of them. The police keep everyone safe. I am happy to live in my city.
"""

In [5]:
# Split the passage into sentence fragments: each match is a maximal run of
# characters that contains no sentence terminator (".", "?" or "!").
sentences = re.findall(r"[^.?!]+", text.strip())


def trim(x):
    """Lower-case each sentence and drop its punctuation characters.

    Args:
        x: Iterable of sentence strings.

    Returns:
        A list with one cleaned string per input sentence, in input order.
        Surrounding whitespace is stripped before filtering.
    """
    cleaned = []
    for sentence in x:
        # Filtering is per character, so only the single-character entries of
        # `punctuations` can ever match. NOTE: dropping the apostrophe turns
        # "don't" into "dont", which is not listed in `stop_words`.
        kept = [ch.lower() for ch in sentence.strip() if ch not in punctuations]
        cleaned.append("".join(kept))
    return cleaned


trimmed_sentences = trim(sentences)
trimmed_sentences

['my name is clark and i will tell you about my city',
 'i live in an apartment',
 'in my city there is a post office where people mail letters',
 'on monday i go to work',
 'i work at the post office',
 'everyone shops for food at the grocery store',
 'they also eat at the restaurant',
 'the restaurant serves pizza and ice cream',
 'my friends and i go to the park',
 'we like to play soccer at the park',
 'on fridays we go to the cinema to see a movie',
 'children dont go to school on the weekend',
 'each day people go to the hospital when they are sick',
 'the doctors and nurses take care of them',
 'the police keep everyone safe',
 'i am happy to live in my city']

In [6]:
def remove_stop_words(x):
    """Remove stop words from each sentence.

    Args:
        x: Iterable of sentences whose words are separated by whitespace.

    Returns:
        A list with one string per input sentence, holding the words that are
        not in `stop_words`, re-joined with single spaces.
    """
    # Build the lookup set once per call so each membership test is a hash
    # lookup instead of a scan over the whole list.
    excluded = set(stop_words)
    return [
        " ".join(word for word in sentence.split() if word not in excluded)
        for sentence in x
    ]


normalized_sentences = remove_stop_words(trimmed_sentences)
normalized_sentences

['name clark tell city',
 'live apartment',
 'city post office people mail letters',
 'monday go work',
 'work post office',
 'everyone shops food grocery store',
 'also eat restaurant',
 'restaurant serves pizza ice cream',
 'friends go park',
 'like play soccer park',
 'fridays go cinema see movie',
 'children dont go school weekend',
 'day people go hospital sick',
 'doctors nurses take care',
 'police keep everyone safe',
 'happy live city']

In [7]:
# Flatten the normalized sentences into one list of words, keeping the
# sentence order and the word order within each sentence.
list_of_words = [
    word
    for sentence in normalized_sentences
    for word in sentence.split()
]
list_of_words

['name',
 'clark',
 'tell',
 'city',
 'live',
 'apartment',
 'city',
 'post',
 'office',
 'people',
 'mail',
 'letters',
 'monday',
 'go',
 'work',
 'work',
 'post',
 'office',
 'everyone',
 'shops',
 'food',
 'grocery',
 'store',
 'also',
 'eat',
 'restaurant',
 'restaurant',
 'serves',
 'pizza',
 'ice',
 'cream',
 'friends',
 'go',
 'park',
 'like',
 'play',
 'soccer',
 'park',
 'fridays',
 'go',
 'cinema',
 'see',
 'movie',
 'children',
 'dont',
 'go',
 'school',
 'weekend',
 'day',
 'people',
 'go',
 'hospital',
 'sick',
 'doctors',
 'nurses',
 'take',
 'care',
 'police',
 'keep',
 'everyone',
 'safe',
 'happy',
 'live',
 'city']

In [8]:
def word_count(words):
    """Tally how many times each item occurs.

    Args:
        words: Iterable of hashable items (word strings in this notebook).

    Returns:
        A dict mapping each distinct item to its number of occurrences.
    """
    tally = {}
    for token in words:
        # A missing key counts as zero, so first sightings start at 1.
        tally[token] = tally.get(token, 0) + 1
    return tally

In [9]:
# Map each distinct word in list_of_words to its number of occurrences.
word_dictionary = word_count(list_of_words)
word_dictionary

{'also': 1,
 'apartment': 1,
 'care': 1,
 'children': 1,
 'cinema': 1,
 'city': 3,
 'clark': 1,
 'cream': 1,
 'day': 1,
 'doctors': 1,
 'dont': 1,
 'eat': 1,
 'everyone': 2,
 'food': 1,
 'fridays': 1,
 'friends': 1,
 'go': 5,
 'grocery': 1,
 'happy': 1,
 'hospital': 1,
 'ice': 1,
 'keep': 1,
 'letters': 1,
 'like': 1,
 'live': 2,
 'mail': 1,
 'monday': 1,
 'movie': 1,
 'name': 1,
 'nurses': 1,
 'office': 2,
 'park': 2,
 'people': 2,
 'pizza': 1,
 'play': 1,
 'police': 1,
 'post': 2,
 'restaurant': 2,
 'safe': 1,
 'school': 1,
 'see': 1,
 'serves': 1,
 'shops': 1,
 'sick': 1,
 'soccer': 1,
 'store': 1,
 'take': 1,
 'tell': 1,
 'weekend': 1,
 'work': 2}

In [10]:
# Relative frequency of each word: its occurrence count divided by the total
# number of words in list_of_words.
normalized_frequency_of_word_in_text = {
    word: occurrences / len(list_of_words)
    for word, occurrences in word_dictionary.items()
}
normalized_frequency_of_word_in_text

{'also': 0.015625,
 'apartment': 0.015625,
 'care': 0.015625,
 'children': 0.015625,
 'cinema': 0.015625,
 'city': 0.046875,
 'clark': 0.015625,
 'cream': 0.015625,
 'day': 0.015625,
 'doctors': 0.015625,
 'dont': 0.015625,
 'eat': 0.015625,
 'everyone': 0.03125,
 'food': 0.015625,
 'fridays': 0.015625,
 'friends': 0.015625,
 'go': 0.078125,
 'grocery': 0.015625,
 'happy': 0.015625,
 'hospital': 0.015625,
 'ice': 0.015625,
 'keep': 0.015625,
 'letters': 0.015625,
 'like': 0.015625,
 'live': 0.03125,
 'mail': 0.015625,
 'monday': 0.015625,
 'movie': 0.015625,
 'name': 0.015625,
 'nurses': 0.015625,
 'office': 0.03125,
 'park': 0.03125,
 'people': 0.03125,
 'pizza': 0.015625,
 'play': 0.015625,
 'police': 0.015625,
 'post': 0.03125,
 'restaurant': 0.03125,
 'safe': 0.015625,
 'school': 0.015625,
 'see': 0.015625,
 'serves': 0.015625,
 'shops': 0.015625,
 'sick': 0.015625,
 'soccer': 0.015625,
 'store': 0.015625,
 'take': 0.015625,
 'tell': 0.015625,
 'weekend': 0.015625,
 'work': 0.03125}

In [11]:
def tf(x):
    """Score each sentence by the mean normalized frequency of its words.

    Args:
        x: Iterable of sentences whose words are separated by whitespace.

    Returns:
        A list of numbers, one per sentence: the sum of each word's entry in
        `normalized_frequency_of_word_in_text` divided by the sentence's word
        count. A sentence with no words scores 0.0.
    """
    scores = []
    for sentence in x:
        words = sentence.split()
        if not words:
            # An empty sentence (e.g. one that consisted only of stop words)
            # would otherwise divide by zero.
            scores.append(0.0)
            continue
        # Words missing from the frequency table contribute 0; a bare .get()
        # would yield None and make sum() raise TypeError.
        total = sum(normalized_frequency_of_word_in_text.get(w, 0) for w in words)
        scores.append(total / len(words))
    return scores


tf(normalized_sentences)

[0.0234375,
 0.0234375,
 0.028645833333333332,
 0.041666666666666664,
 0.03125,
 0.01875,
 0.020833333333333332,
 0.01875,
 0.041666666666666664,
 0.01953125,
 0.028125,
 0.028125,
 0.03125,
 0.015625,
 0.01953125,
 0.03125]