### Zipf's Law

In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk

In [3]:
# Fetch the 'brown' package into the local nltk_data directory so that
# nltk.corpus.brown can be read in the cells below.
nltk.download('brown')

[nltk_data] Downloading package brown to /home/avizyt/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [4]:
from nltk.corpus import brown 

In [5]:
# Peek at the first ten word tokens of the corpus.
brown.words()[:10]

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']

In [6]:
# The first ten tokens again, this time as (word, tag) pairs.
brown.tagged_words()[:10]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN')]

In [7]:
# Total number of tokens returned by brown.words().
len(brown.words())

1161192

In [8]:
from collections import Counter
# Punctuation tokens to leave out of the word counts.
puncs = {
    ',', '.', '--', '-', '!', '?', ':', ';',
    '``', "''", '(', ')', '[', ']',
}

In [9]:
# Lazily lower-case every Brown token that is not a punctuation token.
# The punctuation check runs on the token as it appears in the corpus,
# before lower-casing.
word_list = (token.lower() for token in brown.words() if token not in puncs)

# Tally the occurrences of each lower-cased token (this consumes the generator).
token_counts = Counter()
token_counts.update(word_list)

In [10]:
# The 20 most frequent lower-cased tokens together with their counts.
token_counts.most_common(20)

[('the', 69971),
 ('of', 36412),
 ('and', 28853),
 ('to', 26158),
 ('a', 23195),
 ('in', 21337),
 ('that', 10594),
 ('is', 10109),
 ('was', 9815),
 ('he', 9548),
 ('for', 9489),
 ('it', 8760),
 ('with', 7289),
 ('as', 7253),
 ('his', 6996),
 ('on', 6741),
 ('be', 6377),
 ('at', 5372),
 ('by', 5306),
 ('i', 5164)]

Total word count: tokenize the two kite texts (intro and history) and count the tokens in each.

In [11]:
from nlpia.data.loaders import kite_text, kite_history

In [12]:
# Lower-cased copy of the intro text; it is tokenized and counted in later cells.
kite_intro = kite_text.lower()

In [14]:
from nltk.tokenize import TreebankWordTokenizer
# A single tokenizer instance, reused for both kite texts.
tokenizer = TreebankWordTokenizer()

In [15]:
# Token list for the lower-cased intro text.
intro_token = tokenizer.tokenize(kite_intro)

In [16]:
# NOTE: this rebinds the imported name `kite_history` to its lower-cased form,
# so the original-case text is no longer reachable under that name.
kite_history = kite_history.lower()
# Token list for the lower-cased history text.
history_tokens = tokenizer.tokenize(kite_history)

In [18]:
# Number of tokens in the intro text; used as the denominator of its term frequencies.
intro_total = len(intro_token)
intro_total

363

In [19]:
# Number of tokens in the history text; used as the denominator of its term frequencies.
history_total = len(history_tokens)
history_total

297

Computing term frequency (TF): a token's count divided by the total number of tokens in its document.

In [20]:
# Per-document term-frequency tables keyed by token; later cells fill them in.
intro_tf, history_tf = {}, {}

# Raw occurrence count of every token in the intro text.
intro_count = Counter()
intro_count.update(intro_token)

In [21]:
# Term frequency = occurrences of the token / total tokens in the document.
intro_tf['kite'] = intro_count['kite'] / intro_total

In [22]:
# Raw occurrence count of every token in the history text.
history_count = Counter(history_tokens)
# Term frequency of 'kite' in the history text: its count over the total token count.
history_tf['kite'] = history_count['kite'] / history_total

In [25]:
# Report both term frequencies, formatted to four decimal places.
print(f"Term Freq of 'Kite' in intro is {intro_tf['kite']:.4f}")
print(f"Term Freq of 'Kite' in history is {history_tf['kite']:.4f}")

Term Freq of 'Kite' in intro is 0.0441
Term Freq of 'Kite' in history is 0.0202


In [42]:
def term_freq(tok):
    """Return the term frequency of ``tok`` in the intro and history texts.

    A term frequency is the token's occurrence count divided by the total
    number of tokens in the document. As a side effect, both values are
    stored under ``tok`` in the module-level ``intro_tf`` and ``history_tf``
    dicts.

    Parameters
    ----------
    tok
        Token to look up in ``intro_count`` and ``history_count``. A token
        that does not occur in a document has a count of 0 and therefore a
        frequency of 0.0.

    Returns
    -------
    tuple
        ``(intro_frequency, history_frequency)``.
    """
    # Compute and record the intro value first, then the history value.
    intro_share = intro_tf[tok] = intro_count[tok] / intro_total
    history_share = history_tf[tok] = history_count[tok] / history_total
    return intro_share, history_share

In [46]:
tok = 'and'
# Call term_freq once and unpack the pair, instead of recomputing (and
# re-writing the tf dicts) separately for each print.
intro_freq, history_freq = term_freq(tok)
print(f"Term Freq of '{tok}' in intro is {intro_freq:.4f}")
print(f"Term Freq of '{tok}' in history is {history_freq:.4f}")

Term Freq of 'and' in intro is 0.0275
Term Freq of 'and' in history is 0.0303


In [47]:
# Unrounded (intro, history) term frequencies for 'and'.
term_freq('and')

(0.027548209366391185, 0.030303030303030304)

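Computing inverse document frequency (IDF): the total number of documents divided by the number of documents that contain the term.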

In [37]:
# Count how many of the two token lists contain 'and' at least once.
numberOfDocContaingAnd = 0
for doc in (intro_token, history_tokens):
    numberOfDocContaingAnd += int('and' in doc)

In [48]:
# 'china' does not occur in the intro text, so its intro term frequency is 0.0.
term_freq('china')

(0.0, 0.010101010101010102)

In [49]:
num_docs = 2
intro_idf = {}
history_idf = {}

# IDF = number of documents / number of documents containing the term.
# The document count has to be taken per term: reusing the count for 'and'
# would give 'china' (which occurs in only one of the two texts) the same
# IDF as a word that occurs in both.
for term in ('and', 'kite', 'china'):
    docs_with_term = sum(term in doc for doc in (intro_token, history_tokens))
    term_idf = num_docs / docs_with_term
    # IDF is a property of the corpus, so both documents get the same value.
    intro_idf[term] = term_idf
    history_idf[term] = term_idf



In [54]:
intro_tfidf = {}
history_tfidf = {}

# TF-IDF = tf * log(idf). The log applies to the IDF factor only; both
# documents use the same formula so that their scores are comparable.
intro_tfidf['and'] = intro_tf['and'] * np.log(intro_idf['and'])
history_tfidf['and'] = history_tf['and'] * np.log(history_idf['and'])

# 'and' occurs in both documents, so idf == 1, log(idf) == 0 and both scores are 0.
print(intro_tfidf['and'])
print(history_tfidf['and'])

-0.0
0.0


In [52]:
# Log of the raw term frequency of 'and' in the intro text. The TF-IDF score
# of 'and' is zero (the word occurs in both documents), and the log of zero
# is -inf, so the log is taken of the term frequency instead.
np.log(intro_tf['and'])

-3.591817741270805