In [24]:
reset -fs

In [1]:
# Load the Shakespeare data package. If the import fails because the module is
# missing, shell out to `quilt install` once and retry the import.
try: 
    from quilt.data.BrianSpiering.shakespeare import shakespeare
except ModuleNotFoundError:
    import os
    # NOTE(review): the exit status of os.system is ignored, so a failed install
    # only surfaces when the retry import below raises.
    os.system('quilt install BrianSpiering/shakespeare')
    from quilt.data.BrianSpiering.shakespeare import shakespeare

In [2]:
# Read the entire data file into a single string.
# NOTE(review): presumably shakespeare._data() returns a file path; no encoding is
# passed to open(), so the platform default applies — confirm it matches the file.
with open(shakespeare._data()) as f:
    text = f.read()

In [3]:
import re 

In [4]:
def words(text):
    "Return every run of ASCII letters in the lowercased text, in order of appearance."
    lowered = text.lower()
    letter_runs = re.compile('[a-z]+')
    return letter_runs.findall(lowered)

In [84]:
# Tokenize the corpus and peek at the first five tokens.
# NOTE: this rebinds `text` from the raw string to a list of tokens, so the cell is
# not re-runnable: words() calls .lower(), which a list does not have.
text = words(text)
text[:5]

['the', 'sonnets', 'by', 'william', 'shakespeare']

In [85]:
import random 

In [86]:
def sample(bag, n=10):
    "Pick n entries from the bag at random (without replacement) and join them with spaces."
    chosen = random.sample(bag, k=n)
    sentence = ' '.join(chosen)
    return sentence

In [87]:
# Ten random samples of 10 words each (sample's default n) from the token list.
# No random seed is set in the visible cells, so the output differs between runs.
[sample(text) for _ in range(10)]

['ready it what thersites shine have morris leave hast perhaps',
 'cleopatra a meet i sea and well tree straight off',
 'in been i up lord hath his it the sicinius',
 'to world lay d to be me bonfires blood of',
 'better a of clay d man after nurse signet with',
 'poor leap that a several lawyer best you counterfeit and',
 'wilt her stand deeply break by proteus act guiltiness a',
 'married render veil france he the is th love spent',
 'thy mire amorous would and proteus a of with all',
 'bleeding to tooth oft did thou i might but messenger']

__This is a unigram model. It generates data based on single-token frequencies.__

-----

In [88]:
from collections import Counter
from pprint import pprint

In [89]:
# Count how often each token occurs, then show the ten most frequent tokens.
counts = Counter(text)
pprint(counts.most_common(10)) 

[('the', 27595),
 ('and', 26735),
 ('i', 22538),
 ('to', 19771),
 ('of', 18132),
 ('a', 14725),
 ('you', 13826),
 ('my', 12490),
 ('that', 11535),
 ('in', 11112)]


In [90]:
# The ten least frequent tokens: the tail of the full frequency-ordered list.
by_frequency = counts.most_common()
pprint(by_frequency[-10:])

[('extincture', 1),
 ('daffed', 1),
 ('plenitude', 1),
 ('cautels', 1),
 ('hurting', 1),
 ('preached', 1),
 ('unexperient', 1),
 ('hovered', 1),
 ('lovered', 1),
 ('glowed', 1)]


In [91]:
# Raw counts for a handful of words. `counts` is a Counter, so a word that never
# occurred is reported as 0 instead of raising KeyError.
print(f'{"word":20}  {"count"}')
print('-'*30)
for word in words('there are common and neverseen words'):
    print(f'{word:20}  {counts[word]:,}')

word                  count
------------------------------
there                 2,210
are                   3,880
common                154
and                   26,735
neverseen             0
words                 421


In [92]:
# TODO: calculate the probability of the words
# Relative frequency of each word: its count divided by the total number of tokens.
# The total does not change inside the loop, so it is computed once up front
# instead of re-summing every count for every word.
total_tokens = sum(counts.values())
print(f'{"word":20}  {"probability"}')
print('-'*30)
for word in words('there are common and neverseen words'):
    print(f'{word:20}  {counts[word]/total_tokens:.2}')

word                  probability
------------------------------
there                 0.0024
are                   0.0042
common                0.00017
and                   0.029
neverseen             0.0
words                 0.00045


In [93]:
# TODO: Turn that into a function
def word_prob(counts: dict, word: str)-> float:
    """Estimate the probability of a word as its relative frequency in `counts`.

    Parameters:
        counts: mapping from word to number of occurrences (a Counter or a plain dict).
        word: the word to look up.

    Returns:
        count(word) / total count. A word missing from `counts` gets 0.0, and so
        does every word when `counts` is empty (no evidence at all).
    """
    N = sum(counts.values())
    if N == 0:
        # Nothing has been counted; avoid dividing by zero.
        return 0.0
    # .get keeps plain dicts working: only Counter returns 0 for a missing key.
    return counts.get(word, 0) / N

In [94]:
# Spot-check word_prob against the values expected for this corpus (4 decimal places).
assert round(word_prob(counts, "the"), 4)  == 0.0298
assert round(word_prob(counts, "king"), 4) == 0.0033

In [95]:
# Probability table for the same sample words, this time computed through word_prob.
print(f'{"word":20}  {"probability"}')
print('-'*30)
for word in words('there are common and neverseen words'):
    print(f'{word:20}  {word_prob(counts, word):.2}')

word                  probability
------------------------------
there                 0.0024
are                   0.0042
common                0.00017
and                   0.029
neverseen             0.0
words                 0.00045


----

Now, what is the probability of a *sequence* of words?  Use the definition of a joint probability:

$P(w_1 \ldots w_n) = P(w_1) \times P(w_2 \mid w_1) \times P(w_3 \mid w_1 w_2) \ldots  \times \ldots P(w_n \mid w_1 \ldots w_{n-1})$

The *bag of words* model assumes that each word is drawn from the bag *independently* of the others.  This gives us the wrong approximation:
    
$P(w_1 \ldots w_n) = P(w_1) \times P(w_2) \times P(w_3) \ldots  \times \ldots P(w_n)$

It is wrong, but it is a good enough approximation to move forward.

In [96]:
# numpy.product was deprecated in NumPy 1.25 and removed in NumPy 2.0; numpy.prod is
# the supported name. Alias it so every later call to product(...) keeps working.
from numpy import prod as product

In [97]:
def prob_words_in_phrase(phrase):
    "Unigram probability of a phrase: the product of word_prob for each of its tokens, using the global counts."
    token_probs = []
    for token in words(phrase):
        token_probs.append(word_prob(counts, token))
    return product(token_probs)

In [98]:
# Example phrases: repeated common words, a real three-word sequence, and a phrase
# containing a token that does not occur in the corpus.
phrases = ['the',
           'the the',
           'the the the', 
           'the sonnets by',
           'this is a neverbeforeseen word']

# Print each phrase with its unsmoothed probability (first column holds phrases).
print(f'{"word":30}  {"probability"}')
print('-'*50)
for phrase in phrases:
    print(f'{phrase:30}  {prob_words_in_phrase(phrase):.6}')

word                            probability
--------------------------------------------------
the                             0.029791
the the                         0.000887505
the the the                     2.64397e-05
the sonnets by                  7.71506e-10
this is a neverbeforeseen word  0.0


TODO: Why is `the the the` so likely? What would we have to add to our model to reduce the likelihood of nonsense phrases?

"the" is very frequent, and the model calculates the joint probability as the product of the independent word probabilities, so repeating a frequent word still scores highly.

We should add bigram frequencies or a grammar model to reduce the likelihood of nonsense phrases.

-----
TODO: Why is there zero probability for a sentence with a never-before-seen word?

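This is not arithmetic underflow (https://en.wikipedia.org/wiki/Arithmetic_underflow): the unseen word has a count of 0, so its estimated probability is exactly 0, and multiplying by it makes the whole product 0. Underflow is a separate risk for very long phrases; the fix for unseen words is smoothing, shown below.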

In [99]:
def word_prob_smoothed(counts: dict, word: str)-> float:
    """Estimate the probability of a word with Laplace (add-one) smoothing.

    Parameters:
        counts: mapping from word to number of occurrences (a Counter or a plain dict).
        word: the word to look up; it does not have to be present in `counts`.

    Returns:
        (count(word) + 1) / (total count + number of distinct words), so a word
        that was never seen still gets a small non-zero probability.

    Raises:
        ZeroDivisionError: if `counts` is empty (the denominator is then 0).
    """
    N = sum(counts.values())
    # .get keeps plain dicts working for unseen words, which is exactly the case
    # smoothing is meant to handle; only Counter returns 0 for a missing key.
    return (counts.get(word, 0) + 1) / (N + len(counts))

In [100]:
# Spot-check the smoothed estimate against the values expected for this corpus (4 decimal places).
assert round(word_prob_smoothed(counts, "the"), 4)  == 0.0291
assert round(word_prob_smoothed(counts, "king"), 4) == 0.0032

In [101]:
# Probability table for the sample words using the smoothed estimate; the unseen
# word now gets (0 + 1) / (N + vocabulary size) instead of 0.
print(f'{"word":20}  {"probability"}')
print('-'*30)
for word in words('there are common and neverseen words'):
    print(f'{word:20}  {word_prob_smoothed(counts, word):.2}')

word                  probability
------------------------------
there                 0.0023
are                   0.0041
common                0.00016
and                   0.028
neverseen             1.1e-06
words                 0.00044


In [102]:
def prob_words_smoothed_in_phrase(phrase):
    "Unigram probability of a phrase: the product of word_prob_smoothed for each of its tokens, using the global counts."
    token_probs = []
    for token in words(phrase):
        token_probs.append(word_prob_smoothed(counts, token))
    return product(token_probs)

In [103]:
# Print each example phrase with its smoothed probability (first column holds phrases).
print(f'{"word":30}  {"probability"}')
print('-'*50)
for phrase in phrases:
    print(f'{phrase:30}  {prob_words_smoothed_in_phrase(phrase):.6}')

word                            probability
--------------------------------------------------
the                             0.0290542
the the                         0.000844145
the the the                     2.45259e-05
the sonnets by                  8.58927e-10
this is a neverbeforeseen word  6.36721e-16


<br>
<br> 
<br>

----