In [1]:
from os.path import basename, exists

def download(url):
    """Make sure the file named by the last path component of `url` exists locally.

    If a file with that name is already present in the current directory,
    nothing is fetched. Otherwise the URL is retrieved into that file and a
    "Downloaded ..." message is printed.

    url: string URL to retrieve

    returns: the local file name (the basename of `url`)
    """
    filename = basename(url)
    if exists(filename):
        # Already present: skip the network round-trip.
        return filename

    from urllib.request import urlretrieve

    saved_path, _headers = urlretrieve(url, filename)
    print("Downloaded " + str(saved_path))
    return filename

# Fetch the two helper modules from the ThinkPython repository (skipped when the
# files already exist); the trailing `;` keeps the return value out of the cell output.
download('https://github.com/AllenDowney/ThinkPython/raw/v3/thinkpython.py');
download('https://github.com/AllenDowney/ThinkPython/raw/v3/diagram.py');

import thinkpython

In [2]:
# Fetch the Project Gutenberg plain-text file pg43.txt (skipped if already present).
download('https://www.gutenberg.org/cache/epub/43/pg43.txt');

In [3]:
def is_special_line(line):
    """Report whether `line` is a marker line.

    A marker line is one whose text, after surrounding whitespace is
    removed, begins with three asterisks followed by a space.

    line: string

    returns: bool
    """
    stripped = line.strip()
    return stripped[:4] == '*** '

In [4]:
def clean_file(input_file, output_file):
    """Copy the lines between the first two marker lines of a text file.

    Lines are skipped up to and including the first line for which
    `is_special_line` is true; the following lines are copied to
    `output_file` until the next such line (or the end of the file).
    If the input contains no marker line, the output file is left empty.

    Both files are opened in a `with` block so they are closed even if
    reading or writing raises part-way through.

    input_file: path of the UTF-8 text file to read
    output_file: path of the file to (over)write

    returns: None
    """
    # The output is written with the platform default encoding (no `encoding=`),
    # which matches how the later cells re-open it with a bare open(filename).
    with open(input_file, encoding='utf-8') as reader, open(output_file, 'w') as writer:
        # Phase 1: consume the header, including the opening marker line.
        for line in reader:
            if is_special_line(line):
                break

        # Phase 2: the same iterator continues where phase 1 stopped.
        for line in reader:
            if is_special_line(line):
                break
            writer.write(line)

In [5]:
# Keep only the text between the '*** ' marker lines of pg43.txt; `filename`
# is the cleaned file that the following cells read.
filename = 'dr_jekyll.txt'
clean_file('pg43.txt', filename)

In [6]:
# Collect every distinct whitespace-separated token; a dict (value 1 as a
# placeholder) is used so the keys keep their first-seen order.
unique_words = {}
# `with` closes the file when the loop finishes (a bare open() in the `for`
# header leaves the handle to be closed by the garbage collector).
with open(filename) as text_file:
    for line in text_file:
        seq = line.split()
        for word in seq:
            unique_words[word] = 1

len(unique_words)

6042

In [7]:
# The five longest raw tokens (sorted ascending by length, so the longest are last).
sorted(unique_words, key=len)[-5:]

['chocolate-coloured',
 'superiors—behold!”',
 'coolness—frightened',
 'gentleman—something',
 'pocket-handkerchief.']

In [8]:
def split_line(line):
    """Split `line` into tokens, treating an em dash like whitespace.

    line: string

    returns: list of strings
    """
    # Turning each em dash into a space lets str.split() separate the
    # words on either side of it.
    without_dashes = ' '.join(line.split('—'))
    return without_dashes.split()

split_line('coolness—frightened')

['coolness', 'frightened']

In [9]:
import unicodedata

# Look up the Unicode general category of '.'; the next cell treats every
# category that starts with 'P' as punctuation.
unicodedata.category('.')

'Po'

In [10]:
# Collect every character in the text whose Unicode category starts with 'P'.
# A dict (value 1 as a placeholder) keeps the marks in first-seen order.
punc_marks = {}
# `with` guarantees the file handle is closed once the scan is done.
with open(filename) as text_file:
    for line in text_file:
        for char in line:
            category = unicodedata.category(char)
            if category.startswith('P'):
                punc_marks[char] = 1

In [11]:
# Join the collected marks (the dict's keys) into a single string; clean_word
# passes it to str.strip as the set of characters to remove.
punctuation = ''.join(punc_marks)
print(punctuation)

.’;,-“”:?—‘!()_


In [12]:
def clean_word(word):
    """Normalize a token: trim punctuation from both ends, then lowercase.

    Only leading and trailing characters found in the global `punctuation`
    string are removed; characters inside the word are kept.

    word: string

    returns: string
    """
    trimmed = word.strip(punctuation)
    return trimmed.lower()

In [13]:
# Quick check: the curly quotes and '!' are stripped and the word is lowercased.
clean_word('“Behold!”')

'behold'

In [14]:
# Same census as before, but with em dashes split and each token cleaned,
# so variants such as 'Behold!' and 'behold' collapse to one key.
unique_words2 = {}
# `with` closes the file when the loop finishes.
with open(filename) as text_file:
    for line in text_file:
        for word in split_line(line):
            word = clean_word(word)
            unique_words2[word] = 1

len(unique_words2)

4005

In [15]:
# The five longest cleaned words (sorted ascending by length, so the longest are last).
sorted(unique_words2, key=len)[-5:]

['circumscription',
 'unimpressionable',
 'fellow-creatures',
 'chocolate-coloured',
 'pocket-handkerchief']

In [16]:
# Map each cleaned word to the number of times it occurs in the text.
word_counter = {}
# `with` closes the file when the loop finishes.
with open(filename) as text_file:
    for line in text_file:
        for word in split_line(line):
            word = clean_word(word)
            # .get(word, 0) starts unseen words at zero.
            word_counter[word] = word_counter.get(word, 0) + 1

In [17]:
def second_element(t):
    """Return the item at index 1 of `t`.

    Used as a sort key so (key, value) pairs are ordered by value.

    t: any indexable object with at least two items

    returns: t[1]
    """
    item = t[1]
    return item

In [18]:
def print_most_common(word_counter, num=5):
    """Print the entries of `word_counter` with the highest counts.

    Each output line is the count, a tab, and the key. Entries are ordered
    by count from highest to lowest; entries with equal counts keep the
    order they have in the dictionary.

    word_counter: dictionary that maps keys to counts
    num: how many entries to print
    """
    ranked = list(word_counter.items())
    ranked.sort(key=second_element, reverse=True)

    for entry in ranked[:num]:
        word, freq = entry
        print(f"{freq}\t{word}")

In [19]:
# Show the five most frequent words in the text.
print_most_common(word_counter, 5)

1614	the
972	and
941	of
640	to
640	i


In [20]:
# NOTE(review): no visible cell downloads words.txt; it must already be in
# the working directory or this cell fails on a fresh checkout.
# `with` closes the file as soon as it has been read.
with open('words.txt') as words_file:
    word_list = words_file.read().split()

In [21]:
# Turn the word list into a dict (value 1 as a placeholder) so it can be
# passed to subtract, which tests membership with `in`.
valid_words = {}
for word in word_list:
    valid_words[word] = 1

In [22]:
def subtract(d1, d2):
    """Return the entries of `d1` whose keys do not appear in `d2`.

    Neither argument is modified; the result keeps the key order of `d1`.

    d1: dictionary
    d2: dictionary (or any container supporting `in`)

    returns: new dictionary
    """
    return {key: d1[key] for key in d1 if key not in d2}

In [23]:
# Words that occur in the text but are not in the word list, with their counts.
diff = subtract(word_counter, valid_words)

In [24]:
# Most frequent words from the text that are missing from the word list.
print_most_common(diff)

640	i
628	a
128	utterson
124	mr
98	hyde


In [25]:
# Collect the words missing from the word list that occur exactly once.
singletons = []
for word, freq in diff.items():
    if freq == 1:
        singletons.append(word)

In [26]:
# Peek at the last five singletons collected.
singletons[-5:]

['gesticulated', 'abjection', 'circumscription', 'reindue', 'fearstruck']

In [27]:
import random

In [28]:
# Minimal demo: random.choice picks one element of a sequence.
t = [1, 2, 3]
random.choice(t)

3

In [29]:
# list() of a dict gives its keys; every distinct word is equally likely here,
# regardless of how often it occurs in the text.
words = list(word_counter)
random.choice(words)

'attack'

In [30]:
# The counts come from the same dict as `words`, so weights[i] belongs to words[i].
weights = word_counter.values()
# Draw six words, each chosen with probability proportional to its count.
random_words  = random.choices(words, weights=weights, k=6)
' '.join(random_words)

'at was or morning indifferent had'

In [31]:
# Maps each bigram, stored as a tuple, to how many times it has been counted.
bigram_counter = {}

def count_bigram(bigram):
    """Add one to the count of `bigram` in the global `bigram_counter`.

    bigram: sequence of words; it is converted to a tuple because a list
            cannot be used as a dictionary key
    """
    pair = tuple(bigram)
    if pair in bigram_counter:
        bigram_counter[pair] += 1
    else:
        bigram_counter[pair] = 1

In [62]:
# Sliding window holding the most recent words.
window = []

def process_word(word):
    """Push `word` into the global `window` and count the bigram it completes.

    Once the window holds two words, the pair is passed to `count_bigram`
    and the older word is dropped, leaving room for the next one.

    word: string
    """
    window.append(word)
    if len(window) != 2:
        return

    count_bigram(window)
    del window[0]

In [63]:
# Start from empty state so re-running this cell does not double the counts
# or pair the first word with a leftover word from a previous run.
bigram_counter = {}
window = []

# `with` closes the file when the loop finishes.
with open(filename) as text_file:
    for line in text_file:
        for word in split_line(line):
            word = clean_word(word)
            process_word(word)

In [64]:
# Show the most frequent bigrams (default: top five).
print_most_common(bigram_counter)

356	('of', 'the')
278	('in', 'the')
188	('it', 'was')
160	('and', 'the')
146	('to', 'the')


In [65]:
# Keys and counts come from the same dict, so weights[i] belongs to bigrams[i].
bigrams = list(bigram_counter)
weights = bigram_counter.values()
# Draw six bigrams, each chosen with probability proportional to its count.
random_bigrams = random.choices(bigrams, weights=weights, k=6)

In [66]:
# Print the sampled pairs on one line; end=' ' replaces the newline with a space.
for pair in random_bigrams:
    print(' '.join(pair), end=' ')

say that turn with answered from will just robert louis i sought 

In [67]:
# Maps each word to the list of words that have followed it (repeats kept).
successor_map = {}

def add_bigram(bigram):
    """Record the second word of `bigram` as a successor of the first.

    bigram: sequence of exactly two words
    """
    first, second = bigram
    if first not in successor_map:
        successor_map[first] = []
    successor_map[first].append(second)

In [68]:
def process_word_bigram(word):
    """Push `word` into the global `window` and record the bigram it completes.

    Once the window holds two words, the pair is passed to `add_bigram`
    and the older word is dropped, leaving room for the next one.

    word: string
    """
    window.append(word)
    if len(window) != 2:
        return

    add_bigram(window)
    del window[0]

In [38]:
# Small worked example: build the successor map for a four-line song.
song = """
Half a bee, philosophically,
Must, ipso facto, half not be.
But half the bee has got to be
Vis a vis, its entity. D'you see?
"""
# Start from empty state so bigrams from earlier cells are not mixed in.
successor_map = {}
window = []

# The song contains no em dashes, so plain str.split is used here.
for word in song.split():
    word = clean_word(word)
    process_word_bigram(word)

successor_map

{'half': ['a', 'not', 'the'],
 'a': ['bee', 'vis'],
 'bee': ['philosophically', 'has'],
 'philosophically': ['must'],
 'must': ['ipso'],
 'ipso': ['facto'],
 'facto': ['half'],
 'not': ['be'],
 'be': ['but', 'vis'],
 'but': ['half'],
 'the': ['bee'],
 'has': ['got'],
 'got': ['to'],
 'to': ['be'],
 'vis': ['a', 'its'],
 'its': ['entity'],
 'entity': ["d'you"],
 "d'you": ['see']}

In [70]:
# Rebuild the successor map from the whole text, starting from empty state.
successor_map = {}
window = []

# `with` closes the file when the loop finishes.
with open(filename) as text_file:
    for line in text_file:
        for word in split_line(line):
            word = clean_word(word)
            process_word_bigram(word)

In [71]:
# Generate 100 words by repeatedly choosing a random successor of the
# current word. The walk starts from `word` as left by the previous cell's
# loop (the last word of the text).
for i in range(100):
    successors = successor_map.get(word)
    if not successors:
        # Dead end: `word` is never followed by anything (possible for the
        # final word of the text). Restart from a random known word instead
        # of raising KeyError.
        word = random.choice(list(successor_map))
        successors = successor_map[word]
    word = random.choice(successors)
    print(word, end=' ')

of excellent workmanship and secret sinner and still i dreamed of a grinding in there never anything else could have gone away as the place in my resolve was equipped with streaming tears of my master poole ay truly say that i have stared upon his eyes in through the fact is the majority of those indescribable amazement read so long to declare i cannot bring the lawyer conscious of the murderer’s autograph there was so pitiful a fog began at the first time about me i daren’t say nothing in the same grave countenance before him seen that gentleman 

In [73]:
# Exercises 12.12.2 and 12.12.3

# Maps each trigram, stored as a tuple, to how many times it has been counted.
trigram_counter = {}

def count_trigam(trigam):
    """Add one to the count of `trigam` in the global `trigram_counter`.

    NOTE(review): nothing in the visible notebook calls this function, so
    `trigram_counter` stays empty unless it is called elsewhere.

    trigam: sequence of words; it is converted to a tuple because a list
            cannot be used as a dictionary key
    """
    triple = tuple(trigam)
    if triple in trigram_counter:
        trigram_counter[triple] += 1
    else:
        trigram_counter[triple] = 1

# Maps each pair of consecutive words to the list of words that have
# followed that pair (repeats kept).
successor_trigam_map = {}

def add_trigam(trigam):
    """Record the third word of `trigam` as a successor of the first two.

    trigam: sequence of exactly three words
    """
    first, second, third = trigam
    followers = successor_trigam_map.setdefault((first, second), [])
    followers.append(third)

# Sliding window holding the most recent words; reset for trigram processing.
window = []

def process_word_trigam(word):
    """Push `word` into the global `window` and record the trigram it completes.

    Once the window holds three words, the triple is passed to `add_trigam`
    and the oldest word is dropped, leaving room for the next one.

    word: string
    """
    window.append(word)
    if len(window) != 3:
        return

    add_trigam(window)
    del window[0]

In [74]:
# Start from empty state so re-running this cell does not append every
# successor a second time or reuse words left in the window.
successor_trigam_map = {}
window = []

# `with` closes the file when the loop finishes.
with open(filename) as text_file:
    for line in text_file:
        for word in split_line(line):
            word = clean_word(word)
            process_word_trigam(word)

# Generate 10 words from the bigram successor map (`successor_map`, not the
# trigram map), starting from `word` as left by the loop above.
for i in range(10):
    successors = successor_map.get(word)
    if not successors:
        # Dead end: `word` is never followed by anything (possible for the
        # final word of the text). Restart from a random known word instead
        # of raising KeyError.
        word = random.choice(list(successor_map))
        successors = successor_map[word]
    word = random.choice(successors)
    print(word, end=' ')

and he had broken in a part as a storm 

In [80]:
# Pick a random starting pair of words. Despite its name, `successors` holds
# the map's keys here (the word pairs), not successor lists.
successors = list(successor_trigam_map)
trigam = random.choice(successors)
trigam

('i', 'asked')

In [88]:
# Exercise 12.12.4

# Generate 50 words: look up the current pair, pick one of the words that
# followed it, then slide the pair forward by one word.
for i in range(50):
    successors = successor_trigam_map.get(trigam)
    if not successors:
        # Dead end: this pair was never followed by anything (possible for
        # the final two words of the text). Restart from a random known pair
        # instead of passing None to random.choice.
        trigam = random.choice(list(successor_trigam_map))
        successors = successor_trigam_map[trigam]
    word = random.choice(successors)
    trigam = (trigam[1], word)
    print(word, end=' ')

moon lying on her back as though the body of a country house by a bright open fire and furnished with costly cabinets of oak will you do not think that this is downright good of you to be destroyed unread so it will not find dr jekyll he cried 