## Problem: Write a Byte Pair Encoder in Python

### Problem Statement
Implement a **Transformer model** in PyTorch by completing the required sections. The model should consist of an embedding layer, a Transformer encoder, and an output layer for sequence processing and prediction.

### Requirements
1. **Define the Transformer Model Architecture**:
   - **Embedding Layer**:
     - Implement a layer to transform input data into a higher-dimensional space.
     - Use a `torch.nn.Linear` or `torch.nn.Embedding` layer to create embeddings from the input.
   - **Transformer Encoder**:
     - Use `torch.nn.TransformerEncoder` or `torch.nn.Transformer` to process sequences with attention.
     - Configure parameters such as the number of attention heads and encoder layers.
   - **Output Layer**:
     - Add a fully connected (linear) layer to reduce the transformer's sequence output into the desired output dimension.

2. **Implement the Forward Method**:
   - Map the input to the higher-dimensional space using the embedding layer.
   - Pass the transformed input through the Transformer encoder.
   - Use the output layer to convert the encoded sequence into predictions.

### Constraints
- Handle input padding correctly for variable-length sequences.
- Ensure compatibility with batch processing by correctly shaping input and output tensors.


In [None]:
from collections import defaultdict, Counter
import pandas as pd
from tqdm import tqdm
import re
tqdm.pandas()


def get_corpus(docs, cased=False):
    """Build the initial BPE corpus: one symbol tuple per distinct word.

    Every distinct word in ``docs`` is split into its characters, the
    end-of-word marker ``'</w>'`` is appended, and the resulting tuple is
    mapped to the number of times the word occurs in ``docs``.

    Args:
        docs: Iterable of word strings.
        cased: When true, each word is stripped of surrounding whitespace and
            lower-cased before counting.  When false (the default) the words
            are counted exactly as given.
            NOTE(review): the flag name reads as if it were inverted, but the
            default path is deliberately left untouched so existing callers
            see the same result.

    Returns:
        dict mapping a tuple of symbols (the word's characters followed by
        ``'</w>'``) to that word's frequency.
    """
    from collections import Counter

    eow_char = '</w>'
    if cased:
        # ``lower`` is a str method, not a free function; calling it as
        # ``lower(...)`` raised NameError whenever this branch was taken.
        docs = [word.strip().lower() for word in docs]
    word_frequency = Counter(docs)
    return {
        tuple(word) + (eow_char,): count
        for word, count in word_frequency.items()
    }


# def get_all_n_grams(word):
#     n_grams = []
#     for n_gram in range(len(word)):
#         for i in range(len(word)-n_gram):
#             n_grams.append(word[i:i+n_gram+1])
#     return n_grams


def get_all_n_grams(word, n_min=None, n_max=None):
    """Return every contiguous slice of ``word`` with length in [n_min, n_max].

    Slices are listed shortest length first and, within one length, from left
    to right.  ``word`` may be any sliceable sequence (a string yields
    substrings, a tuple yields sub-tuples).

    Args:
        word: Sequence to slice.
        n_min: Smallest slice length; defaults to 1 when ``None``.
        n_max: Largest slice length; defaults to ``len(word)`` when ``None``.

    Returns:
        list of slices of ``word``.
    """
    longest = len(word) if n_max is None else n_max
    shortest = 1 if n_min is None else n_min
    return [
        word[start:start + size]
        for size in range(shortest, longest + 1)
        # A window of ``size`` items fits at len(word) - size + 1 positions;
        # when that is <= 0 the inner range is simply empty.
        for start in range(len(word) - size + 1)
    ]

def get_stats(corpus):
    """Tally adjacent symbol pairs across the corpus, weighted by word count.

    Args:
        corpus: dict mapping a sequence of symbols to that word's frequency.

    Returns:
        dict mapping each adjacent pair (a length-2 slice of the symbol
        sequence) to the summed frequency of the words it occurs in, counted
        once per occurrence.  A diagnostic line is printed when no pair is
        found at all.
    """
    pair_counts = {}
    for symbols, frequency in corpus.items():
        # Only bigrams are needed, so restrict the n-gram window to size 2.
        for bigram in get_all_n_grams(symbols, n_min=2, n_max=2):
            pair_counts[bigram] = pair_counts.get(bigram, 0) + frequency

    if not pair_counts:
        print(f"found no ngram reps for characters = {corpus}")
    return pair_counts

# def get_merged_key(key, most_freq_pair):
#     """
#     1. abcde, bc
#     2. abcde, fg
#     3. abbcdd, bb
#     """
#     first_elem = most_freq_pair[0]
#     second_elem = most_freq_pair[1]
#     first_elem_already_match = False
#     new_key = []
#     for char in key:
#         if char == second_elem:
#             if first_elem_already_match:
#                 # new_key = new_key[:-1] + [first_elem+second_elem]
#                 new_key[-1] = first_elem+second_elem
#         elif char == first_elem:
#             first_elem_already_match = True
#             new_key.append(char)
#         else:
#             new_key.append(char)

#         if first_elem_already_match == True:
#             first_elem_already_match = False
#         if char == first_elem:
#             first_elem_already_match = True
#     return tuple(new_key)


# def merge_vocab(pair, vocab):
#     """Merges the most frequent pair into a single symbol."""
#     sorted_pair_copy = sorted(pair.items(), key=lambda x: x[1], reverse=True)
#     most_freq_pair = sorted_pair_copy[0]
#     most_freq_pair_chars = ''.join(most_freq_pair[0])
#     print(f"most_freq_pair_chars = {most_freq_pair_chars} | most_freq_pair = {most_freq_pair}")
#     new_vocab = {}
#     for key in vocab:
#         val = vocab.get(key)
#         new_key = get_merged_key(key, most_freq_pair[0])
#         new_vocab[new_key] = val
#     # new_vocab.update(vocab)
#     return new_vocab

# def get_merge_corpus_with_most_recurring_bigram(corpus, most_recurring_bigram):
#     key_before_merge = "_".join(most_recurring_bigram)
#     key_post_merge = "".join(most_recurring_bigram)
#     d = {}
#     for key, value in corpus.items():
#         updated_key = "_".join(key).replace("_".join(most_recurring_bigram), "".join(most_recurring_bigram))
#         updated_key = tuple(updated_key.split('_'))
#         d[updated_key] = value
#     return d

# def get_merge_corpus_with_most_recurring_bigram(corpus, most_recurring_bigram):
#     list_to_str_joiner = "<<>>"
#     key_before_merge = list_to_str_joiner.join(most_recurring_bigram)
#     key_post_merge = "".join(most_recurring_bigram)
#     d = {}
#     for key, value in corpus.items():
#         updated_key = list_to_str_joiner.join(key).replace(list_to_str_joiner.join(most_recurring_bigram), "".join(most_recurring_bigram))
#         updated_key = tuple(updated_key.split(list_to_str_joiner))
#         d[updated_key] = value
#     return d

def get_merge_corpus_with_most_recurring_bigram(corpus, merge_key):
    """Rewrite every corpus key with the given symbol pair fused into one symbol.

    Each key is scanned left to right; wherever the first symbol of
    ``merge_key`` is immediately followed by the second, the two are replaced
    by their concatenation.  Matches are consumed greedily, so overlapping
    occurrences are not merged twice.  Keys that become identical after
    merging have their counts added together.

    Args:
        corpus: dict mapping a sequence of symbols to a count.
        merge_key: The two symbols to fuse, in order.

    Returns:
        A new dict keyed by the rewritten symbol tuples; ``corpus`` itself is
        not modified.

    Raises:
        ValueError: If ``merge_key`` does not contain exactly two elements.
    """
    if len(merge_key) != 2:
        raise ValueError(f"merge key should have two elements ONLY. {merge_key} contains {len(merge_key)} elements instead.")

    left, right = merge_key
    fused = f"{left}{right}"
    merged_corpus = {}
    for symbols, count in corpus.items():
        last_index = len(symbols) - 1
        rebuilt = []
        pos = 0
        while pos < len(symbols):
            # A pair can only start where at least one symbol follows.
            starts_pair = (
                pos < last_index
                and symbols[pos] == left
                and symbols[pos + 1] == right
            )
            if starts_pair:
                rebuilt.append(fused)
                pos += 2  # skip both halves of the merged pair
            else:
                rebuilt.append(symbols[pos])
                pos += 1
        rebuilt_key = tuple(rebuilt)
        merged_corpus[rebuilt_key] = merged_corpus.get(rebuilt_key, 0) + count
    return merged_corpus

def byte_pair_encoding(docs, num_merges=10):
    """Run byte-pair-encoding merges over a list of words.

    The corpus starts as one character tuple (plus ``'</w>'``) per distinct
    word.  On every iteration the most frequent adjacent symbol pair is fused
    into a single symbol throughout the corpus and the fused symbol is added
    to the vocabulary.  Ties are resolved in favour of the pair that was
    counted first.

    Args:
        docs: Sequence of word strings.  It is read twice (once to build the
            corpus, once to collect the initial characters), so it must not
            be a one-shot iterator.
        num_merges: Maximum number of merge steps.  Fewer are performed when
            the corpus runs out of adjacent pairs.

    Returns:
        ``(corpus, vocab)`` where ``corpus`` maps merged symbol tuples to word
        frequencies and ``vocab`` is the set of the individual characters in
        ``docs`` plus every merged symbol.  The bare ``'</w>'`` marker is not
        added on its own; it only appears inside merged symbols.
    """
    corpus = get_corpus(docs)
    vocab = set(''.join(docs))
    for _ in tqdm(range(num_merges), desc="running BPE algorithm"):
        stats = get_stats(corpus)
        if not stats:
            # Every word is already a single symbol (or docs was empty):
            # there is nothing left to merge, so stop instead of indexing
            # into an empty ranking.
            break
        # max() returns the first maximal key in insertion order, which is
        # the same pair a stable descending sort would place first.
        most_recurring_bigram = max(stats, key=stats.get)
        corpus = get_merge_corpus_with_most_recurring_bigram(corpus, most_recurring_bigram)
        vocab.add(''.join(most_recurring_bigram))
    return corpus, vocab

# Example usage
# Toy corpus for quick experiments (hug x10, pug x5, pun x12, bun x4, hugs x5).
# An alternative toy corpus: ["low", "lower", "lowest", "newer", "wider"]
docs = ['hug'] * 10 + ['pug'] * 5 + ['pun'] * 12 + ['bun'] * 4 + ['hugs'] * 5
import pandas as pd

# Real run: concatenate the 'text' column of the essays CSV into one
# lower-cased string, then tokenise it on whitespace.
essay_texts = pd.read_csv("/Users/ashutosh/Downloads/pual_graham_essays.csv")['text'].values
docs = ''.join(essay_texts).lower()
# NOTE(review): this pattern matches a literal backslash followed by "s",
# not whitespace characters -- confirm that is what was intended.
docs = re.sub(r'\\s', '', docs)
# Collapse runs of spaces (four passes) before splitting into words.
docs = (
    docs.replace("  ", " ")
    .replace("  ", " ")
    .replace("  ", " ")
    .replace("  ", " ")
)
docs = docs.split()
corpus, vocab = byte_pair_encoding(docs, num_merges=500)

running BPE algorithm: 100%|██████████| 500/500 [00:35<00:00, 14.20it/s]


In [9]:
corpus = {('remem', 'ber</w>'): 3, ('the</w>',): 234, ('essays</w>',): 8, ('you</w>',): 79, ('had</w>',): 18, ('to</w>',): 134, ('write</w>',): 23, ('in</w>',): 81, ('high</w>',): 7, ('school', '?</w>'): 1, ('top', 'ic</w>'): 3, ('senten', 'ce', ',</w>'): 1, ('intr', 'o', 'du', 'ctor', 'y</w>'): 1, ('paragraph', ',</w>'): 2, ('supporting</w>',): 2, ('paragraph', 's,</w>'): 1, ('conclusion', '.</w>'): 2, ('conclusion</w>',): 2, ('be', 'ing', ',</w>'): 1, ('say,</w>',): 1, ('that</w>',): 73, ('a', 'hab', '</w>'): 1, ('_', 'moby</w>'): 1, ('dic', 'k', '_</w>'): 1, ('was</w>',): 36, ('a</w>',): 113, ('ch', 'rist', '-', 'like</w>'): 1, ('fig', 'ure', '.</w>'): 1, ('o', 'y.</w>'): 1, ('so</w>',): 21, ("i'm</w>",): 6, ('go', 'ing</w>'): 1, ('try</w>',): 3, ('give</w>',): 4, ('other</w>',): 6, ('side</w>',): 3, ('of</w>',): 130, ('stor', 'y', ':</w>'): 1, ('what</w>',): 36, ('an</w>',): 32, ('essay</w>',): 22, ('re', 'ally</w>'): 3, ('is,</w>',): 2, ('and</w>',): 79, ('how</w>',): 15, ('one.</w>',): 5, ('or</w>',): 12, ('at</w>',): 29, ('le', 'ast,</w>'): 2, ('i</w>',): 55, ('**', 'mo', 'd', 's', '**</w>'): 1, ('most</w>',): 16, ('ob', 'vi', 'ous</w>'): 2, ('difference</w>',): 4, ('between</w>',): 5, ('real</w>',): 11, ('things</w>',): 24, ('one</w>',): 24, ('has</w>',): 7, ('school</w>',): 6, ('is</w>',): 50, ('are</w>',): 24, ('not</w>',): 33, ('ex', 'cl', 'us', 'iv', 'ely</w>'): 1, ('about</w>',): 24, ('english</w>',): 15, ('literature.</w>',): 4, ('certain', 'ly</w>'): 2, ('school', 's</w>'): 3, ('should</w>',): 16, ('te', 'ach</w>'): 2, ('students</w>',): 4, ('write', '.</w>'): 1, ('but</w>',): 40, ('due</w>',): 1, ('series</w>',): 1, ('historical</w>',): 1, ('ac', 'cidents</w>'): 1, ('te', 'ach', 'ing</w>'): 2, ('writing</w>',): 21, ('go', 't', 'ten</w>'): 1, ('m', 'i', 'x', 'ed</w>'): 1, ('to', 'gether</w>'): 2, ('with</w>',): 18, ('study</w>',): 9, ('all</w>',): 11, ('over</w>',): 7, ('c', 'oun', 'try</w>'): 1, ('b', 'ase', 'b', 'all</w>'): 1, ('te', 'am</w>'): 1, 
('s', 'm', 'all</w>'): 1, ('bud', 'get</w>'): 1, ('might</w>',): 4, ('com', 'pete</w>'): 1, ('y', 'an', 'ke', 'es,</w>'): 1, ('r', 'ole</w>'): 1, ('color</w>',): 3, ('f', 'ash', 'ion', ',</w>'): 1, ('con', 'stit', 'u', 'tes</w>'): 1, ('good</w>',): 13, ('desser', 't,</w>'): 1, ('sy', 'm', 'b', 'olis', 'm</w>'): 2, ('dic', 'ken', 's.</w>'): 1, ('resul', 't</w>'): 1, ('m', 'ade</w>'): 3, ('seem</w>',): 8, ('b', 'oring</w>'): 2, ('po', 'int', 'less.</w>'): 2, ('who</w>',): 17, ('c', 'ares</w>'): 1, ('dic', 'ken', 's?</w>'): 1, ('dic', 'kens</w>'): 1, ('h', 'im', 'self</w>'): 2, ('would</w>',): 11, ('be</w>',): 44, ('more</w>',): 14, ('interested</w>',): 4, ('b', 'ase', 'b', 'all.</w>'): 1, ('did</w>',): 3, ('get</w>',): 10, ('this</w>',): 25, ('way', '?</w>'): 1, ('answer</w>',): 2, ('we</w>',): 14, ('have</w>',): 21, ('go</w>',): 4, ('back</w>',): 7, ('al', 'most</w>'): 2, ('thous', 'and</w>'): 1, ('year', 's.</w>'): 1, ('ar', 'ound</w>'): 2, ('1', '1', '0', '0', ',</w>'): 1, ('eur', 'ope</w>'): 1, ('l', 'ast</w>'): 1, ('began</w>',): 4, ('c', 'at', 'ch</w>'): 2, ('its</w>',): 1, ('b', 're', 'ath</w>'): 1, ('af', 'ter</w>'): 3, ('centuries</w>',): 3, ('cha', 'o', 's,</w>'): 1, ('on', 'ce</w>'): 2, ('they</w>',): 26, ('l', 'u', 'x', 'ur', 'y</w>'): 1, ('curi', 'o', 'sit', 'y</w>'): 1, ('re', 'disc', 'overed</w>'): 1, ('c', 'all</w>'): 2, ('"', 'the</w>'): 1, ('class', 'ic', 's', '."</w>'): 1, ('e', 'ffe', 'ct</w>'): 1, ('r', 'ather</w>'): 2, ('as</w>',): 37, ('if</w>',): 20, ('were</w>',): 13, ('visited</w>',): 1, ('by</w>',): 16, ('be', 'ings</w>'): 1, ('from</w>',): 14, ('another</w>',): 4, ('s', 'ol', 'ar</w>'): 1, ('sy', 'stem', '.</w>'): 2, ('these</w>',): 2, ('e', 'ar', 'li', 'er</w>'): 1, ('civiliz', 'ations</w>'): 1, ('mu', 'ch</w>'): 3, ('s', 'op', 'histic', 'ated</w>'): 1, ('for</w>',): 29, ('nex', 't</w>'): 2, ('sever', 'al</w>'): 1, ('m', 'ain</w>'): 2, ('wor', 'k</w>'): 3, ('eur', 'ope', 'an</w>'): 2, ('scholar', 's,</w>'): 2, ('ever', 'y</w>'): 1, 
('fiel', 'd,</w>'): 1, ('ass', 'im', 'il', 'ate</w>'): 1, ('k', 'ne', 'w', '.</w>'): 2, ('during</w>',): 2, ('peri', 'o', 'd</w>'): 2, ('an', 'ci', 'ent</w>'): 3, ('tex', 'ts</w>'): 3, ('ac', 'qu', 'ired</w>'): 1, ('gre', 'at</w>'): 2, ('prestige', '.</w>'): 1, ('it</w>',): 51, ('seemed</w>',): 5, ('essence</w>',): 2, ('scholar', 's</w>'): 2, ('did.</w>',): 1, ('scholarshi', 'p</w>'): 3, ('g', 'ained</w>'): 1, ('moment', 'um</w>'): 1, ('bec', 'ame</w>'): 2, ('less</w>',): 3, ('import', 'an', 't', ';</w>'): 1, ('1', '3', '5', '0</w>'): 1, ('someone</w>',): 4, ('wanted</w>',): 5, ('lear', 'n</w>'): 2, ('s', 'ci', 'ence</w>'): 1, ('could</w>',): 8, ('find</w>',): 12, ('bet', 'ter</w>'): 1, ('te', 'acher', 's</w>'): 2, ('than</w>',): 7, ('aristo', 't', 'le</w>'): 1, ('h', 'is</w>'): 3, ('own</w>',): 2, ('er', 'a', '.</w>'): 1, ('\\[', '1', '\\]</w>'): 2, ('change</w>',): 4, ('s', 'lower</w>'): 1, ('scholarshi', 'p.</w>'): 1, ('19', 'th</w>'): 3, ('centur', 'y</w>'): 2, ('still</w>',): 1, ('back', 'b', 'one</w>'): 1, ('cur', 'ricul', 'um', '.</w>'): 2, ('time</w>',): 5, ('then</w>',): 7, ('ri', 'pe</w>'): 1, ('question', ':</w>'): 1, ('v', 'alid</w>'): 2, ('fiel', 'd</w>'): 2, ('scholarshi', 'p', ',</w>'): 1, ('why</w>',): 11, ('mo', 'der', 'n</w>'): 3, ('tex', 't', 's?</w>'): 1, ('answer', ',</w>'): 2, ('course', ',</w>'): 1, ('origin', 'al</w>'): 2, ('r', 'a', 'is', 'on</w>'): 1, ('d', "'", 'etre</w>'): 1, ('classical</w>',): 4, ('k', 'ind</w>'): 2, ('intel', 'le', 'ct', 'u', 'al</w>'): 2, ('ar', 'cha', 'e', 'olo', 'g', 'y</w>'): 1, ('does</w>',): 4, ('need</w>',): 6, ('done</w>',): 1, ('c', 'ase</w>'): 3, ('con', 'tem', 'por', 'ar', 'y</w>'): 1, ('au', 'thor', 's.</w>'): 1, ('re', 'as', 'ons</w>'): 1, ('no</w>',): 6, ('answer', '.</w>'): 1, ('ar', 'cha', 'e', 'olo', 'g', 'ical</w>'): 1, ('be', 'ing</w>'): 1, ('mo', 'st', 'ly</w>'): 2, ('done', ',</w>'): 1, ('im', 'pli', 'ed</w>'): 1, ('those</w>',): 4, ('stud', 'ying</w>'): 2, ('class', 'ic', 's</w>'): 1, ('were', 
',</w>'): 1, ('w', 'asting</w>'): 1, ('their</w>',): 6, ('time', ',</w>'): 2, ('least</w>',): 8, ('working</w>',): 5, ('on</w>',): 9, ('problem', 's</w>'): 1, ('m', 'inor</w>'): 1, ('import', 'ance', '.</w>'): 1, ('there</w>',): 16, ('de', 'al</w>'): 1, ('res', 'ist', 'ance</w>'): 1, ('fir', 'st.</w>'): 1, ('first</w>',): 7, ('courses</w>',): 1, ('literature</w>',): 6, ('been</w>',): 7, ('o', 'ffered</w>'): 1, ('ne', 'wer</w>'): 1, ('colle', 'ges,</w>'): 1, ('particul', 'ar', 'ly</w>'): 2, ('american</w>',): 1, ('ones.</w>',): 2, ('d', 'art', 'mou', 'th', ',</w>'): 1, ('university</w>',): 6, ('ver', 'mon', 't,</w>'): 2, ('am', 'her', 'st,</w>'): 1, ('colle', 'ge', ',</w>'): 1, ('lon', 'don</w>'): 1, ('taught</w>',): 6, ('18', '2', '0', 's.</w>'): 1, ('har', 'v', 'ar', 'd</w>'): 2, ("didn't</w>",): 8, ('professor</w>',): 4, ('un', 'til</w>'): 1, ('18', '7', '6', ',</w>'): 2, ('o', 'x', 'for', 'd</w>'): 1, ('till</w>',): 1, ('18', '8', '5', '.</w>'): 1, ('(', 'o', 'x', 'for', 'd</w>'): 1, ('cha', 'ir</w>'): 1, ('ch', 'inese</w>'): 1, ('be', 'fore</w>'): 2, ('english', '.)</w>'): 1, ('\\[', '2', '\\]</w>'): 2, ('ti', 'p', 'ped</w>'): 1, ('s', 'c', 'ales,</w>'): 1, ('us,</w>',): 1, ('seems</w>',): 4, ('ide', 'a</w>'): 3, ('professors</w>',): 5, ('do</w>',): 20, ('rese', 'arch</w>'): 3, ('well</w>',): 3, ('te', 'ach', '.</w>'): 1, ('(', 'along</w>'): 1, ('p', 'h', 'd,</w>'): 1, ('de', 'part', 'ment,</w>'): 1, ('in', 'de', 'ed</w>'): 1, ('wh', 'ole</w>'): 3, ('con', 'ce', 'p', 't</w>'): 1, ('universit', 'y', ')</w>'): 1, ('imported</w>',): 1, ('ger', 'm', 'any</w>'): 1, ('l', 'ate</w>'): 2, ('centur', 'y.</w>'): 1, ('beg', 'in', 'n', 'ing</w>'): 2, ('j', 'o', 'h', 'ns</w>'): 1, ('h', 'op', 'k', 'ins</w>'): 1, ('ne', 'w</w>'): 3, ('mo', 'del</w>'): 1, ('s', 'pread</w>'): 2, ('r', 'ap', 'id', 'ly.</w>'): 1, ('c', 'asu', 'al', 'ti', 'es.</w>'): 1, ('colle', 'ges</w>'): 1, ('long</w>',): 4, ('com', 'position', '.</w>'): 1, ('com', 'position', '?</w>'): 2, ('math</w>',): 1, 
('re', 'qu', 'ired</w>'): 2, ('math', ',</w>'): 1, ('history</w>',): 5, ('scholar', 'ly</w>'): 1, ('articles</w>',): 1, ('histor', 'y,</w>'): 3, ('rhetoric</w>',): 3, ('on', '?</w>'): 1, ('clo', 'sest</w>'): 1, ('thing</w>',): 5, ('\\[', '3', '\\]</w>'): 2, ('in', 'herited</w>'): 2, ('professor', 's.</w>'): 2, ('two</w>',): 4, ('dr', 'aw', 'back', 's', ':</w>'): 1, ('(', 'a', ')</w>'): 2, ('exper', 't</w>'): 1, ('writer', ',</w>'): 1, ('any</w>',): 4, ('art</w>',): 2, ('histori', 'an</w>'): 1, ('p', 'ainter', ',</w>'): 1, ('(', 'b', ')</w>'): 2, ('su', 'b', 'j', 'e', 'ct</w>'): 1, ('now</w>',): 7, ('ten', 'ds</w>'): 2, ('literature', ',</w>'): 2, ('s', 'in', 'ce</w>'): 3, ("that's</w>",): 7, ('in', '.</w>'): 1, ('imit', 'ate</w>'): 1, ('universiti', 'es.</w>'): 2, ('se', 'e', 'ds</w>'): 1, ('our</w>',): 5, ('m', 'iser', 'able</w>'): 1, ('experi', 'en', 'ces</w>'): 1, ('s', 'own</w>'): 1, ('18', '9', '2', ',</w>'): 1, ('when</w>',): 17, ('n', 'ation', 'al</w>'): 1, ('e', 'du', 'c', 'ation</w>'): 2, ('ass', 'o', 'ci', 'ation</w>'): 1, ('"', 'for', 'm', 'ally</w>'): 1, ('re', 'com', 'men', 'ded</w>'): 1, ('com', 'position</w>'): 1, ('un', 'i', 'fi', 'ed</w>'): 1, ('course', '."</w>'): 1, ('\\[', '4', '\\]</w>'): 2, ("'", 'riting</w>'): 1, ('com', 'ponent</w>'): 1, ('3', '</w>'): 1, ('r', 's</w>'): 1, ('mor', 'p', 'hed</w>'): 1, ('into</w>',): 12, ('english', ',</w>'): 1, ('b', 'iz', 'ar', 're</w>'): 1, ('con', 'se', 'qu', 'ence</w>'): 1, ('literature', '--</w>'): 1, ('write', ',</w>'): 1, ('without</w>',): 1, ('even</w>',): 5, ('realiz', 'ing</w>'): 1, ('it,</w>',): 1, ('imit', 'ations</w>'): 1, ('whatever</w>',): 3, ('publish', 'ing</w>'): 1, ('j', 'our', 'n', 'al', 's</w>'): 1, ('few</w>',): 4, ('de', 'c', 'ades</w>'): 1, ('be', 'fore', '.</w>'): 1, ("it's</w>",): 13, ('w', 'on', 'der</w>'): 2, ('student</w>',): 1, ('po', 'int', 'less</w>'): 1, ('ex', 'er', 'cise', ',</w>'): 1, ('because</w>',): 14, ('we', "'re</w>"): 2, ('thre', 'e</w>'): 2, ('ste', 'ps</w>'): 1, 
('remo', 'ved</w>'): 1, ('wor', 'k', ':</w>'): 1, ('imit', 'ating</w>'): 2, ('professor', 's,</w>'): 1, ('merely</w>',): 4, ('in', 'heritor', 's</w>'): 1, ('tr', 'adition</w>'): 2, ('gr', 'ow', 'ing</w>'): 1, ('out</w>',): 15, ('w', 'as,</w>'): 2, ('7', '0', '0</w>'): 1, ('year', 's</w>'): 2, ('ago', ',</w>'): 1, ('f', 'as', 'cin', 'ating</w>'): 1, ('ur', 'gent', 'ly</w>'): 1, ('ne', 'e', 'ded</w>'): 1, ('wor', 'k.</w>'): 3, ('**', 'no</w>'): 1, ('defen', 'se', '**</w>'): 1, ('b', 'ig', '</w>'): 1, ('make</w>',): 11, ("doesn't</w>",): 6, ('take</w>',): 6, ('position</w>',): 4, ('defen', 'd</w>'): 3, ('it.</w>',): 8, ('prin', 'ci', 'ple,</w>'): 1, ('like</w>',): 14, ('ou', 'ght</w>'): 2, ('t', 'ur', 'ns</w>'): 1, ('han', 'gover</w>'): 1, ('for', 'go', 't', 'ten</w>'): 1, ('origin', 's.</w>'): 1, ('o', 'f', 'ten</w>'): 2, ('mistaken', 'ly</w>'): 1, ('beli', 'eved</w>'): 1, ('me', 'di', 'ev', 'al</w>'): 1, ('universities</w>',): 1, ('sem', 'in', 'ari', 'es.</w>'): 1, ('f', 'act</w>'): 2, ('law</w>',): 1, ('school', 's.</w>'): 1, ('law', 'yer', 's</w>'): 3, ('ad', 'v', 'o', 'c', 'ates,</w>'): 1, ('tr', 'ained</w>'): 1, ('e', 'ither</w>'): 2, ('arg', 'ument</w>'): 2, ('c', 'an', '.</w>'): 1, ('whether</w>',): 1, ('c', 'ause</w>'): 2, ('e', 'ffe', 'ct,</w>'): 1, ('s', 'p', 'irit</w>'): 1, ('per', 'v', 'aded</w>'): 1, ('e', 'ar', 'ly</w>'): 1, ('rhetoric', ',</w>'): 2, ('arg', 'u', 'ing</w>'): 2, ('per', 'su', 'as', 'iv', 'el', 'y,</w>'): 1, ('th', 'ir', 'd</w>'): 2, ('under', 'gradu', 'ate</w>'): 2, ('\\[', '5', '\\]</w>'): 2, ('le', 'ct', 'ure</w>'): 1, ('com', 'mon</w>'): 1, ('for', 'm</w>'): 2, ('disc', 'us', 's', 'ion</w>'): 1, ('dis', 'pu', 't', 'ation', '.</w>'): 1, ('nom', 'in', 'ally</w>'): 1, ('preser', 'ved</w>'): 1, ('present', '-', 'd', 'ay</w>'): 1, ('thes', 'is</w>'): 3, ('defen', 'se', ':</w>'): 1, ('people</w>',): 5, ('tre', 'at</w>'): 2, ('words</w>',): 2, ('dis', 'ser', 't', 'ation</w>'): 2, ('inter', 'chan', 'ge', 'able', ',</w>'): 1, ('origin', 'all', 
'y,</w>'): 1, ('took</w>',): 3, ('which</w>',): 13, ('defen', 'ded</w>'): 1, ('defen', 'ding</w>'): 1, ('may</w>',): 10, ('ne', 'cess', 'ar', 'y</w>'): 1, ('evil</w>',): 1, ('le', 'g', 'al</w>'): 2, ('dis', 'pu', 'te', ',</w>'): 1, ('best</w>',): 4, ('way</w>',): 10, ('truth', ',</w>'): 1, ('think</w>',): 5, ('ad', 'm', 'it.</w>'): 1, ('just</w>',): 17, ('m', 'is', 's</w>'): 1, ('su', 'b', 't', 'leties</w>'): 1, ('way.</w>',): 3, ('problem</w>',): 1, ("can't</w>",): 4, ('question', '.</w>'): 3, ('yet</w>',): 2, ('prin', 'ci', 'ple</w>'): 2, ('bu', 'il', 't</w>'): 1, ('ver', 'y</w>'): 3, ('stru', 'ct', 'ure</w>'): 1, ('school', '.</w>'): 3, ('sentence</w>',): 1, ('your</w>',): 7, ('thes', 'is,</w>'): 2, ('ch', 'o', 'sen</w>'): 1, ('ad', 'v', 'ance', ',</w>'): 1, ('paragraph', 's</w>'): 2, ('blow', 's</w>'): 1, ('stri', 'ke</w>'): 1, ('con', 'f', 'lict,</w>'): 1, ('conclusion', '--</w>'): 1, ('u', 'h', ',</w>'): 1, ('conclusion', '?</w>'): 1, ('never</w>',): 5, ('sure</w>',): 1, ('supposed</w>',): 5, ('rest', 'ate</w>'): 1, ('s', 'a', 'id</w>'): 2, ('different</w>',): 5, ('enough', '</w>'): 3, ('tel', 'l', '.</w>'): 1, ('b', 'o', 'ther', '?</w>'): 1, ('under', 'st', 'and</w>'): 2, ('origins</w>',): 1, ('sort</w>',): 9, ('"', 'essay', ',', '"</w>'): 1, ('can</w>',): 16, ('see</w>',): 7, ('where</w>',): 5, ('comes</w>',): 2, ('from', '.</w>'): 1, ('concl', 'uding</w>'): 1, ('rem', 'ar', 'ks</w>'): 1, ('j', 'ur', 'y.</w>'): 1, ('convin', 'cing', ',</w>'): 1, ('certain', 'l', 'y,</w>'): 1, ('convincing</w>',): 3, ('go', 't</w>'): 2, ('right</w>',): 3, ('answer', 's,</w>'): 1, ('j', 'ob', '</w>'): 2, ('arg', 'u', 'ing.</w>'): 1, ('dr', 'af', 't</w>'): 2, ('fri', 'en', 'd', 's,</w>'): 1, ('want</w>',): 10, ('know', ':</w>'): 1, ('parts</w>',): 1, ('b', 'ore</w>'): 1, ('them,</w>',): 4, ('un', 'convin', 'cing.</w>'): 1, ('b', 'its</w>'): 2, ('usu', 'ally</w>'): 1, ('fi', 'x', 'ed</w>'): 1, ('c', 'u', 't', 'ting.</w>'): 1, ("don't</w>",): 14, ('fi', 'x', '</w>'): 1, ('un', 
'convincing</w>'): 1, ('clever', 'ly.</w>'): 1, ('t', 'al', 'k</w>'): 1, ('mat', 'ter</w>'): 2, ('over', '.</w>'): 1, ('must</w>',): 4, ('ex', 'pl', 'ained</w>'): 1, ('something</w>',): 16, ('b', 'ad', 'ly.</w>'): 1, ('c', 'ase', ',</w>'): 1, ('course</w>',): 1, ('convers', 'ation</w>'): 2, ("i'", 'll</w>'): 1, ('for', 'ced</w>'): 1, ('come</w>',): 4, ('up</w>',): 5, ('clearer</w>',): 1, ('ex', 'pl', 'an', 'ation', ',</w>'): 1, ('in', 'c', 'or', 'por', 'ate</w>'): 1, ('essay.</w>',): 5, ('saying</w>',): 1, ('well', '.</w>'): 2, ('a', 'im</w>'): 2, ('per</w>',): 1, ('se', '.</w>'): 1, ('reader</w>',): 2, ('gets</w>',): 4, ('s', 'm', 'arter', ',</w>'): 1, ('true</w>',): 1, ('become</w>',): 2, ('identic', 'al', ',</w>'): 1, ('convin', 'ce</w>'): 1, ('s', 'm', 'art</w>'): 1, ('reader', 's</w>'): 2, ('ne', 'ar</w>'): 1, ('truth', '.</w>'): 3, ('at', 'tem', 'p', 'ts</w>'): 1, ('per', 'su', 'ade</w>'): 1, ('(', 'or</w>'): 1, ('inevit', 'able', ')</w>'): 1, ('for', 'm', ',</w>'): 1, ('historic', 'ally</w>'): 1, ('in', 'ac', 'cur', 'ate</w>'): 1, ('el', 'se', '.</w>'): 1, ('**', 'tr', 'y', 'ing', '**</w>'): 1, ('re', 'ach</w>'): 1, ('ag', 'ain', ',</w>'): 1, ('though', '</w>'): 1, ('f', 'ar', '.</w>'): 1, ('m', 'ichel</w>'): 1, ('de</w>',): 1, ('mon', 't', 'a', 'ig', 'ne', ',</w>'): 1, ('1', '5', '8', '0</w>'): 1, ('published</w>',): 4, ('b', 'ook</w>'): 2, ('he</w>',): 5, ('c', 'alled</w>'): 2, ('"', 'ess', 'a', 'is', '."</w>'): 1, ('do', 'ing</w>'): 1, ('qu', 'ite</w>'): 2, ('do', ',</w>'): 1, ('em', 'b', 'o', 'di', 'ed</w>'): 1, ('n', 'ame', '.</w>'): 1, ('_', 'essayer', '_</w>'): 1, ('fren', 'ch</w>'): 2, ('ver', 'b', '</w>'): 1, ('mean', 'ing</w>'): 1, ('"', 'to</w>'): 1, ('tr', 'y', '"</w>'): 1, ('_', 'ess', 'a', 'i', '_</w>'): 1, ('at', 'tem', 'p', 't.</w>'): 1, ('fig', 'ure</w>'): 3, ('out.</w>',): 4, ('what', '?</w>'): 1, ('know</w>',): 4, ('yet.</w>',): 1, ('beg', 'in</w>'): 2, ('one', ',</w>'): 1, ('st', 'atement,</w>'): 1, ('essay,</w>',): 2, ('notice</w>',): 4, 
('door</w>',): 1, ('a', 'j', 'ar', ',</w>'): 1, ('open</w>',): 1, ('w', 'al', 'k</w>'): 1, ('what', "'s</w>"): 2, ('in', 's', 'ide', '.</w>'): 1, ('ou', 't,</w>'): 1, ('any', 'thing', ',</w>'): 1, ('though', '?</w>'): 1, ('sit</w>',): 1, ('thin', 'k', '?</w>'): 1, ('well,</w>',): 3, ('pre', 'cisely</w>'): 2, ('mon', 't', 'a', 'ig', 'ne', "'s</w>"): 1, ('disc', 'over', 'y.</w>'): 1, ('ex', 'press', 'ing</w>'): 2, ('ideas</w>',): 4, ('hel', 'ps</w>'): 3, ('them.</w>',): 4, ('in', 'de', 'ed,</w>'): 2, ('f', 'ar</w>'): 2, ('too</w>',): 1, ('we', 'a', 'k</w>'): 1, ('wor', 'd.</w>'): 1, ('en', 'ds</w>'): 1, ('m', 'y</w>'): 2, ('only</w>',): 7, ('thought</w>',): 7, ('s', 'at</w>'): 1, ('down</w>',): 3, ('are', ',</w>'): 1, ('the', 'or', 'y,</w>'): 1, ('ex', 'pl', 'ain', 'ing</w>'): 1, ('yourself</w>',): 5, ('reader', '.</w>'): 2, ("you're</w>",): 13, ('your', 'sel', 'f', '.</w>'): 1, ('thin', 'king</w>'): 2, ('loud.</w>',): 1, ('qu', 'ite', '.</w>'): 1, ('in', 'viting</w>'): 1, ('for', 'ces</w>'): 2, ('cle', 'an</w>'): 1, ('apart', 'ment,</w>'): 1, ('will</w>',): 9, ('read</w>',): 5, ('audi', 'en', 'ce', '.</w>'): 1, ("i've</w>",): 4, ('writ', 'ten</w>'): 2, ('m', 'y', 'self</w>'): 1, ('goo', 'd.</w>'): 1, ('ten', 'd</w>'): 3, ('peter</w>',): 2, ('r', 'un</w>'): 1, ('difficul', 'ti', 'es,</w>'): 1, ('concl', 'ude</w>'): 1, ('v', 'ag', 'ue</w>'): 1, ('questions</w>',): 3, ('dri', 'f', 't</w>'): 1, ('o', 'ff</w>'): 2, ('c', 'up</w>'): 1, ('te', 'a', '.</w>'): 1, ('m', 'any</w>'): 2, ('same</w>',): 6, ('st', 'aff</w>'): 2, ('writers</w>',): 4, ('ne', 'w', 's', 'magaz', 'ines.</w>'): 1, ('ou', 't', 'side</w>'): 2, ('sup', 'ply</w>'): 1, ('e', 'ditori', 'al', 's</w>'): 1, ('defen', 'd-', 'a', '-', 'position</w>'): 2, ('v', 'ari', 'et', 'y,</w>'): 1, ('be', 'eline</w>'): 1, ('tow', 'ar', 'd</w>'): 1, ('r', 'ous', 'ing</w>'): 1, ('(', 'and</w>'): 1, ('fore', 'or', 'd', 'aine', 'd', ')</w>'): 1, ('fe', 'el</w>'): 2, ('obliged</w>',): 1, ('"', 'b', 'al', 'ance', 'd', '."</w>'): 1, 
("they're</w>",): 7, ('popular</w>',): 4, ('magaz', 'ine', ',</w>'): 1, ('start</w>',): 4, ('r', 'adi', 'o', 'activ', 'ely</w>'): 1, ('con', 'tr', 'over', 's', 'i', 'al</w>'): 1, ('question', 's,</w>'): 2, ('wh', 'ich', '--</w>'): 1, ('magaz', 'ine', '--</w>'): 1, ('pro', 'ce', 'ed</w>'): 1, ('re', 'c', 'o', 'il</w>'): 1, ('ter', 'r', 'or', '.</w>'): 1, ('ab', 'or', 'tion', ',</w>'): 1, ('ag', 'ain', 'st', '?</w>'): 1, ('gr', 'oup</w>'): 2, ('say', 's</w>'): 2, ('thing.</w>',): 2, ('another', '.</w>'): 1, ('certain', ':</w>'): 1, ('question</w>',): 2, ('com', 'plex', '</w>'): 1, ('(', 'but</w>'): 1, ('m', 'ad</w>'): 1, ('us.</w>',): 1, ('dr', 'aw</w>'): 1, ('conclusion', 's', '.)</w>'): 1, ('**', 'the</w>'): 1, ('river', '**</w>'): 1, ("aren't</w>",): 1, ('enough', '.</w>'): 1, ('answer', 's.</w>'): 1, ('alway', 's,</w>'): 1, ('course', '.</w>'): 3, ('sometimes</w>',): 2, ('prom', 'is', 'ing</w>'): 1, ('nowhere', '.</w>'): 1, ('publish', '.</w>'): 1, ('experiments</w>',): 1, ('in', 'conclus', 'ive</w>'): 1, ('resul', 't', 's.</w>'): 1, ('publish', '</w>'): 3, ('tell</w>',): 3, ('al', 're', 'ad', 'y</w>'): 3, ('know', '.</w>'): 2, ('_', 'what', '_</w>'): 1, ('h', 'im</w>'): 1, ('mat', 'ter', ',</w>'): 1, ('interesting.</w>',): 3, ('ac', 'c', 'used</w>'): 1, ('mean', 'dering.</w>'): 1, ('f', 'law', '.</w>'): 1, ('con', 'cer', 'ned</w>'): 1, ('go', 'ing', ',</w>'): 1, ('str', 'a', 'ight</w>'): 1, ('there', ',</w>'): 1, ('bl', 'ustering</w>'): 1, ('th', 'r', 'ou', 'gh', '</w>'): 1, ('ob', 'st', 'acles,</w>'): 1, ('han', 'd-', 'w', 'aving</w>'): 1, ('ac', 'r', 'o', 's', 's</w>'): 2, ('s', 'w', 'am', 'p', 'y</w>'): 1, ('gr', 'oun', 'd.</w>'): 1, ('tr', 'ying</w>'): 2, ('se', 'arch</w>'): 1, ('sus', 'p', 'ici', 'ous</w>'): 1, ('mean', 'der', '.</w>'): 1, ('mean', 'der</w>'): 1, ('(', 'a', 'k', 'a</w>'): 1, ('men', 'deres', ')</w>'): 1, ('river</w>',): 2, ('t', 'ur', 'ke', 'y.</w>'): 1, ('expe', 'ct,</w>'): 1, ('w', 'in', 'ds</w>'): 1, ('pl', 'ace', '.</w>'): 1, ('friv', 
'olit', 'y.</w>'): 1, ('p', 'ath</w>'): 1, ('disc', 'overed</w>'): 2, ('e', 'conom', 'ical</w>'): 1, ('r', 'ou', 'te</w>'): 1, ('se', 'a', '.</w>'): 1, ('\\[', '6', '\\]</w>'): 2, ('river', "'s</w>"): 1, ('al', 'gorith', 'm</w>'): 1, ('s', 'im', 'ple', '.</w>'): 1, ('e', 'ach</w>'): 1, ('ste', 'p', ',</w>'): 1, ('f', 'low</w>'): 2, ('dow', 'n', '.</w>'): 1, ('essay', 'ist</w>'): 1, ('tr', 'ans', 'l', 'ates</w>'): 1, ('to', ':</w>'): 1, ('pl', 'aces</w>'): 1, ('nex', 't,</w>'): 1, ('ch', 'oo', 'se</w>'): 1, ('lit', 't', 'le</w>'): 2, ('fores', 'ight</w>'): 1, ('river', '.</w>'): 2, ('always</w>',): 5, ('gener', 'ally</w>'): 1, ('ab', 'out.</w>'): 1, ('s', 'peci', 'fic</w>'): 2, ('conclusions</w>',): 1, ('re', 'ach', ';</w>'): 1, ('paragraph', '</w>'): 2, ('let</w>',): 1, ('sometimes,</w>',): 1, ('river', ',</w>'): 1, ('r', 'uns</w>'): 1, ('ag', 'ain', 'st</w>'): 1, ('w', 'all.</w>'): 1, ('does', ':</w>'): 1, ('back', 'tr', 'ac', 'k.</w>'): 1, ('po', 'int</w>'): 2, ('f', 'ound</w>'): 2, ('f', 'ol', 'low', 'ing</w>'): 1, ('certain</w>',): 2, ('thread</w>',): 1, ('r', 'an</w>'): 1, ('ide', 'as.</w>'): 2, ('seven</w>',): 2, ('dire', 'ction', '.</w>'): 1, ('f', 'un', 'd', 'ament', 'ally</w>'): 1, ('tr', 'ain</w>'): 2, ('thought', '--</w>'): 1, ('cle', 'ane', 'd-', 'up</w>'): 2, ('thought,</w>',): 2, ('di', 'alo', 'g', 'ue</w>'): 1, ('convers', 'ation', '.</w>'): 1, ('convers', 'ation', ',</w>'): 1, ('full</w>',): 1, ('f', 'al', 'se</w>'): 1, ('st', 'art', 's.</w>'): 1, ('ex', 'hausting</w>'): 1, ('re', 'ad.</w>'): 1, ('c', 'u', 't</w>'): 1, ('fill</w>',): 1, ('em', 'p', 'has', 'iz', 'e</w>'): 1, ('centr', 'al</w>'): 1, ('thre', 'ad,</w>'): 1, ('il', 'l', 'ustr', 'ator</w>'): 1, ('in', 'king</w>'): 1, ('pen', 'cil</w>'): 1, ('dr', 'aw', 'ing.</w>'): 1, ('lo', 'se</w>'): 1, ('s', 'pon', 't', 'ane', 'it', 'y</w>'): 1, ('origin', 'al', '.</w>'): 1, ('er', 'r', '</w>'): 1, ('re', 'ference</w>'): 1, ('loo', 'king</w>'): 3, ('che', 'ated</w>'): 1, ("i'", 'd</w>'): 1, 
('went</w>',): 2, ('unexpe', 'cted</w>'): 3, ('interesting</w>',): 6, ('dire', 'ction</w>'): 1, ('plo', 'd', 'ded</w>'): 1, ('du', 'ti', 'ful', 'ly</w>'): 1, ('along</w>',): 1, ('pres', 'c', 'ri', 'bed</w>'): 1, ('**', 'surprise', '**</w>'): 1, ('interesting', '?</w>'): 1, ('me', ',</w>'): 1, ('means</w>',): 2, ('surprise', '.</w>'): 3, ('inter', 'f', 'aces,</w>'): 1, ('ge', 'o', 'ffre', 'y</w>'): 1, ('j', 'ames</w>'): 2, ('s', 'a', 'id,</w>'): 1, ('f', 'ol', 'low</w>'): 2, ('aston', 'ish', 'ment.</w>'): 1, ('bu', 't', 'ton</w>'): 1, ('loo', 'ks</w>'): 1, ('m', 'ach', 'ine</w>'): 1, ('stop</w>',): 1, ('stop', ',</w>'): 1, ('s', 'pe', 'ed</w>'): 1, ('u', 'p.</w>'): 1, ('op', 'posite', '.</w>'): 2, ('m', 'a', 'x', 'imum</w>'): 1, ('afr', 'a', 'id</w>'): 1, ('f', 'l', 'ying</w>'): 1, ('tr', 'av', 'el</w>'): 1, ('vic', 'ari', 'ous', 'ly.</w>'): 1, ('fri', 'en', 'ds</w>'): 1, ('c', 'ame</w>'): 1, ('f', 'ar', 'away</w>'): 1, ('pl', 'aces,</w>'): 1, ('w', 'as', "n't</w>"): 1, ('politeness</w>',): 1, ('as', 'ked</w>'): 1, ('s', 'aw', '.</w>'): 1, ('in', 'for', 'mation</w>'): 2, ('them</w>',): 3, ('ask</w>',): 4, ('surprised</w>',): 2, ('pl', 'ace</w>'): 1, ('expe', 'cte', 'd', '?</w>'): 1, ('ex', 'tremely</w>'): 1, ('use', 'ful</w>'): 2, ('unob', 'ser', 'v', 'ant</w>'): 1, ('pe', 'ople,</w>'): 1, ('ex', 'tr', 'act</w>'): 2, ('re', 'c', 'or', 'ding.</w>'): 1, ('surprises</w>',): 8, ('know', ',</w>'): 1, ('con', 'tr', 'adict</w>'): 1, ('v', 'al', 'u', 'able</w>'): 1, ('get.</w>',): 1, ('foo', 'd</w>'): 1, ('he', 'al', 'th', 'y,</w>'): 1, ('c', 'oun', 'ter', 'acts</w>'): 1, ('un', 'he', 'al', 'th', 'y</w>'): 1, ('e', 'ffe', 'cts</w>'): 1, ("you've</w>",): 4, ('e', 'aten', '.</w>'): 1, ('surprises?</w>',): 1, ('there', 'in</w>'): 1, ('lies</w>',): 1, ('hal', 'f</w>'): 2, ('writing.</w>',): 1, ('(', 'the</w>'): 1, ('well', '.)</w>'): 1, ('trick</w>',): 3, ('use</w>',): 1, ('pro', 'x', 'y</w>'): 1, ('lo', 't.</w>'): 1, ('any', 'thing</w>'): 3, ('you', ',</w>'): 2, ('wh', 'o', 
"'", 've</w>'): 1, ('lo', 't,</w>'): 2, ('prob', 'ably</w>'): 1, ('surprise</w>',): 2, ('reader', 's.</w>'): 1, ('example,</w>',): 5, ('re', 'cent</w>'): 1, ('[', 'essay', '](', 'gh', '.', 'ht', 'm', 'l', ')</w>'): 1, ('po', 'inted</w>'): 1, ('j', 'ud', 'ge</w>'): 1, ('com', 'pu', 'ter</w>'): 1, ('pro', 'gr', 'am', 'mer', 's</w>'): 2, ('know', 's</w>'): 1, ('over', 'all.</w>'): 1, ('realiz', 'e</w>'): 1, ('we', 'ir', 'd.</w>'): 1, ('for', '.</w>'): 1, ('essay', 's,</w>'): 2, ('ingre', 'di', 'ent', 's', ':</w>'): 1, ('top', 'ic', 's</w>'): 1, ('some</w>',): 5, ('ab', 'ilit', 'y</w>'): 2, ('fer', 'ret</w>'): 2, ('unexpe', 'cte', 'd.</w>'): 1, ('ab', 'ou', 't', '?</w>'): 1, ('g', 'u', 'ess</w>'): 1, ('mat', 'ter', '--</w>'): 1, ('de', 'e', 'ply</w>'): 1, ('po', 's', 's', 'i', 'ble</w>'): 1, ('ex', 'ce', 'p', 'tion</w>'): 1, ('deli', 'ber', 'ately</w>'): 1, ('v', 'ari', 'ation</w>'): 1, ('su', 'c', 'ked</w>'): 1, ('f', 'ast</w>'): 2, ('foo', 'd.</w>'): 2, ('retr', 'o', 's', 'pe', 'ct,</w>'): 1, ('b', 'as', 'k', 'in', '-', 'r', 'ob', 'b', 'in', 's?</w>'): 1, ('import', 'ant</w>'): 4, ('c', 'ustomer', 's.</w>'): 1, ('k', 'ids</w>'): 2, ('age</w>',): 5, ('say</w>',): 3, ('yel', 'low', '.</w>'): 2, ('v', 'an', 'il', 'l', 'a</w>'): 1, ('lemon', '?</w>'): 1, ('look</w>',): 4, ('bl', 'an', 'k', 'ly.</w>'): 1, ('m', 'y', 'ster', 'y</w>'): 1, ('peren', 'n', 'i', 'al</w>'): 1, ('f', 'av', 'orite</w>'): 1, ('pr', 'alines</w>'): 1, ("'", 'n', "'", '</w>'): 1, ('c', 're', 'am</w>'): 2, ('ap', 'pe', 'aling.</w>'): 1, ('(', 'i</w>'): 1, ('s', 'al', 't', '.)</w>'): 1, ('f', 'ather', 's</w>'): 2, ('mo', 'ther', 's</w>'): 2, ('b', 'ou', 'ght</w>'): 1, ('ice</w>',): 1, ('k', 'id', 's', ':</w>'): 1, ('benev', 'olent</w>'): 1, ('k', 'ings</w>'): 1, ('bestow', 'ing</w>'): 1, ('l', 'argesse', ',</w>'): 1, ('har', 'ri', 'ed,</w>'): 1, ('g', 'iving</w>'): 1, ('pressure', '.</w>'): 1, ('s', 'o', ',</w>'): 1, ('yes,</w>',): 1, ('materi', 'al</w>'): 1, ('though', '.</w>'): 1, ('s', 'i', 'x', 
'te', 'en</w>'): 1, ('ob', 'ser', 'v', 'ant</w>'): 1, ('l', 'um', 'p</w>'): 1, ('r', 'o', 'c', 'k.</w>'): 1, ('fr', 'ag', 'ments</w>'): 1, ('memor', 'y</w>'): 1, ('preser', 've</w>'): 1, ('having</w>',): 1, ('hap', 'pen', 'ing</w>'): 1, ('liv', 'e', ',</w>'): 1, ('fr', 'on', 't</w>'): 1, ('me', '.</w>'): 1, ('**', 'ob', 'ser', 'v', 'ation', '**</w>'): 1, ('in', 'b', 'or', 'n</w>'): 1, ('lear', 'n', '.</w>'): 1, ('it', '?</w>'): 1, ('ex', 'tent</w>'): 1, ('lear', 'n', 'ing</w>'): 1, ('histor', 'y.</w>'): 2, ('wh', 'ir', 'l</w>'): 1, ('n', 'ames</w>'): 1, ('d', 'ates.</w>'): 1, ('nothing</w>',): 2, ('stic', 'k.</w>'): 1, ('lear', 'n', ',</w>'): 1, ('h', 'oo', 'ks</w>'): 1, ('f', 'acts</w>'): 1, ('stick</w>',): 1, ('on', 'to', '--</w>'): 1, ('ac', 'c', 'umul', 'ate</w>'): 1, ('know', 'le', 'd', 'ge</w>'): 1, ('ex', 'ponenti', 'al</w>'): 1, ('r', 'ate', '.</w>'): 1, ('norm', 'ans</w>'): 3, ('con', 'qu', 'ered</w>'): 2, ('eng', 'l', 'and</w>'): 1, ('1', '0', '6', '6', ',</w>'): 1, ('at', 'tention</w>'): 3, ('he', 'ar</w>'): 1, ('s', 'ou', 'ther', 'n</w>'): 1, ('it', 'aly</w>'): 1, ('time', '.</w>'): 1, ('norm', 'an', 'd', 'y,</w>'): 1, ('note</w>',): 1, ('mentions</w>',): 1, ('not,</w>',): 1, ('fr', 'ance', ',</w>'): 1, ('tri', 'bes</w>'): 1, ('f', 'lowed</w>'): 1, ('r', 'om', 'an</w>'): 1, ('em', 'p', 'ire</w>'): 1, ('col', 'l', 'ap', 'sed,</w>'): 1, ('vi', 'k', 'ings</w>'): 2, ('(', 'norm', 'an</w>'): 1, ('=', '</w>'): 1, ('nor', 'th</w>'): 1, ('m', 'an', ')</w>'): 1, ('ar', 'rived</w>'): 1, ('f', 'our</w>'): 1, ('l', 'ater</w>'): 1, ('9', '1', '1', '.</w>'): 1, ('m', 'a', 'kes</w>'): 1, ('e', 'as', 'i', 'er</w>'): 1, ('du', 'blin</w>'): 1, ('al', 'so</w>'): 2, ('est', 'ablished</w>'): 1, ('8', '4', '0', 's.</w>'): 1, ('et', 'c', ',</w>'): 1, ('et', 'c', '</w>'): 1, ('s', 'qu', 'are', 'd.</w>'): 1, ('colle', 'cting</w>'): 2, ('s', 'im', 'il', 'ar</w>'): 1, ('pro', 'cess.</w>'): 1, ('anom', 'alies</w>'): 1, ('se', 'en', ',</w>'): 1, ('e', 'as', 'ily</w>'): 1, ("you'", 
'll</w>'): 3, ('means,</w>',): 1, ('o', 'd', 'd', 'ly</w>'): 1, ('enough', ',</w>'): 2, ('gr', 'ow</w>'): 1, ('ol', 'der', ',</w>'): 1, ('li', 'fe</w>'): 1, ('surpris', 'ing.</w>'): 1, ('k', 'id,</w>'): 1, ('used</w>',): 3, ('adul', 'ts</w>'): 1, ('fig', 'ured</w>'): 2, ('back', 'w', 'ar', 'd', 's.</w>'): 1, ('ones</w>',): 1, ('mistaken', '.</w>'): 2, ('surprises,</w>',): 1, ('rich</w>',): 1, ('richer', '.</w>'): 1, ('(', 'as</w>'): 1, ('we', 'al', 'th', ')</w>'): 1, ('hab', 'its</w>'): 1, ('m', 'ind</w>'): 1, ('hel', 'p</w>'): 1, ('pro', 'cess</w>'): 2, ('along', '.</w>'): 1, ('hab', 'it</w>'): 2, ('as', 'king</w>'): 1, ('es', 'peci', 'ally</w>'): 4, ('wh', 'y.</w>'): 2, ('r', 'an', 'dom</w>'): 1, ('year</w>',): 2, ('ol', 'ds</w>'): 1, ('in', 'fin', 'ite</w>'): 1, ('n', 'um', 'ber</w>'): 2, ('question', 's.</w>'): 1, ('fr', 'u', 'it', 'ful</w>'): 1, ('ones?</w>',): 1, ('wr', 'ong', '.</w>'): 1, ('con', 'ne', 'ction</w>'): 2, ('h', 'umor</w>'): 1, ('m', 'is', 'for', 't', 'une', '?</w>'): 1, ('f', 'un', 'n', 'y</w>'): 1, ('char', 'acter', ',</w>'): 1, ('li', 'ke', ',</w>'): 1, ('s', 'li', 'ps</w>'): 1, ('b', 'an', 'an', 'a</w>'): 1, ('pe', 'el', '?</w>'): 1, ('there', "'s</w>"): 3, ('essay', "'s</w>"): 1, ('wor', 'th</w>'): 1, ('sure', '.</w>'): 1, ('wr', 'ong', ',</w>'): 2, ('de', 'gre', 'e</w>'): 1, ('s', 'ke', 'p', 'ticis', 'm</w>'): 1, ('hel', 'p', 'ful', '.</w>'): 1, ('a', 'x', 'i', 'om</w>'): 1, ('ach', 'i', 'eving</w>'): 1, ('1', '%', '</w>'): 1, ('c', 'oul', 'd.</w>'): 1, ('c', 'oun', 'ter', 'act</w>'): 1, ('r', 'ule</w>'): 1, ('be', 'aten</w>'): 1, ('he', 'ads</w>'): 1, ('ch', 'il', 'dren', ':</w>'): 1, ('be', '.</w>'): 1, ('ever', 'y', 'one</w>'): 1, ('t', 'al', 'ked</w>'): 1, ('wh', 'ile</w>'): 1, ('fel', 't</w>'): 1, ('classes', '--</w>'): 1, ('none</w>',): 1, ('us</w>',): 2, ('b', 'all', 's</w>'): 1, ('h', 'y', 'po', 'thes', 'iz', 'e</w>'): 1, ('f', 'act,</w>'): 1, ('mista', 'ke', '.</w>'): 1, ("weren't</w>",): 1, ('get', 'ting.</w>'): 1, ('h', 'un', 
'ch</w>'): 1, ('p', 'ay</w>'): 1, ('wr', 'ong</w>'): 2, ('h', 'umor', 'ous</w>'): 1, ('ple', 'ased</w>'): 1, ('l', 'au', 'gh', '</w>'): 1, ('be', '?</w>'): 1, ('a', 'im', 'ing</w>'): 1, ('f', 'un', 'n', 'y', '?</w>'): 1, ('l', 'au', 'gh', ',</w>'): 1, ('wan', 'ts</w>'): 1, ('deliver', '.</w>'): 1, ('me</w>',): 2, ('note', 'b', 'oo', 'k', 's.</w>'): 1, ('act', 'u', 'ally</w>'): 1, ('re', 'ading</w>'): 2, ('us', 'ing</w>'): 1, ('writ', 'ten', ',</w>'): 1, ('re', 'pro', 'du', 'ce</w>'): 1, ('thoughts</w>',): 1, ('l', 'ater', '.</w>'): 1, ('v', 'al', 'ue</w>'): 1, ('note', 'b', 'oo', 'ks</w>'): 1, ('le', 'av', 'es</w>'): 1, ('he', 'ad.</w>'): 1, ('c', 'ool</w>'): 1, ('them', 'sel', 'v', 'es</w>'): 1, ('dis', 'ad', 'v', 'an', 't', 'age</w>'): 1, ('surprises.</w>',): 1, ('c', 'ool', ',</w>'): 1, ('f', 'our', 'te', 'en</w>'): 1, ('ol', 'd</w>'): 1, ('_', 'n', 'il</w>'): 1, ('ad', 'm', 'ir', 'ari', '._</w>'): 1, ('mistaken', ',</w>'): 1, ('d', 'well</w>'): 1, ('it', ';</w>'): 1, ('act</w>',): 1, ('nothing', "'s</w>"): 1, ('m', 'ay', 'be</w>'): 1, ('notice', '.</w>'): 1, ('ke', 'y', 's</w>'): 1, ('c', 'ool', 'ness</w>'): 1, ('av', 'o', 'id</w>'): 1, ('sit', 'u', 'ations</w>'): 1, ('inexperi', 'ence</w>'): 1, ('foolish', '.</w>'): 1, ('lo', 'ts</w>'): 1, ('thing', 's,</w>'): 2, ('con', 'ne', 'ctions</w>'): 1, ('fiel', 'd', 's.</w>'): 1, ('j', 'am', ',</w>'): 1, ('bacon', ',</w>'): 2, ('p', 'ic', 'k', 'les,</w>'): 1, ('che', 'ese', ',</w>'): 1, ('among</w>',): 2, ('ple', 'as', 'ing</w>'): 1, ('foo', 'd', 's,</w>'): 1, ('origin', 'ally</w>'): 1, ('inten', 'ded</w>'): 1, ('metho', 'ds</w>'): 1, ('preser', 'v', 'ation', '.</w>'): 1, ('b', 'oo', 'ks</w>'): 1, ('p', 'ainting', 's.</w>'): 1, ('stud', 'y,</w>'): 1, ('in', 'cl', 'ude</w>'): 1, ('histor', 'y', '--</w>'): 1, ('s', 'o', 'ci', 'al</w>'): 1, ('e', 'conom', 'ic</w>'): 1, ('political</w>',): 2, ('m', 'is', 'le', 'ading</w>'): 1, ('mere</w>',): 1, ('stud', 'y.</w>'): 1, ('des', 'c', 'ri', 'be</w>'): 1, ('_', 'all</w>'): 1, 
('d', 'at', 'a</w>'): 1, ('f', 'ar', '._</w>'): 1, ('g', 'iv', 'es</w>'): 1, ('con', 'fidence</w>'): 1, ('w', 'a', 'iting</w>'): 1, ('under</w>',): 1, ('no', 'ses.</w>'): 1, ('s', 'words</w>'): 2, ('ev', 'ol', 'ved</w>'): 1, ('b', 'r', 'on', 'z', 'e</w>'): 1, ('d', 'ag', 'ger', 's,</w>'): 1, ('(', 'like</w>'): 1, ('f', 'lint</w>'): 1, ('pre', 'de', 'cess', 'or', 's', ')</w>'): 1, ('h', 'il', 't</w>'): 2, ('se', 'par', 'ate</w>'): 1, ('bl', 'ade', '.</w>'): 1, ('longer</w>',): 1, ('h', 'il', 'ts</w>'): 1, ('ke', 'p', 't</w>'): 1, ('b', 're', 'a', 'king</w>'): 1, ('o', 'ff', '.</w>'): 1, ('five</w>',): 1, ('h', 'un', 'dred</w>'): 2, ('c', 'asting</w>'): 1, ('bl', 'ade</w>'): 1, ('p', 'i', 'e', 'ce', '.</w>'): 1, ('**', 'dis', 'obe', 'di', 'en', 'ce', '**</w>'): 1, ('ab', 'o', 've</w>'): 1, ('all', ',</w>'): 1, ('p', 'aying</w>'): 1, ('to', ',</w>'): 1, ('"', '[', 'in', 'ap', 'propri', 'ate', '](', 'say', '.', 'ht', 'm', 'l', ')', ',', '"</w>'): 1, ('import', 'an', 't,</w>'): 1, ('on', '.</w>'): 1, ('curi', 'ous</w>'): 1, ('something', ',</w>'): 1, ('trust</w>',): 1, ('in', 'stin', 'ct', 's.</w>'): 1, ('thre', 'ads</w>'): 1, ('at', 'tr', 'act</w>'): 1, ('at', 'tention', '.</w>'): 1, ('in', ',</w>'): 1, ('un', 'c', 'an', 'n', 'y</w>'): 1, ('le', 'ading</w>'): 2, ('any', 'way,</w>'): 1, ('proud</w>',): 1, ('le', 'ad</w>'): 1, ('f', 'as', 'cin', 'ated</w>'): 1, ('com', 'b', '-', 'over', 's,</w>'): 1, ('ex', 'treme</w>'): 1, ('m', 'an</w>'): 1, ('he', "'s</w>"): 1, ('we', 'aring</w>'): 1, ('beret</w>',): 1, ('ha', 'ir', '.</w>'): 1, ('surely</w>',): 1, ('low', 'ly</w>'): 1, ('in', '--</w>'): 1, ('super', 'fici', 'al</w>'): 1, ('qu', 'iz', 'z', 'ing</w>'): 1, ('le', 'f', 't</w>'): 1, ('te', 'en', 'age</w>'): 1, ('g', 'ir', 'l', 's.</w>'): 1, ('under', 'ne', 'ath', '.</w>'): 1, ('ke', 'y</w>'): 1, ('question', ',</w>'): 1, ('realiz', 'ed,</w>'): 1, ('com', 'ber', '-', 'over</w>'): 1, ('o', 'd', 'd</w>'): 1, ('loo', 'k', 's?</w>'): 1, ('_', 'in', 'c', 'rement', 'all', 'y', 
'._</w>'): 1, ('com', 'b', 'ing</w>'): 1, ('ha', 'ir</w>'): 1, ('c', 'are', 'ful', 'ly</w>'): 1, ('thin</w>',): 1, ('p', 'at', 'ch</w>'): 1, ('gradu', 'all', 'y,</w>'): 1, ('2', '0</w>'): 1, ('year', 's,</w>'): 1, ('gr', 'own</w>'): 1, ('mon', 'str', 'o', 'sit', 'y.</w>'): 1, ('gradu', 'al', 'ness</w>'): 1, ('power', 'ful', '.</w>'): 1, ('power</w>',): 1, ('con', 'stru', 'ctive</w>'): 1, ('pur', 'po', 'ses</w>'): 1, ('too', ':</w>'): 1, ('fre', 'a', 'k', ',</w>'): 1, ('c', 're', 'ating</w>'): 1, ('gr', 'and</w>'): 1, ('d', 'ared</w>'): 1, ('_', 'pl', 'an', '_</w>'): 1, ('su', 'ch</w>'): 1, ('s', 'o', 'f', 'tw', 'are</w>'): 1, ('c', 're', 'ate', 'd.</w>'): 1, ('stri', 'p', 'pe', 'd-', 'down</w>'): 1, ('ker', 'nel</w>'): 1, ('(', 'how</w>'): 1, ('har', 'd</w>'): 2, ('be', '?', ')</w>'): 1, ('gradu', 'ally</w>'): 1, ('gr', 'ow', 's</w>'): 1, ('com', 'plete</w>'): 1, ('oper', 'ating</w>'): 1, ('hence</w>',): 1, ('le', 'ap', ':</w>'): 1, ('p', 'ainting', ',</w>'): 1, ('no', 'v', 'el', '?</w>'): 1, ('friv', 'olous</w>'): 1, ('question', '?</w>'): 1, ('p', 'i', 'e', 'ce</w>'): 1, ('ad', 'vice</w>'): 1, ('be', ':</w>'): 1, ('tol', 'd.</w>'): 1, ('beli', 'eve</w>'): 1, ('to', '.</w>'): 1, ('expe', 'ct', ';</w>'): 1, ('lear', 'ns</w>'): 1, ('expe', 'ct', 's.</w>'): 1, ('dis', 'obe', 'di', 'ence</w>'): 2, ('all.</w>',): 1, ('for', 't', 'un', 'atel', 'y,</w>'): 1, ('sh', 'ow', 's</w>'): 1, ('s', 'ig', 'ns</w>'): 1, ('becom', 'ing</w>'): 1, ('[', 'r', 'am', 'p', 'an', 't', '](', 'ht', 't', 'p', ':', '/', '/', 'w', 'w', 'w', '.', 'o', 'j', 'r', '.', 'or', 'g', '/', 'o', 'j', 'r', '/', 'g', 'l', 'aser', '/', '1', '0', '5', '6', '0', '5', '0', '2', '7', '0', '.', 'p', 'h', 'p', ').</w>'): 1, ('tin', 'y</w>'): 1, ('o', 'ffici', 'ally</w>'): 1, ('ap', 'pro', 'ved</w>'): 1, ('allowed</w>',): 1, ('essay', 's.</w>'): 1, ('magaz', 'ines</w>'): 2, ('j', 'ud', 'ged</w>'): 1, ('wr', 'o', 'te</w>'): 2, ('them', ';</w>'): 1, ('magaz', 'ine</w>'): 1, ('stor', 'y</w>'): 2, ('un', 'known</w>'): 
1, ('writer</w>',): 1, ('x', '</w>'): 2, ('for', 't', 'y</w>'): 1, ('wh', 'o', 'se</w>'): 1, ('tit', 'le</w>'): 1, ('problem', ',</w>'): 1, ('lo', 't</w>'): 1, ('in', 's', 'ider', 's</w>'): 1, ('in', 's', 'ider', 's.</w>'): 1, ('inter', 'net</w>'): 1, ('chan', 'g', 'ing</w>'): 1, ('that.</w>',): 1, ('any', 'one</w>'): 1, ('we', 'b', ',</w>'): 1, ('j', 'ud', 'ged,</w>'): 1, ('sh', 'oul', 'd,</w>'): 1, ('say', 's,</w>'): 1, ('x', '?</w>'): 1, ('wr', 'o', 'te', '.</w>'): 1, ('liter', 'ac', 'y</w>'): 1, ('ar', 'riv', 'al</w>'): 1, ('t', 'v', '</w>'): 1, ('gol', 'den</w>'): 2, ('sh', 'ort</w>'): 1, ('stor', 'y.</w>'): 1, ('we', 'b', '</w>'): 1, ('realiz', 'ed</w>'): 1, ('st', 'arted</w>'): 1, ('th', 'is.</w>'): 2, ('**', 'notes', '**</w>'): 1, ('ores', 'me</w>'): 1, ('(', 'c', '.</w>'): 1, ('1', '3', '2', '3', '-', '8', '2', ').</w>'): 1, ('p', 'ick</w>'): 1, ('d', 'ate', ',</w>'): 1, ('sud', 'den</w>'): 1, ('dr', 'op', '-', 'o', 'ff</w>'): 1, ('eur', 'ope', 'ans</w>'): 1, ('fin', 'ished</w>'): 1, ('ass', 'im', 'il', 'ating</w>'): 1, ('s', 'ci', 'en', 'ce', '.</w>'): 1, ('pl', 'ag', 'ue</w>'): 1, ('1', '3', '4', '7', ';</w>'): 1, ('tren', 'd</w>'): 1, ('s', 'ci', 'enti', 'fic</w>'): 1, ('pro', 'gress</w>'): 1, ('mat', 'ches</w>'): 1, ('popul', 'ation</w>'): 1, ('cur', 'v', 'e', '.</w>'): 1, ('par', 'ker', ',</w>'): 2, ('w', 'il', 'li', 'am</w>'): 1, ('r', '.</w>'): 1, ('"', 'where</w>'): 1, ('colle', 'ge</w>'): 1, ('de', 'part', 'ments</w>'): 1, ('from', '?', '"</w>'): 1, ('_', 'colle', 'ge</w>'): 1, ('english', '_</w>'): 1, ('2', '8', '</w>'): 1, ('(', '19', '6', '6', '-', '6', '7', ')', ',</w>'): 1, ('p', 'p.</w>'): 1, ('3', '3', '9', '-', '3', '5', '1', '.</w>'): 1, ('re', 'printed</w>'): 2, ('gr', 'ay,</w>'): 1, ('don', 'al', 'd</w>'): 1, ('j', '.</w>'): 1, ('(', 'e', 'd', ').</w>'): 2, ('_', 'the</w>'): 3, ('de', 'part', 'ment</w>'): 1, ('in', 'di', 'an', 'a</w>'): 2, ('bloom', 'ing', 'ton</w>'): 1, ('18', '6', '8', '-', '19', '7', '0', '._</w>'): 1, ('public', 
'ation', 's.</w>'): 1, ('d', 'an', 'i', 'el', 's,</w>'): 1, ('r', 'ober', 't</w>'): 2, ('v', '.</w>'): 1, ('ver', 'mon', 't', ':</w>'): 1, ('year', 's', '._</w>'): 1, ('19', '9', '1', '.</w>'): 1, ('mu', 'el', 'ler', ',</w>'): 1, ('fri', 'e', 'drich</w>'): 1, ('m', '.</w>'): 1, ('let', 'ter</w>'): 1, ('_', 'p', 'all</w>'): 1, ('m', 'all</w>'): 1, ('g', 'az', 'et', 'te', '._</w>'): 1, ('18', '8', '6', '/', '8', '7', '.</w>'): 1, ('al', 'an</w>'): 1, ('n', 'inete', 'enth', '-', 'centur', 'y</w>'): 1, ('studi', 'es', '._</w>'): 1, ('ash', 'g', 'ate', ',</w>'): 1, ('19', '9', '8', '.</w>'): 1, ('com', 'press', 'ing</w>'): 1, ('b', 'it.</w>'): 1, ('se', 'at</w>'): 1, ('p', 'h', 'ilolo', 'g', 'y,</w>'): 1, ('seri', 'ous</w>'): 1, ('ger', 'm', 'any,</w>'): 1, ('gener', 'ation</w>'): 1, ('tr', 'aine', 'd.</w>'): 1, ('c', 'ases</w>'): 1, ('tr', 'ans', 'for', 'med</w>'): 1, ('_', 'in</w>'): 1, ('sit', 'u', '_</w>'): 1, ('fr', 'an', 'cis</w>'): 1, ('ch', 'il', 'd,</w>'): 1, ('b', 'o', 'y', 'l', 'ston</w>'): 1, ('18', '5', '1', ',</w>'): 1, ('18', '7', '6', '</w>'): 1, ('universit', 'y', "'s</w>"): 1, ('english', '.</w>'): 1, ('_', 'op.</w>'): 1, ('cit', '.', '_', ',</w>'): 1, ('p.</w>',): 1, ('2', '5', '.</w>'): 1, ('cur', 'ricul', 'um</w>'): 1, ('_', 'trivi', 'um', '_</w>'): 1, ('(', 'whence</w>'): 1, ('"', 'trivi', 'al', '"', ')</w>'): 1, ('con', 's', 'isted</w>'): 1, ('l', 'atin</w>'): 1, ('gr', 'am', 'm', 'ar', ',</w>'): 1, ('lo', 'g', 'ic', '.</w>'): 1, ('c', 'an', 'did', 'ates</w>'): 1, ('m', 'aster', 's', "'", '</w>'): 1, ('de', 'gre', 'es</w>'): 1, ('_', 'qu', 'adrivi', 'um', '_</w>'): 1, ('arith', 'metic', ',</w>'): 1, ('ge', 'ometr', 'y,</w>'): 1, ('mus', 'ic', ',</w>'): 1, ('astr', 'onom', 'y.</w>'): 1, ('li', 'ber', 'al</w>'): 1, ('art', 's.</w>'): 1, ('dire', 'ct', 'ly</w>'): 1, ('r', 'ome', ',</w>'): 1, ('con', 's', 'idered</w>'): 1, ('su', 'b', 'j', 'e', 'ct.</w>'): 1, ('truth</w>',): 1, ('wor', 'l', 'd</w>'): 1, ('meant</w>',): 1, ('tr', 'ain', 'ing</w>'): 1, 
('l', 'an', 'dow', 'ner', 's', "'", '</w>'): 1, ('s', 'ons</w>'): 1, ('s', 'pe', 'a', 'k</w>'): 1, ('interests</w>',): 1, ('dis', 'pu', 'tes.</w>'): 1, ('trev', 'or</w>'): 2, ('bl', 'ac', 'k', 'well</w>'): 1, ('po', 'ints</w>'): 1, ('is', "n't</w>"): 1, ('strict', 'ly</w>'): 1, ('tru', 'e', ',</w>'): 1, ('e', 'd', 'ges</w>'): 1, ('cur', 'v', 'es</w>'): 1, ('er', 'o', 'de</w>'): 1, ('f', 'aster', '.</w>'): 1, ('**', 'than', 'k', 's', '**</w>'): 1, ('ken</w>',): 1, ('an', 'der', 's', 'on', ',</w>'): 1, ('bl', 'ac', 'k', 'well,</w>'): 1, ('s', 'ar', 'a', 'h', '</w>'): 1, ('har', 'lin', ',</w>'): 1, ('j', 'ess', 'ic', 'a</w>'): 1, ('living', 'ston', ',</w>'): 1, ('j', 'ac', 'k', 'i', 'e</w>'): 1, ('m', 'c', 'donou', 'gh', ',</w>'): 1, ('mor', 'ris</w>'): 1, ('dr', 'af', 'ts</w>'): 1, ('li', 'ked</w>'): 1, ('th', 'is,</w>'): 1, ('[', '**', '_', 'hac', 'ker', 's</w>'): 1, ('&', '</w>'): 1, ('p', 'ainter', 's', '_', '**', '](', 'hac', 'k', 'p', 'aint', '.', 'ht', 'm', 'l', ').</w>'): 1}

In [17]:
corpus

{('re', 'me', 'm', 'b', 'er</w>'): 122,
 ('the</w>',): 21978,
 ('ess', 'ay', 's</w>'): 70,
 ('you</w>',): 8101,
 ('had</w>',): 1040,
 ('to</w>',): 17745,
 ('writ', 'e</w>'): 432,
 ('in</w>',): 7857,
 ('h', 'ig', 'h</w>'): 257,
 ('sch', 'oo', 'l', '?</w>'): 3,
 ('t', 'op', 'ic</w>'): 44,
 ('s', 'ent', 'enc', 'e,</w>'): 6,
 ('int', 'ro', 'du', 'c', 't', 'or', 'y</w>'): 4,
 ('par', 'ag', 'ra', 'ph', ',</w>'): 4,
 ('su', 'pp', 'or', 'ting</w>'): 10,
 ('par', 'ag', 'ra', 'ph', 's,</w>'): 2,
 ('con', 'cl', 'us', 'ion', '.</w>'): 5,
 ('con', 'cl', 'us', 'i', 'on</w>'): 12,
 ('be', 'ing,</w>'): 2,
 ('s', 'ay', ',</w>'): 90,
 ('that</w>',): 7928,
 ('a', 'ha', 'b', '</w>'): 1,
 ('_', 'mo', 'by</w>'): 1,
 ('d', 'ic', 'k', '_', '</w>'): 1,
 ('was</w>',): 2880,
 ('a</w>',): 13562,
 ('ch', 'ri', 'st', '-', 'like</w>'): 1,
 ('f', 'ig', 'u', 're', '.</w>'): 1,
 ('o', 'y.</w>'): 1,
 ('so</w>',): 2490,
 ("i'", 'm</w>'): 425,
 ('go', 'ing</w>'): 499,
 ('tr', 'y</w>'): 348,
 ('gi', 've</w>'): 282,
 ('othe

In [25]:
most_recurring_bigram

(('u', 'g'), 20)

In [30]:
def get_merged_key(corpus, most_recurring_bigram):
    """Return a copy of *corpus* with every occurrence of a symbol pair merged.

    Args:
        corpus: Mapping from a tuple of symbols (one tuple per word) to the
            word's frequency.
        most_recurring_bigram: Sequence of adjacent symbols to fuse, e.g.
            ``('u', 'g')``. Its symbols are concatenated into one new symbol.

    Returns:
        A new dict keyed by the merged symbol tuples. *corpus* is not mutated.
        If two keys become identical after merging, their counts are summed.
    """
    pattern = tuple(most_recurring_bigram)
    width = len(pattern)
    merged_symbol = "".join(pattern)

    merged_corpus = {}
    for symbols, count in corpus.items():
        # Compare whole symbols rather than a "_"-joined string: joining and
        # re-splitting corrupts symbols that contain "_" and can match across
        # partial symbols (e.g. ('xu', 'g') is not an occurrence of ('u', 'g')).
        merged = []
        i = 0
        while i < len(symbols):
            if width and tuple(symbols[i:i + width]) == pattern:
                merged.append(merged_symbol)
                i += width  # left-to-right, non-overlapping matches
            else:
                merged.append(symbols[i])
                i += 1
        new_key = tuple(merged)
        merged_corpus[new_key] = merged_corpus.get(new_key, 0) + count
    return merged_corpus
# most_recurring_bigram is a (pair, count) tuple, so [0] selects the pair itself.
# Merge that pair throughout the corpus, then recompute the pair statistics.
get_stats(get_merged_key(corpus, most_recurring_bigram[0]))

[('h', 'ug'), ('ug', '</w>')]
[('p', 'ug'), ('ug', '</w>')]
[('p', 'u'), ('u', 'n'), ('n', '</w>')]
[('b', 'u'), ('u', 'n'), ('n', '</w>')]
[('h', 'ug'), ('ug', 's'), ('s', '</w>')]


{('h', 'ug'): 15,
 ('ug', '</w>'): 15,
 ('p', 'ug'): 5,
 ('p', 'u'): 12,
 ('u', 'n'): 16,
 ('n', '</w>'): 16,
 ('b', 'u'): 4,
 ('ug', 's'): 5,
 ('s', '</w>'): 5}

In [15]:
corpus

{('h', 'u', 'g', '</w>'): 10,
 ('p', 'u', 'g', '</w>'): 5,
 ('p', 'u', 'n', '</w>'): 12,
 ('b', 'u', 'n', '</w>'): 4,
 ('h', 'u', 'g', 's', '</w>'): 5}

In [112]:
{('t', 'e', 's', 't', '</w>'): 1}

{('t', 'e', 's', 't', '</w>'): 1}

In [20]:
def test_get_corpus():
    """Check that get_corpus splits one word into characters plus '</w>'."""
    expected = {tuple("test") + ('</w>',): 1}
    actual = get_corpus(["test"])
    print(actual)
    assert actual == expected
    print("✓ test_get_corpus passed")

def test_get_stats():
    """Check the adjacent-pair counts get_stats reports for a single word."""
    symbols = ('t', 'e', 's', 't', '</w>')
    pair_counts = get_stats({symbols: 1})
    # Each adjacent symbol pair of the word is expected exactly once.
    expected = dict.fromkeys(zip(symbols, symbols[1:]), 1)
    assert pair_counts == expected
    print("✓ test_get_stats passed")

def test_merge_vocab():
    """Check that merge_vocab fuses the highest-count pair inside the word."""
    word = ('t', 'e', 's', 't', '</w>')
    pair_counts = {('t', 'e'): 1, ('e', 's'): 2, ('s', 't'): 1, ('t', '</w>'): 1}
    # ('e', 's') carries the largest count, so it is the pair expected to merge.
    result = merge_vocab(pair_counts, {word: 1})
    print(result)
    assert result == {('t', 'es', 't', '</w>'): 1}
    print("✓ test_merge_vocab passed")

def test_bpe_sequence():
    """Smoke-test byte_pair_encoding: result types and the number of merges."""
    words = ["low", "lower", "newest", "widest"]
    requested = 5
    final_vocab, merges = byte_pair_encoding(words, num_merges=requested)
    assert isinstance(final_vocab, dict)
    assert all(isinstance(pair, tuple) for pair in merges)
    assert len(merges) == requested
    print("✓ test_bpe_sequence passed")

# Run the tests.
# NOTE(review): test_bpe_sequence is defined above but is not invoked here —
# confirm whether it was left out intentionally.
test_get_corpus()
test_get_stats()
test_merge_vocab()

{('t', 'e', 's', 't', '</w>'): 1}
✓ test_get_corpus passed
✓ test_get_stats passed
most_freq_pair_chars = es | most_freq_pair = (('e', 's'), 2)
{('t', 'es', 't', '</w>'): 1}
✓ test_merge_vocab passed
