In [2]:
import json
import random
import string
from collections import Counter

In [3]:
# Display the ASCII punctuation characters that format_line() removes from each line.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
def format_line(line):
    """Normalise one line of raw text into a list of lowercase words.

    Surrounding whitespace is stripped, the text is case-folded, every
    character in ``string.punctuation`` is deleted, and what remains is split
    on whitespace.

    Only ASCII punctuation is removed: characters such as the typographic
    apostrophe (U+2019) are not in ``string.punctuation`` and are kept.

    Args:
        line (str): A single line of raw text.

    Returns:
        list of str: The cleaned words in their original order. The list is
            empty when the line contains no words.
    """
    # A single translate() pass deletes all punctuation at once, instead of
    # rescanning the whole string with replace() for every punctuation
    # character encountered.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    return line.strip().casefold().translate(strip_punctuation).split()

In [6]:
def word_iter(words):
    """Yield every pair of adjacent items from a sequence.

    Args:
        words (list): A sliceable sequence of words.

    Yields:
        tuple: ``(current, future)`` where ``future`` is the item that directly
            follows ``current``. Nothing is yielded when ``words`` has fewer
            than two items.
    """
    # zip() stops at the shorter input, so pairing the sequence with itself
    # shifted by one already drops the final unpaired item.
    yield from zip(words, words[1:])

In [14]:
def build_markov_model(line, markov_model = None):
    """Build, or add to, a 1st order Markov model from one line of words.

    The model is a dictionary of dictionaries. Each outer key is a current
    state (word); its value maps every word observed directly after it to the
    number of times that transition was seen. These are raw counts, not
    normalised probabilities.

    The line is bookended with the markers '*S*' (start) and '*E*' (end), so
    the model also records which words begin and end a line.

    Args:
        line (list): The words of one line, in order.
        markov_model (dict): An existing model to extend (default: None,
            meaning start from an empty model). It is updated in place.

    Returns:
        markov_model (dict): The updated markov_model
    """
    if markov_model is None:
        markov_model = {}

    # bookending
    words = ['*S*'] + line + ['*E*']

    for current, future in word_iter(words):
        successors = markov_model.setdefault(current, Counter())
        # .get() keeps this working when an existing model holds plain dicts
        # rather than Counters as its inner tables.
        successors[future] = successors.get(future, 0) + 1

    return markov_model

In [15]:
def ingest_text(text_file, markov_model = None, encoding = 'utf-8'):
    """Read a text file line by line and fold every line into a Markov model.

    Args:
        text_file (str): Path of the text file to read.
        markov_model (dict): An existing model to extend (default: None,
            meaning start from an empty model).
        encoding (str): Text encoding used to decode the file
            (default: 'utf-8').

    Returns:
        markov_model (dict): The model after all lines have been added. An
            empty dict is returned when the file contributes no words.
    """
    # Start from an empty model so a file without usable lines still returns
    # a dict rather than None.
    if markov_model is None:
        markov_model = {}

    # Decode explicitly instead of relying on the platform's locale encoding.
    with open(text_file, encoding = encoding) as text:
        for raw_line in text:
            words = format_line(raw_line)

            # A line with no words would only record a start -> end
            # transition, i.e. teach the model to emit empty sentences.
            if not words:
                continue

            markov_model = build_markov_model(words, markov_model)

    return markov_model

In [11]:
# Build the model from the sample text; the relative path is resolved against the current working directory.
mm = ingest_text('one-fish-two-fish.txt')

In [13]:
# Pretty-print the model with sorted keys so the nested counts are easy to scan.
# `json` is imported in the notebook's import cell at the top.
print(json.dumps(mm, sort_keys = True, indent = 4))

{
    "*S*": {
        "a": 1,
        "and": 7,
        "are": 1,
        "black": 1,
        "blue": 2,
        "but": 1,
        "do": 1,
        "don\u2019t": 1,
        "eight": 1,
        "eleven": 2,
        "five": 1,
        "from": 1,
        "funny": 1,
        "go": 2,
        "he": 1,
        "how": 1,
        "i": 2,
        "in": 1,
        "is": 1,
        "look": 1,
        "new": 1,
        "not": 1,
        "of": 2,
        "oh": 2,
        "old": 1,
        "one": 3,
        "red": 1,
        "sad": 1,
        "say": 2,
        "some": 7,
        "the": 1,
        "they": 1,
        "this": 3,
        "two": 1,
        "we": 2,
        "what": 1,
        "where": 1,
        "who": 1,
        "why": 1,
        "yes": 1
    },
    "a": {
        "little": 2,
        "long": 1,
        "lot": 2,
        "yellow": 1
    },
    "and": {
        "bad": 1,
        "glad": 1,
        "some": 9
    },
    "another": {
        "*E*": 1
    },
    "are": {
        "*E*": 1,
  

In [20]:
def get_next_word(current_word, markov_model = None, seed = None):
    """Randomly pick a valid next state given a markov model and a current
    state (word).

    Each candidate next word is drawn with probability proportional to how
    often it followed ``current_word`` in the model.

    Args:
        current_word (str): a word that exists in our model
        markov_model (dict of dicts): a dictionary of word:(next_word:frequency pairs)
        seed (int): if given, the global random generator is re-seeded with
            this value before drawing (default: None, which leaves the
            generator's current state untouched)

    Returns:
        next_state (str): a randomly selected next word based on transition probabilities

    Raises:
        KeyError: if ``current_word`` is not a state in ``markov_model``
    """
    # Only reseed on request. Calling random.seed(None) would replace the
    # generator state with fresh entropy and silently undo any seeding the
    # caller did beforehand.
    if seed is not None:
        random.seed(seed)

    # Aliasing to be concise and to prevent multiple dictionary lookups
    successors = markov_model[current_word]

    # random.choices() accepts relative weights, so the raw counts can be
    # used directly without normalising them first.
    candidates = list(successors.keys())
    counts = list(successors.values())
    next_state = random.choices(candidates, weights = counts)[0]

    return next_state

In [21]:
def generate_random_text(markov_model, start_state = '*S*', end_state = '*E*', seed = None):
    """Generate one sentence by walking a markov model from start to end.

    Args:
        markov_model (dict of dicts): a dictionary of word:(next_word:frequency pairs)
        start_state (str): representation of the start state
        end_state (str): representation of the end state; the walk stops as
            soon as this state is drawn
        seed (int): set for a reproducible sentence (default: None)

    Returns:
        sentence (str): the generated words separated by single spaces and
            terminated by '.'; an empty string if ``start_state`` equals
            ``end_state``
    """
    # Nothing to walk when the start already is the end.
    if start_state == end_state:
        return ''

    # get_next_word() reseeds the global generator whenever it is handed a
    # seed. Passing the same seed on every step would therefore replay the
    # same random draw for every word and could cycle forever. Instead, a
    # private generator derives a different but reproducible seed per step.
    seed_source = random.Random(seed) if seed is not None else None

    current_word = start_state
    words = []

    while current_word != end_state:
        step_seed = seed_source.getrandbits(32) if seed_source is not None else None
        current_word = get_next_word(current_word, markov_model, step_seed)

        if current_word != end_state:
            words.append(current_word)

    # Joining avoids the leading space that incremental concatenation produced.
    return ' '.join(words) + '.'

In [18]:
# NOTE(review): this re-reads the same file that was already ingested into `mm` above,
# and the cell's execution count is out of order -- confirm the notebook survives Restart & Run All.
markov_model = ingest_text('one-fish-two-fish.txt')

In [24]:
# Print five generated sentences; the loop counter itself is not needed.
for _ in range(5):
    print(generate_random_text(markov_model))

 of funny things go ask your dad.
 go ask us why.
 don’t ask your dad.
 some are old fish there are low.
 two fish.
