In [1]:
import text_parsing_utils as util
import numpy as np

# load in text, preprocess, and parse into words

In [2]:
# load in and preprocess text
# NOTE(review): despite the variable name, the file referenced here has a
# .txt extension rather than .csv - the name is kept as-is in case other
# cells refer to it.
csvname = "war_of_the_worlds.txt"
# the loading / cleaning itself is delegated to the project helper module
text = util.load_preprocess(csvname)

In [3]:
# parse into individual words
# Later cells index `keys` by position for every position up to len(tokens),
# and look words up with keys_to_words[key].
# NOTE(review): `words_to_keys` is presumably the inverse lookup (word -> key);
# it is not used in the cells shown here - confirm against
# text_parsing_utils.parse_words.
tokens,keys,words_to_keys,keys_to_words = util.parse_words(text)

# Markov model - words

In [4]:
# corpus length: number of tokens the text was split into
num_words = len(tokens)

# vocabulary: the distinct keys that occur in the text, and how many
# there are (this is the dimension of the transition matrix)
unique_keys = np.unique(keys)
num_unique_words = unique_keys.size

In [10]:
# Order-N transition counts, stored sparsely as a dictionary of
# dictionaries:
#     transition_matrix[context][next_key] -> count
# where `context` is the tuple of the `order` keys that precede
# `next_key` in the text.  Only contexts that actually occur are
# stored - a dense matrix would not scale to any order > 1.
transition_matrix = {}

# single pass over the text: every position contributes one count to
# the distribution belonging to the context that precedes it
order = 1
for i in range(order, num_words):
    # key at this position, and the `order` keys leading up to it
    next_key = keys[i]
    prev_keys = tuple(keys[i - order:i])

    # fetch the successor counts for this context (creating an empty
    # sub-dictionary the first time the context is seen), then bump the
    # count for the key that followed it
    successors = transition_matrix.setdefault(prev_keys, {})
    successors[next_key] = successors.get(next_key, 0) + 1

In [11]:
# use transition matrix to generate sentence of desired length
# starting at randomly chosen word (all of this is done using
# the associated keys, then re-translated into words)
#
# Draw a random *position* in the text.  (Taking the first element of a
# shuffled copy of `keys` yields a random key *value*, which is not a
# valid way to choose where in the text to start.)  The upper bound is
# exclusive, so the position is at most num_words - order - 1: every such
# start is a context the transition matrix was built with, which means it
# is guaranteed to have at least one recorded successor.
starter_ind = np.random.randint(0, num_words - order)

# seed the generated sequence with the `order` keys found at that position;
# copying into a list keeps `keys` untouched and guarantees .append works
generated_keys = list(keys[starter_ind:starter_ind + order])

In [12]:
num_produce = 100   # number of words to produce
for step in range(num_produce):
    # current context = the most recent `order` generated keys.  Slicing
    # from the end (rather than from the loop counter) keeps this correct
    # when the cell is re-run on an already extended `generated_keys`.
    context_start = len(generated_keys) - order
    prev_keys = tuple(generated_keys[context_start:])

    # successor counts for this context; a context that was never followed
    # by anything in the text (e.g. its very last words) has no entry, so
    # stop generating instead of raising KeyError
    stats = transition_matrix.get(prev_keys)
    if not stats:
        break

    # greedy choice: take the most frequent successor (ties go to the one
    # seen first).  Being deterministic, this can settle into a repeating
    # cycle of words.
    next_key = max(stats, key=stats.get)

    # store next key
    generated_keys.append(next_key)

In [13]:
# map every generated key back to the word it stands for,
# preserving the order in which the keys were produced
generated_words = [keys_to_words[key] for key in generated_keys]

In [14]:
# join the generated words with single spaces; as the last expression of
# the cell, the resulting string is shown as the cell's output
' '.join(generated_words)

'garden there was the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians were the martians'