In [14]:
# Sample text for the next-word examples in this notebook.
# NextWordProbability tokenizes with str.split(), so punctuation stays attached
# to words (e.g. 'to,' is a different token from 'to').
sample_memo = '''
Milt, we're gonna need to go ahead and move you downstairs into storage B. We have some new people coming in, and we need all the space we can get. So if you could just go ahead and pack up your stuff and move it down there, that would be terrific, OK?
Oh, and remember: next Friday... is Hawaiian shirt day. So, you know, if you want to, go ahead and wear a Hawaiian shirt and jeans.
Oh, oh, and I almost forgot. Ahh, I'm also gonna need you to go ahead and come in on Sunday, too...
Hello Peter, whats happening? Ummm, I'm gonna need you to go ahead and come in tomorrow. So if you could be here around 9 that would be great, mmmk... oh oh! and I almost forgot ahh, I'm also gonna need you to go ahead and come in on Sunday too, kay. We ahh lost some people this week and ah, we sorta need to play catch up.
'''

In [16]:
def NextWordProbability(sampletext,word):
    '''Count the words that immediately follow `word` in `sampletext`.

    Despite its name, this version returns raw counts, not probabilities.

    @param sampletext: text to scan; it is tokenized on whitespace, so
        punctuation stays attached to the neighbouring word
    @param word: the token whose successors are counted (exact match)
    @returns: dict mapping each following token to the number of times it
        appears directly after `word`; empty if `word` never has a successor
    '''
    words = sampletext.split()

    word_dict = {}
    # Pairing each token with its successor via zip() stops one short of the
    # end, so a match on the very last token (which has no next word) is
    # skipped instead of indexing past the list.
    for current, following in zip(words, words[1:]):
        if current == word:
            word_dict[following] = word_dict.get(following, 0) + 1

    return word_dict

In [17]:
# Successor counts for two words from the memo. The single-argument
# print(...) form prints the same under Python 2 and also runs on Python 3.
print(NextWordProbability(sample_memo, 'to'))
print(NextWordProbability(sample_memo, 'go'))

{'go': 4, 'play': 1}
{'ahead': 6}


In [18]:
def NextWordProbability(sampletext,word):
    '''Estimate the probability of each word that directly follows `word`.

    @param sampletext: text to scan; it is tokenized on whitespace, so
        punctuation stays attached to the neighbouring word
    @param word: the token whose successors are examined (exact match)
    @returns: dict mapping each following token to its relative frequency
        (a float) among all tokens seen directly after `word`; the values
        sum to 1. Empty if `word` never has a successor.
    '''
    words = sampletext.split()

    counts = {}
    # Pairing each token with its successor via zip() stops one short of the
    # end, so a match on the very last token (which has no next word) is
    # skipped instead of indexing past the list.
    for current, following in zip(words, words[1:]):
        if current == word:
            counts[following] = counts.get(following, 0) + 1

    # Float denominator keeps the division true division under Python 2 too.
    total = float(sum(counts.values()))

    return {w: c / total for w, c in counts.items()}

In [19]:
# Successor probabilities for two words from the memo. The single-argument
# print(...) form prints the same under Python 2 and also runs on Python 3.
print(NextWordProbability(sample_memo, 'to'))
print(NextWordProbability(sample_memo, 'go'))

{'go': 0.8, 'play': 0.2}
{'ahead': 1.0}


In [38]:
def LaterWords(sample,word,distance):
    '''Predict the most likely word `distance` positions after `word`.

    @param sample: a sample of text to draw from
    @param word: a word occurring before a corrupted sequence
    @param distance: how many words later to estimate (i.e. 1 for the next
        word, 2 for the word after that); 0 returns `word` itself
    @returns: a single word which is the most likely possibility
    @raises ValueError: if `distance` is negative, or if no word can be
        reached `distance` steps after `word` in the sample

    The function repeats two basic steps `distance` times:
    1- Find the next words and their probability of occurring, aka the
       "likelihood" (via NextWordProbability)
    2- Multiply each likelihood by the probability of the word it follows
       and sum per candidate, giving the posterior probability of each word

    The distribution is initialized with the given word at probability 1;
    after every step the posterior becomes the prior for the next step.
    '''
    if distance < 0:
        raise ValueError("distance must be non-negative")

    # Start certain of the given word. With distance == 0 the loop never
    # runs and this distribution is the answer.
    prior_dict = {word: 1}

    for _ in range(distance):
        post_dict = {}

        for pw, prior_prob in prior_dict.items():
            nextword_dict = NextWordProbability(sample, pw)

            # Accumulate: the same next word may be reachable from several
            # different prior words.
            for nw, likelihood in nextword_dict.items():
                post_dict[nw] = post_dict.get(nw, 0) + likelihood * prior_prob

        prior_dict = post_dict

    if not prior_dict:
        # Same exception type a bare max() on an empty dict would raise,
        # but with a message that explains the cause.
        raise ValueError(
            "no word found %d step(s) after %r in sample" % (distance, word))

    # return the word with the highest probability
    return max(prior_dict, key=prior_dict.get)

In [39]:
# Most likely word one or two positions after each seed word. The
# single-argument print(...) form prints the same under Python 2 and also
# runs on Python 3.
print(LaterWords(sample_memo, 'to', 1))
print(LaterWords(sample_memo, 'go', 1))
print(LaterWords(sample_memo, 'play', 1))
print(LaterWords(sample_memo, 'to', 2))

go
ahead
catch
ahead
