# Markow Text generator

### Solution by: Akshay Punwatkar(AP509) and Ishan Gupta (IG55)

In [4]:
import numpy as np
import nltk
from collections import defaultdict 
from pprint import pprint
import operator
import time

In [5]:
#!pip install nltk
#nltk.download('gutenberg')

In [6]:
def finish_sentence(sentence, n, corpus, deterministic=False):
    """
    Input:-
    sentence     : A sentence [list of tokens] that we’re trying to build on
    n [int]      : The length of n-grams to use for prediction, and
    corpus [list]: Source corpus [list of tokens]
    deterministic: Flag indicating whether the process should be deterministic [bool]
    
    If deterministic is true ; Choose at each step the single most probable next token. 
                               When two tokens are equally probable, choose the lesser one (according to Python).
    If deterministic is false; Draw the next word randomly from the appropriate distribution. Use stupid backoff and no smoothing.
    
    Output:-
    Returns an extended sentence until the first ., ?, or ! is found OR until it has 15 total tokens
    """
    
    if n < 2:
        return "Error: n should be greater than 1 to make predictions"
    
    #removing non-required character and numbers from corpus
    corpus[:] = [item.strip('"') for item in corpus if not (item in ['[',']',"*","-",";",":",".--","--",",",";--"] or item.isdigit())]
#     corpus = corpus[:50]
    n_gram_array_list = []

    # an n-gram model looks n-1 words into the past (eg. a trigram (3-gram) model looks two words into the past)
    start_time = time.time()
    # creating n-gram (for n)
    for j in range(0,n-1):
        n_gram_size = j+2
        n_gram_array = []
        for i in range(len(corpus)-n_gram_size+1):
            n_gram_array.append(corpus[i:(i+n_gram_size)])
        n_gram_array = np.array(n_gram_array)    
        n_gram_array_list.append(n_gram_array)    
#     print("Time taken for array : %s"%round(time.time()-start_time,3))
    
    #converting to numpy-array
    n_gram_array_list = np.array(n_gram_array_list,dtype=object)
    
    start_time = time.time()
    #creating n-gram dictionary
    #n_gram_dict = defaultdict(lambda: defaultdict(int)) 
    n_gram_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) 
    
    for j in range(0,n-1):
        for n_gram in n_gram_array_list[j]:
            n_gram_dict[j+2][tuple(n_gram[:-1])][n_gram[-1]] += 1
#     print("Time taken for Dictionary creation : %s"%round(time.time()-start_time,3))
    #auto completing a sentence
    
    start_time = time.time()
    w_in_sentc = [w.lower() for w in sentence]
    for i in range(15):
        suggest_word = w_in_sentc[-1]
        
        for i in range(n,1,-1):
            n_gram_key = tuple(w_in_sentc[(-i+1):])
            
            if n_gram_key in n_gram_dict[i].keys():
                if deterministic == True:
                    most_probab = max(n_gram_dict[i][n_gram_key].values())
                    opts = []
                    for k,v in n_gram_dict[i][n_gram_key].items():
                        if v == most_probab:
                            opts.append(k) 
                    suggest_word = np.sort(opts)[0]
                    break
                else:
                    suggest_word = np.random.choice(list(n_gram_dict[i][n_gram_key].keys()))
                    break
        
        w_in_sentc.append(suggest_word)  
        
        if suggest_word in [".","?","!"]:
#             print("Time taken for search : %s"%round(time.time()-start_time,3))
            return w_in_sentc
#     print("Time taken for search : %s"%round(time.time()-start_time,3))    
    return w_in_sentc

In [7]:
#test case
sentence = ['she', 'was', 'not'] 
n = 3
corpus = [w.lower() for w in nltk.corpus.gutenberg.words('austen-sense.txt')]
deterministic = True

#expected output:
#[’she’, ’was’, ’not’, ’in’, ’the’, ’world’, ’.’]

In [8]:
finish_sentence(sentence, n, corpus, deterministic=True)

['she', 'was', 'not', 'in', 'the', 'world', '.']

In [9]:
finish_sentence(["akshay","was"], 3, corpus, deterministic=True)

['akshay', 'was', 'not', 'in', 'the', 'world', '.']

In [10]:
finish_sentence(["akshay","was"], 2, corpus, deterministic=True)

['akshay',
 'was',
 'not',
 'be',
 'a',
 'very',
 'well',
 'as',
 'she',
 'was',
 'not',
 'be',
 'a',
 'very',
 'well',
 'as',
 'she']

In [11]:
finish_sentence(["akshay","was"], 3, corpus, deterministic=False)

['akshay',
 'was',
 'consoled',
 'for',
 'every',
 'past',
 'affliction',
 'her',
 'regard',
 'without',
 'a',
 'thought',
 'struck',
 'him',
 'when',
 'edward',
 'did']

****

***