# Markow Text generator

In [4]:
%%capture
!pip install nltk

In [203]:
import numpy as np
import nltk
from collections import defaultdict 
from tqdm.notebook import tqdm
from pprint import pprint
import operator
import time
import pickle

In [6]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [317]:
class autoComplete():
    def __init__(self):
        self.corpus = []
        self.n = 0
        self.ppFlag = False
        self.pbar = False
        
    def clean_corpus(self):
        print("\nCleaning the Corpus !") if self.ss else None
        self.corpus[:] = [item.strip('"-') for item in self.corpus if bool(re.match('^\w+|[!.?]',item.strip('"-')))]
        print(f"Total tokens in corpus : {len(self.corpus)}") if self.ss else None
        
    def create_start_word_list(self,top_n):

        print("\nCreating start word list !") if self.ss else None
        self.start_word_dict = defaultdict(int)
        for i, token in enumerate(self.corpus):
            if token in [".","?","!"] and i+1<len(self.corpus):
                if self.corpus[i+1] not in ["mrs","ms","mr","eg",'']:
                    self.start_word_dict[self.corpus[i+1]] += 1
                    
        start_word_list = sorted(self.start_word_dict.items(), 
                                 key=lambda x: x[1], reverse=True)[:top_n]
        self.start_tokens = [word for word,_ in start_word_list]
        print(f"Total unique start words {len(self.start_word_dict)}") if self.ss else None
        print(f"Selected top {len(self.start_tokens)} start words !") if self.ss else None
        
    def create_n_gram_array(self, n):
        """
        Function to create n-gram
        """
        
        print(f"\nGenerating n-grams for n = {n}") if self.ss else None
        n_gram_size = n
        n_gram_array = []
        for i in tqdm(range(len(self.corpus)-n_gram_size+1), disable = self.pbar):
            n_gram_array.append(self.corpus[i:(i+n_gram_size)])
        n_gram_array = np.array(n_gram_array)
        self.n_gram_array_list.append(n_gram_array)
        return self.n_gram_array_list
    
    def create_n_gram_dict(self, n):
        """
        function to create reference dictionary for n-gram
        with n-1 key and nth term as value
        """
        print(f"\nGenerating n-gram dictionary for n = {n}") if self.ss else None
        for n_gram in tqdm(self.n_gram_array_list[n-2], disable= self.pbar):
            self.n_gram_dict[n][tuple(n_gram[:-1])][n_gram[-1]] += 1
    
    def create_n_gram(self,corpus,n,top_n):
        self.n = n
        self.corpus = corpus
        self.n_gram_array_list = []
        self.n_gram_dict = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) 
        self.clean_corpus()
        self.create_start_word_list(top_n)

        print("\nGenerating n-gram and dictionary !") if self.ss else None
            
        for i in tqdm(range(2,n+1), disable= self.pbar):
            self.create_n_gram_array(i)
            self.create_n_gram_dict(i)
            
    def pre_process_corpus(self,corpus, n_gram_len, top_n=15,show_status=True):
        """
        Function to pre-process the text
        Input:
            corpus [list]: Source corpus [list of tokens]
            n_gram       : maximum limit of n-grams to be created
        """
        self.ss = show_status
        self.pbar = not show_status
        self.ppFlag = True
        self.create_n_gram(corpus, n_gram_len, top_n)
        
        
    def finish_sentence(self, sentence, use_n, deterministic=False, 
                         max_len=15, stop_at_punc = True):
        """
        Input:-
            sentence     : A sentence [list of tokens] that we’re trying to build on
            use_n [int]  : The length of n-grams to use for prediction, and
            deterministic: Flag indicating whether the process should be deterministic [bool]
    
        If deterministic is true ; Choose at each step the single most probable next token. 
                               When two tokens are equally probable, choose the lesser one (according to Python).
        If deterministic is false; Draw the next word randomly from the appropriate distribution. Use stupid backoff and no smoothing.
    
        Output:-
            Returns an extended sentence until the first ., ?, or ! is found OR until it has 15 total tokens
        """

        if not self.ppFlag:
            raise ValueError("N-gram dictionary not generated. Generated the n-gram dictionary first using pre_process_corpus()")
        
        if sentence == None:
            first_word = np.random.choice(self.start_tokens,1)[0]
            self.w_in_sentc = [first_word]
        else:
            self.w_in_sentc = [w.lower() for w in sentence]
            
        for i in range(max_len):
            suggest_word = self.w_in_sentc[-1]
            for i in range(use_n,1,-1):
                n_gram_key = tuple(self.w_in_sentc[(-i+1):])

                if n_gram_key in self.n_gram_dict[i].keys():
                    if deterministic == True:
                        most_probab = max(self.n_gram_dict[i][n_gram_key].values())
                        opts = []
                        for k,v in self.n_gram_dict[i][n_gram_key].items():
                            if v == most_probab:
                                opts.append(k) 
                        suggest_word = np.sort(opts)[0]
                        break
                    else:
                        suggest_word = np.random.choice(list(self.n_gram_dict[i][n_gram_key].keys()))
                        break
            self.w_in_sentc.append(suggest_word)  
            
            #stop the generation at punctuation
            if stop_at_punc and suggest_word in [".","?","!"]:
                return self.w_in_sentc
            
        return self.w_in_sentc

***
# Generating text

### Generating text using Gutenberg

In [338]:
#test case
sentence = ['she', 'was', 'not'] 
n = 3
corpus = [w.lower() for w in nltk.corpus.gutenberg.words('austen-sense.txt')]
deterministic = True
ac = autoComplete()
ac.pre_process_corpus(corpus, n_gram_len=3, top_n=100, show_status=False)

In [341]:
ac.finish_sentence(sentence, 3, max_len= 15, deterministic=True, stop_at_punc=True)

['she', 'was', 'not', 'in', 'the', 'world', '.']

In [342]:
#generating corpus with 3-gram (2 words used to predict the third)

counter = 0
sentence_list= []
print("\n Generating Text")
pbar = tqdm(total=150000)
max_len = 5
while(counter<150000):
    gen_sent = ac.finish_sentence(None, 3,max_len=max_len, deterministic=True, stop_at_punc=False)
    if len(gen_sent) == max_len+1 and '' not in gen_sent :
        pbar.update(1)
        sentence_list.append(gen_sent)
        counter +=1
pbar.close()        


 Generating Text


HBox(children=(FloatProgress(value=0.0, max=150000.0), HTML(value='')))




In [343]:
sentence_list = [" ".join(sent) for sent in sentence_list]
sentence_list[:5]

['yes i am sure i should',
 'you will be a very good',
 'one of the house and the',
 'however i am sure i should',
 'margaret and elinor was not in']

In [344]:
with open('generated_text_guten.pickle', 'wb') as f:
    pickle.dump(sentence_list, f)

In [345]:
pickle_gen = open("generated_text_guten.pickle","rb")
gen_text = pickle.load(pickle_gen)

### Generating text using Brown corpus

In [279]:
from nltk.corpus import brown
import re

nltk.download('brown')

[nltk_data] Downloading package brown to /home/jovyan/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [280]:
len(brown.sents(categories=brown.categories()))

57340

In [295]:
### Read and merge all Brown corpus

sent_list = brown.sents(categories=brown.categories())
corpus_brown = sent_list[0]
    
for i in tqdm(range(1,len(sent_list))):
    corpus_brown = corpus_brown + sent_list[i]

corpus_brown = [w.lower() for w in tqdm(corpus_brown)]

HBox(children=(FloatProgress(value=0.0, max=57339.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1161192.0), HTML(value='')))




In [None]:
ac_brown = autoComplete()
ac_brown.pre_process_corpus(corpus_brown,n_gram_len=3,top_n=100,show_status=True)

In [327]:
ac_brown.finish_sentence(None, 3, max_len= 5, deterministic=True, stop_at_punc=False)

['by', 'the', 'time', 'of', 'the', 'united']

In [336]:
#generating corpus with 3-gram (2 words used to predict the third)

counter = 0
sentence_list= []
print("\n Generating Text")
pbar = tqdm(total=150000)
max_len = 5
while(counter<150000):
    gen_sent = ac_brown.finish_sentence(None, 3,  max_len=max_len, deterministic=True, stop_at_punc=False)
    if len(gen_sent) == max_len+1 and '' not in gen_sent :
        pbar.update(1)
        sentence_list.append(gen_sent)
        counter +=1
pbar.close()     


 Generating Text


HBox(children=(FloatProgress(value=0.0, max=150000.0), HTML(value='')))




In [337]:
sentence_list = [" ".join(sent) for sent in sentence_list]
sentence_list[:5]

["he's a friend of mine .",
 'or the other hand the bright',
 'how to feed beef cattle .',
 'one of the united states .',
 'in the world . the first']

In [331]:
with open('generated_text_brown.pickle', 'wb') as f:
    pickle.dump(sentence_list, f)

In [334]:
pickle_gen = open("generated_text_brown.pickle","rb")
gen_text_brown = pickle.load(pickle_gen)
gen_text_brown[:5]

['however the first time in the',
 'moreover the centralization of government .',
 'at the same time . the',
 'from the fact that the united',
 'his own . he was a']

***
# Probabilistic Model testing on Generated and Real data

***
## 1. On Generated text (generated Brown and Guttenberg)

In [347]:
# reading the generated data to used as input for models 

gen_text_gut = pickle.load(open("generated_text_guten.pickle","rb"))
gen_text_brown = pickle.load(open("generated_text_brown.pickle","rb"))

In [365]:
def generate_corpus(sentence_list):
    corp = sentence_list[0].split()
    for i in tqdm(range(1,len(sentence_list))):
        corp = corp + sentence_list[i].split()
    return corp 

In [368]:
#tokenizing all text instead instead of last 1000 sentences which will be used to test
gutten_corpus_train = generate_corpus(gen_text_gut[:-1000])
brown_corpus_train = generate_corpus(gen_text_brown[:-1000])

HBox(children=(FloatProgress(value=0.0, max=148999.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=148999.0), HTML(value='')))




In [369]:
# Saving the train data 
with open('generated_text_guten_joined.pickle', 'wb') as f:
    pickle.dump(gutten_corpus_train, f)
with open('generated_text_brown_joined.pickle', 'wb') as f:
    pickle.dump(brown_corpus_train, f)

### Processing generated **Brown** and **Guttenberg** corpus using 3-gram

In [372]:
ac_g = autoComplete()
ac_g.pre_process_corpus(gutten_corpus_train, n_gram_len=3, top_n=100, show_status=False)

ac_b = autoComplete()
ac_b.pre_process_corpus(gutten_corpus_train, n_gram_len=3, top_n=100, show_status=False)

In [373]:
test_gutten = gen_text_gut[-1000:]
test_brown  = gen_text_brown[-1000:]

### Extracting the first two tokens of test set 

In [374]:
start_tokens_gutten = [item.split()[:2] for item in test_gutten]
start_tokens_brown = [item.split()[:2] for item in test_brown]

In [377]:
print(test_gutten[0], start_tokens_gutten[0])
print(test_brown[0], start_tokens_brown[0])

for the sake of his own ['for', 'the']
perhaps the most important of all ['perhaps', 'the']


### Generating sentences using first two tokens of original sentences

In [388]:
#Generating sentences by providing the first two tokens from the original sentence

gen_test_gutten = []
gen_test_brown  = []

# generating limited data
max_len = 4
for i in tqdm(range(1000)):
    out_g = ac_g.finish_sentence(start_tokens_gutten[i], 3,  max_len=max_len, deterministic=True, stop_at_punc=False)
    gen_test_gutten.append(out_g)
    out_b = ac_b.finish_sentence(start_tokens_brown[i], 3,  max_len=max_len, deterministic=True, stop_at_punc=False)
    gen_test_brown.append(out_b)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [389]:
print("Guttenberg :")
print(f"Original sentence : {test_gutten[0]}")
print(f"Generated sentence : {' '.join(gen_test_gutten[0])}")

print("\nBrown :")
print(f"Original sentence : {test_brown[2]}")
print(f"Generated sentence : {' '.join(gen_test_brown[2])}")

Guttenberg :
Original sentence : for the sake of his own
Generated sentence : for the sake of his own

Brown :
Original sentence : it is not a single stage
Generated sentence : it is not to be sure


### Saving the results to pickle

In [396]:
#Writing to pickle to save resutls
with open('../data/test_orig_gutten.pickle', 'wb') as f:
    pickle.dump(test_gutten, f)
with open('../data/test_orig_brown.pickle', 'wb') as f:
    pickle.dump(test_brown, f)

with open('../data/test_gen_gutten.pickle', 'wb') as f:
    pickle.dump(gen_test_gutten, f)
with open('../data/test_gen_brown.pickle', 'wb') as f:
    pickle.dump(gen_test_brown, f)

### Printing the results for **Guttenberg** and **Brown**

In [3]:
import pickle
import numpy as np 

In [7]:
gen_test_gutten = pickle.load(open("../data/test_gen_gutten.pickle","rb"))
test_gutten = pickle.load(open("../data/test_orig_gutten.pickle","rb"))
gen_test_gutten = [" ".join(item) for item in gen_test_gutten]


sample_ = np.random.choice(np.arange(len(test_brown)),5, replace=False)
print("\nResults for Brown:\n")
for i in range(5):
    print(f"Original  : {test_gutten[int(sample_[i])]}")
    print(f"Generated : {gen_test_gutten[int(sample_[i])]}\n")


Results for Brown:

Original  : about it . i am sure
Generated : about it . i am sure

Original  : as she had been in the
Generated : as she had been in the

Original  : certainly not . i am sure
Generated : certainly not . i am sure

Original  : indeed ! and i am sure
Generated : indeed ! and i am sure

Original  : that she had been in the
Generated : that she had been in the



In [5]:
test_brown = pickle.load(open("../data/test_orig_brown.pickle","rb"))
gen_test_brown = pickle.load(open("../data/test_gen_brown.pickle","rb"))
gen_test_brown = [" ".join(item) for item in gen_test_brown]


sample_ = np.random.choice(np.arange(len(test_brown)),5, replace=False)
print("\nResults for Brown:\n")
for i in range(5):
    print(f"Original  : {test_brown[int(sample_[i])]}")
    print(f"Generated : {gen_test_brown[int(sample_[i])]}\n")


Results for Brown:

Original  : sometimes he would have been a
Generated : sometimes he was not in the

Original  : with the same time . the
Generated : with the same time . lucy

Original  : all the way to the editor
Generated : all the world . elinor was

Original  : those who have been a good
Generated : those who had not been able

Original  : in the world . the first
Generated : in the world . elinor was



****
## 2. On Real data (IMDB Movie plots)

In [95]:
# for real data using IMDB Movie plots
pickle_in = open("../data/plots_text.pickle","rb")
movie_plots = pickle.load(pickle_in)

# count of movie plot summaries
len(movie_plots)

500

In [434]:
#processing the data step 1
plot_corpus = [re.sub("[^a-z'\. ]", "", i) for i in movie_plots]

In [442]:
#function to extract sentence from the corpus for test cases
sentences_plot = []
for i in range(len(plot_corpus)):
    sents = (re.split(pattern, plot_corpus[i]))
    sentences_plot.extend(sents)
print(f"Sentences found : {len(sentences_plot)}")

Sentences found : 8134


In [443]:
#processing the data step 2
#Removing the fullstop and merging all sentences and creating tokens
plot_corpus = [re.sub("[^a-z' ]", "", i) for i in plot_corpus]
plot_corpus = " ".join(plot_corpus).split()

In [446]:
rand_idx = np.random.choice(np.arange(len(sentences_plot)),1000, replace=False) 

In [453]:
subset_sentences = [sentences_plot[idx] for idx in rand_idx]
subset_start = [item.split()[:2] for item in subset_sentences]
print(subset_sentences[0], subset_start[0])

she is also convinced bram got the hots for kevin as he visits the bistro quite often and can't keep his eyes of bram. ['she', 'is']


### Processing all the Movie plots dataset to create 3-gram dictionary

In [404]:
# Processing the corpus
ac_m = autoComplete()
ac_m.pre_process_corpus(plot_corpus, n_gram_len=3, top_n=100, show_status=False)

### Generating sentences using first two tokens of randomly selected 1000 sentences

In [462]:
#Generating test sentences by providing two input tokens

gen_test_plot = []
gen_test_plot_unresitricted = []
max_len = 4
for i in tqdm(range(1000)):
    gen_plot = ac_m.finish_sentence(subset_start[i], 3,  max_len=max_len, deterministic=True, stop_at_punc=False)
    gen_test_plot.append(gen_plot)
    
    gen_plot = ac_m.finish_sentence(subset_start[i], 3,  max_len=20, deterministic=True, stop_at_punc=True)
    gen_test_plot_unresitricted.append(gen_plot)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [465]:
sample_n = int(np.random.choice(np.arange(1000),1))
print(f"\nPlot contains {len(subset_start)} generated text. \nSample :")
print(f"Original sentence : {subset_sentences[sample_n]}")
print(f"Generated sentence (restricted): {' '.join(gen_test_plot[sample_n])}")
print(f"Generated sentence (un-restricted): {' '.join(gen_test_plot_unresitricted[sample_n])}")


Plot contains 1000 generated text. 
Sample :
Original sentence : devoe then leads a special forces unit to stop them.
Generated sentence (restricted): devoe then leads a commando raid
Generated sentence (un-restricted): devoe then leads a commando raid depicted was intended to be a good thing after a few days later they disappear and


### Saving the results to pickle

In [466]:
with open('../data/test_orig_plot.pickle', 'wb') as f:
    pickle.dump(subset_sentences, f)

with open('../data/test_gen_plot_restricted.pickle', 'wb') as f:
    pickle.dump(gen_test_plot, f)
    
with open('../data/test_gen_plot_unrestricted.pickle', 'wb') as f:
    pickle.dump(gen_test_plot_unresitricted, f)

### Printing the results for Movie plots

In [10]:
test_plot = pickle.load(open('../data/test_orig_plot.pickle','rb'))
gen_test_plot = pickle.load(open("../data/test_gen_plot_unrestricted.pickle","rb"))
gen_test_plot = [" ".join(item) for item in gen_test_plot]


sample_ = np.random.choice(np.arange(len(test_brown)),5, replace=False)
print("\nResults for Movie plot:\n")
for i in range(5):
    print(f"Original  : {test_plot[int(sample_[i])]}")
    print(f"Generated : {gen_test_plot[int(sample_[i])]}\n")


Results for Movie plot:

Original  : meanwhile hsiao in a grave robbery attempt accidentally awakens a female corpse  who eventually turns out to be the marshal's  wife.
Generated : meanwhile hsiao in a car accident all of the film ends with a large sum of money he discovers that the man

Original  : this leads to a running joke in the torture chamber as henry keeps changing his mind about the confession due to political necessities requiring multiple changes and retractions of the original confession.
Generated : this leads to a nearby bus station and also a bit awkwardly jrgen stalls the negotiations relating to deceased crew members as

Original  : it sensitively portrays the struggles of a father whose main priority in life is the wellbeing of his children and who is determined to see his children grow up to become decent successful people.
Generated : it sensitively portrays the conservative pan indian socio cultural life where he is not the only one who was the only one

Origina

****