# Text Generation using Markov Chains <a class='tocSkip'>

In [1]:
from nltk.corpus import reuters, shakespeare
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import nltk
# Fetch the NLTK data packages requested by this notebook; the log below shows
# the call is a no-op when a package is already up to date.
nltk.download('reuters')
nltk.download('punkt')
nltk.download('shakespeare')

[nltk_data] Downloading package reuters to /home/llorenzo/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/llorenzo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package shakespeare to
[nltk_data]     /home/llorenzo/nltk_data...
[nltk_data]   Package shakespeare is already up-to-date!


True

In [320]:
import pandas as pd

class MarkovChain:
    """Word-level Markov chain text generator.

    ``mode`` names the size of the *context* (the chain's state), not the
    n-gram that is counted:

    * ``'bigrams'``  -- each word is conditioned on the two preceding words
      (transition counts come from padded trigrams);
    * ``'unigrams'`` -- each word is conditioned on the single preceding word
      (transition counts come from padded bigrams).

    ``None`` is the padding token: it marks both the start and the end of a
    sentence in the transition table.

    Attributes:
        mode: the mode string given at construction.
        model: mapping ``context tuple -> Counter(next word -> count)``.
    """

    # Number of preceding words that make up one state, per mode.
    _CONTEXT_SIZE = {'unigrams': 1, 'bigrams': 2}

    def __init__(self, mode='bigrams'):
        """Create an empty chain.

        Args:
            mode: ``'bigrams'`` (two-word context, default) or ``'unigrams'``
                (one-word context).

        Raises:
            ValueError: if ``mode`` is not one of the supported values.
        """
        if mode not in self._CONTEXT_SIZE:
            raise ValueError(
                "mode must be one of {}, got {!r}".format(
                    sorted(self._CONTEXT_SIZE), mode
                )
            )
        self.mode = mode
        # Counter returns 0 for unseen words without inserting them, and
        # (unlike nested lambdas) keeps the model picklable.
        self.model = defaultdict(Counter)

    def add_corpus(self, sentences):
        """Accumulate transition counts from an iterable of token lists.

        Every sentence is padded on both sides, so the table also records
        which words start a sentence and which words end one. Calling this
        repeatedly adds to the existing counts.

        Args:
            sentences: iterable of sentences, each an iterable of tokens.
        """
        for sentence in sentences:
            if self.mode == 'bigrams':
                for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
                    self.model[(w1, w2)][w3] += 1
            elif self.mode == 'unigrams':
                for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
                    self.model[(w1,)][w2] += 1

    def trans_probability(self, text):
        """Return the next-word distribution for a context.

        Args:
            text: sequence of context tokens (its length must match the
                context size of the mode to find anything).

        Returns:
            A DataFrame indexed by candidate next word with a single
            ``'prob'`` column summing to 1. The frame is empty when the
            context was never observed.
        """
        # .get() instead of indexing: a lookup must not add an empty state to
        # the defaultdict for every unseen context that is queried.
        counts = self.model.get(tuple(text), {})
        probs = pd.Series(dict(counts), dtype=float)
        probs = (probs / probs.sum()).to_frame('prob')
        return probs

    def next_word(self, text):
        """Sample one next word for the given context.

        Args:
            text: sequence of context tokens.

        Returns:
            The sampled token; ``None`` means "end of sentence".

        Raises:
            ValueError: if the context was never observed in the corpus.
        """
        probs = self.trans_probability(text)
        if probs.empty:
            raise ValueError(
                "no transitions recorded for context {!r}".format(tuple(text))
            )
        return probs.sample(weights='prob').index[0]

    def generate_sentence(self, text, maxwords=100):
        """Extend a seed into a sentence by repeatedly sampling the next word.

        Generation stops when the end-of-sentence padding is produced (one
        ``None`` in ``'unigrams'`` mode, two in ``'bigrams'`` mode) or when the
        token list reaches ``maxwords`` entries. A seed that is already at
        least ``maxwords`` long is returned unextended.

        Args:
            text: seed tokens. The sequence is copied, never modified. A seed
                shorter than the context size is left-padded with ``None``,
                i.e. treated as the beginning of a sentence (so ``[]`` starts
                a sentence from scratch).
            maxwords: upper bound on the number of tokens, seed included.

        Returns:
            The tokens joined by single spaces, padding tokens removed.

        Raises:
            ValueError: if the seed's context was never observed.
        """
        order = self._CONTEXT_SIZE[self.mode]
        end_marker = [None] * order
        # Work on a copy so the caller's seed list is left untouched.
        words = list(text)

        while len(words) < maxwords:
            context = words[-order:]
            if len(context) < order:
                context = [None] * (order - len(context)) + context
            words.append(self.next_word(context))

            if words[-order:] == end_marker:
                break

        return ' '.join([t for t in words if t])

In [321]:
# One model per corpus; MarkovChain() defaults to mode='bigrams'.
# NOTE(review): `shakespeare_sents` is only assigned in cells that appear
# further down this notebook (In [20] / In [27]), so on a fresh kernel this
# cell raises NameError unless those cells are executed first.
reuters_model, shakespeare_model = MarkovChain(), MarkovChain()

reuters_model.add_corpus(reuters.sents())
shakespeare_model.add_corpus(shakespeare_sents)

In [330]:
# The last two seed words form the starting context; the next words are
# sampled at random, so the generated text differs from run to run.
reuters_model.generate_sentence(['today', 'the'])

'today the public as part of a stake in IPCO Corp common shares for 33 pct to 635 . 5 mln dlrs in quarter and 17 at the last two years as a way to spur domestic production is likely to be carried out on whether to affiliate Insituform Permaline Ltd to acquire some 80 pct ," Mohler said .'

In [331]:
# Same generator, seeded with a two-word context for the Shakespeare model;
# output is randomly sampled and varies between runs.
shakespeare_model.generate_sentence(['How', 'do'])

'How do you hear ;'

In [332]:
import re

def create_corpus(corpus):
    """Tokenise a multi-line string into one token list per non-blank line.

    Each line is cut at every regex word boundary, so runs of word characters
    and runs of non-word characters become separate tokens (punctuation,
    apostrophes and hyphens are split away from the words around them).
    Tokens are stripped of surrounding whitespace and whitespace-only pieces
    are discarded.

    Args:
        corpus: text with one sentence per line.

    Returns:
        A list of token lists, in line order; blank lines produce no entry.
    """
    word_boundary = re.compile(r'\b')
    tokenised = []

    for line in corpus.split('\n'):
        if not line.strip():
            continue
        stripped = (piece.strip() for piece in word_boundary.split(line))
        tokenised.append([piece for piece in stripped if piece])

    return tokenised

In [376]:
corpus = create_corpus("""
Took a morning ride to the place
Where you and I were supposed to meet
The city yawns, they echo on
My thoughts are spinning on and on my head
It seems, they lead me back to you, ooh
I keep coming back to you
Took a morning ride, found a place up in my mind
No one else can see
Maybe, it's fate that we lose control
In circles around, we go
We become who we ought to know
We just gotta let it go
We just gotta let it go
So, I'm coming home to you, ooh-ooh, ooh-ooh-ooh-ooh-ooh-ooh
You, ooh-ooh, ooh-ooh-ooh-ooh-ooh-ooh
You're all I need, the very air I breathe
You are home, home
Took a morning ride, gotta leave this all behind
For with you is where I want to be
Maybe, it's fate that we can't control (fate that we can't control)
Oh, around and around, it goes ('round and around, it goes)
And all that we seem to know (all that we seem to know)
We just gotta let it go
We just gotta let it go
So, I'm coming home to you, ooh-ooh, ooh-ooh-ooh-ooh-ooh-ooh
You, ooh-ooh, ooh-ooh-ooh-ooh-ooh-ooh
You're all I need, the very air I breathe
You are home, home
So many questions I've thrown to the skies
And all of the answers, I've found in your eyes
When I'm with you, home is never too far
And my weary heart has come to rest in yours
I found my way home
I found my way home
I found my way home
I found my way home
I found my way home, I found my way home
I found my way home, I found my way home
I found my way home, I found my way home
I found my way home
So, I'm coming home to you, ooh-ooh, ooh-ooh-ooh-ooh-ooh-ooh
You, ooh-ooh, ooh-ooh-ooh-ooh-ooh-ooh
You're all I need, the very air I breathe
You are home, home
Coming home to you, ooh-ooh, ooh-ooh-ooh-ooh-ooh-ooh
You, ooh-ooh, ooh-ooh-ooh-ooh-ooh-ooh
You're all I need, the very air I breathe
You are home
""")

In [377]:
# 'unigrams' mode conditions each word on the single preceding word only.
my_model = MarkovChain(mode='unigrams')
my_model.add_corpus(corpus)

In [384]:
# One seed word is enough in 'unigrams' mode; the continuation is sampled at
# random, so the text differs between runs.
my_model.generate_sentence(['I'])

'I need , they lead me back to you , ooh - ooh - ooh - ooh - ooh - ooh - ooh - ooh - ooh'

In [20]:
import re

# Build the raw Shakespeare corpus: one entry per non-blank text fragment of
# every play, each fragment split at regex word boundaries. The resulting
# pieces still carry surrounding whitespace and whitespace-only tokens.
plays = shakespeare.fileids()
shakespeare_sents = []
for play_name in plays:
    play = shakespeare.xml(play_name)

    # Gather every text fragment found under each child of the parsed play.
    sentences = []
    for p in play:
        sentences += p.itertext()

    shakespeare_sents += [
        re.split(r'\b', text)
        for text in sentences
        if text.strip()
    ]

In [27]:
# Normalise tokens: strip surrounding whitespace from every piece and drop the
# pieces that end up empty. Re-running this cell leaves the data unchanged.
shakespeare_sents = [
    [token for token in (word.strip() for word in words) if token]
    for words in shakespeare_sents
]