# Text generation models

One very basic kind of text generation model is the Markov model.  In
such a model, we have a state which consists of the previous character.
We also have a matrix of transitions from one character to another.  We
*train* the model by feeding it some text, and observing the
transitions.  We can then generate more text from the model.  (The code
below generalizes this slightly: with a `depth` greater than 1, the
state is the previous `depth` characters rather than just one.)

In [None]:
from collections import defaultdict
import re

def train_sentence(transitions, sentence, depth=1):
    """Count the character transitions observed in one sentence.

    ``transitions`` is a nested mapping ``state -> next character -> count``
    whose missing entries default to 0 (e.g. a ``defaultdict`` of
    ``defaultdict(int)``).  It is updated in place and also returned.

    The state starts out as ``depth`` copies of the marker "•", which stands
    for "beginning of sentence"; the generator is seeded with the same
    marker.  After each character the state drops its first character and
    gains the one just seen.
    """
    state = "•" * depth
    for symbol in sentence:
        followers = transitions[state]
        followers[symbol] += 1
        # Slide the window: forget the oldest character, remember the newest.
        state = state[1:] + symbol

    return transitions

def split_text(text):
    """Yield the sentences found in ``text``, one cleaned-up string each.

    A piece ends at a ".", "?" or "!" (optionally followed by a closing
    curly quote), or at a blank line.  Within each piece, runs of spaces,
    tabs and line breaks are collapsed to one space and the ends are
    stripped, so a piece may come out as an empty string.  Any text after
    the last terminator is not yielded.
    """
    boundary = re.compile(".*?([.?!][”’]?|\n\n)", re.DOTALL)
    for match in boundary.finditer(text):
        raw = match.group()
        # Turn all sequences of whitespace into a single space
        collapsed = re.sub("[ \t\n\r]+", " ", raw)
        yield collapsed.strip()

def train(filename, depth=1, encoding="utf-8"):
    """Build a transition table from the text stored in ``filename``.

    The file is read in full, split into sentences with ``split_text``, and
    each sentence of at least 3 characters is fed to ``train_sentence``.

    Parameters:
        filename: path of the text file to learn from.
        depth: number of preceding characters that make up a state.
        encoding: text encoding used to read the file.  It is given
            explicitly (defaulting to UTF-8) instead of relying on the
            platform's locale, because the sentence splitter looks for
            non-ASCII curly quotes that a wrong codec would garble.

    Returns:
        A nested ``defaultdict`` mapping ``state -> next character -> count``.
    """
    transitions = defaultdict(lambda: defaultdict(int))
    with open(filename, encoding=encoding) as fin:
        text = fin.read()

    # The file is no longer needed once its contents are in memory.
    for sentence in split_text(text):
        # Skip fragments too short to be useful (including the empty
        # strings produced by blank-line separators).
        if len(sentence) < 3:
            continue
        transitions = train_sentence(transitions, sentence, depth)

    return transitions

In [None]:
import pandas as pd
import numpy as np

def format_transitions(trs):
    """Turn nested transition counts into a table of probabilities.

    ``trs`` maps ``state -> next character -> count``.  The result is a
    DataFrame with one row per state ('from') and one column per next
    character ('to'); each row is divided by its own total so that it sums
    to 1, and combinations that were never observed are 0.
    """
    records = [
        {'from': state, 'to': symbol, 'n': count}
        for state, followers in trs.items()
        for symbol, count in followers.items()
    ]
    counts = pd.DataFrame(records).pivot_table(
        index='from', columns='to', values='n'
    )
    row_totals = counts.sum(axis=1)
    # Unobserved (from, to) pairs are NaN after the pivot; they mean "never".
    return counts.div(row_totals, axis=0).fillna(0)

In [None]:
def produce(transitions):
    """Generate text by walking the transition table at random.

    ``transitions`` is either the nested ``defaultdict`` of counts (it is
    converted with ``format_transitions`` first) or an already-formatted
    DataFrame of probabilities whose rows are states and whose columns are
    next characters.

    Generation starts from the beginning-of-sentence state and stops right
    after the first ".", "?" or "!" is emitted.  It also stops if it reaches
    a state that has no row in the table, returning what was produced so far.

    Returns:
        The generated characters as a single ``str``.
    """
    if isinstance(transitions, defaultdict):
        transitions = format_transitions(transitions)

    # Nifty trick: auto-calculate the depth we were trained on
    depth = len(transitions.index[0])

    terminators = (".", "?", "!")
    pieces = []
    last = "•" * depth
    nxt = ""

    while nxt not in terminators:
        # A state seen only at the very end of a training sentence (for
        # example a heading closed by a blank line rather than punctuation)
        # has no outgoing transitions.  Stop there instead of letting
        # ``.loc`` raise a KeyError.
        if last not in transitions.index:
            break
        trs = transitions.loc[last]
        # Coerce to a plain str so the state and output stay ordinary strings
        # whatever scalar type numpy hands back.
        nxt = str(np.random.choice(trs.index, p=trs))
        last = last[1:] + nxt
        pieces.append(nxt)

    return "".join(pieces)

In [None]:
# Train on alice.txt with a depth of 4, i.e. each state is the previous
# four characters.
tr = train("alice.txt", 4)

In [None]:
# Generate text from the trained model; generation stops after the first
# ".", "?" or "!".
produce(tr)

Ideas for extension:

- Train on a different text
- Try normalizing the text in different ways (e.g. what happens if you take
  out quotation marks?)
- Play around with different depths.  Do different ones work better for
  different texts?
- Try generating longer passages (you will need to alter the training
  also)

# Hidden Markov Models

# Neural networks