In [1]:
import hmmlearn.hmm

In [61]:
N_STATES = 16

In [62]:
with open("alice.txt") as fin:
    text = fin.read()

def pre_clean_text(text):
    text = re.sub("[][$%*@_()/:;‘“’”-]", "", text)
    text = text.lower()
    return text

def post_clean_text(text):
    text = re.sub("\n", " ", text)
    text = re.sub(" ?([.,?!()/:;‘“]) ?", " \\1 ", text)
    text = re.sub("([’”]) ", " \\1 ", text)
    text = re.sub(" ?\\[ ?", " [ ", text)
    text = re.sub(" ?\\] ?", " [ ", text)
    text = re.sub(" ?(-+) ?", " \\1 ", text)
    text = re.sub(" +", " ", text)
    return text

def post_clean_text2(text):
    text = re.sub("\n", " ", text)
    text = re.sub(" ?([.,?!]) ?", " \\1 ", text)
    text = re.sub(" +", " ", text)
    return text

def split_text2(text):
    for sentence in re.finditer(".*?([.?!][”’)]?)|\n\n", text, re.DOTALL):
        yield sentence.group(0)
        
sentences = list(map(post_clean_text2, split_text2(pre_clean_text(text))))

In [63]:
frequencies = defaultdict(int)
for sentence in sentences:
    for word in sentence.split(" "):
        frequencies[word] += 1
    
words = sorted(frequencies.keys())
total_words = sum([x for x in frequencies.values()])
    
# emission_probs = np.stack([np.fromiter([frequencies[x] / total_words for x in frequencies.keys()], dtype=np.float64)] * N_STATES)

Now we need to transform the characters in the text into numbers, because HMMs speak numbers.  We do this manually, but we could just as easily use `sklearn.preprocessing.LabelEncoder` for it.  We need to end up with a numpy vector, each element of which is a one-element vector containing a character-number (that's what the `.reshape()` incantation is for

In [64]:
word_vector = np.fromiter([words.index(word) for sentence in sentences for word in sentence.split(" ")], 
                          dtype=np.int64).reshape(-1,1)
lengths = list(map(len, map(lambda x: x.split(" "), sentences)))

In [65]:
len(word_vector) - sum(lengths)

0

In [66]:
hmm = hmmlearn.hmm.MultinomialHMM(n_components=N_STATES, algorithm="viterbi", 
                                  n_iter=500, verbose=True, init_params='ste')

hmm.fit(word_vector, lengths)

         1     -302518.2598             +nan
         2     -209962.6838      +92555.5760
         3     -209511.9441        +450.7397
         4     -208990.9656        +520.9786
         5     -208413.8448        +577.1208
         6     -207720.6343        +693.2104
         7     -206895.8764        +824.7580
         8     -206026.9169        +868.9594
         9     -205143.2351        +883.6818
        10     -204069.9705       +1073.2647
        11     -202653.3002       +1416.6702
        12     -201238.2251       +1415.0751
        13     -200014.3892       +1223.8360
        14     -198727.4786       +1286.9106
        15     -197449.3697       +1278.1089
        16     -196379.0562       +1070.3135
        17     -195366.0878       +1012.9684
        18     -194336.2511       +1029.8367
        19     -193308.9373       +1027.3138
        20     -192281.3297       +1027.6075
        21     -191229.9520       +1051.3777
  return np.log(self.emissionprob_)[:, np.concatenate(X

       179     -170338.7973          +4.8149
       180     -170334.8341          +3.9632
       181     -170331.1364          +3.6977
       182     -170325.7020          +5.4344
       183     -170318.8992          +6.8027
       184     -170310.8362          +8.0630
       185     -170302.7840          +8.0522
       186     -170295.8304          +6.9536
       187     -170289.3863          +6.4441
       188     -170284.0250          +5.3614
       189     -170279.2434          +4.7816
       190     -170273.9150          +5.3284
       191     -170265.9848          +7.9302
       192     -170256.5161          +9.4687
       193     -170249.0773          +7.4388
       194     -170243.8994          +5.1779
       195     -170239.8615          +4.0379
       196     -170235.8346          +4.0269
       197     -170230.3452          +5.4895
       198     -170223.1378          +7.2074
       199     -170217.1209          +6.0169
       200     -170213.2279          +3.8930
       201

       362     -169862.7918          +0.2440
       363     -169862.3761          +0.4157
       364     -169860.8641          +1.5119
       365     -169858.2048          +2.6594
       366     -169855.6227          +2.5820
       367     -169853.5946          +2.0282
       368     -169851.6532          +1.9414
       369     -169850.0033          +1.6499
       370     -169849.1191          +0.8842
       371     -169848.4981          +0.6210
       372     -169847.8575          +0.6407
       373     -169847.2540          +0.6035
       374     -169846.6625          +0.5915
       375     -169845.7898          +0.8727
       376     -169844.1296          +1.6602
       377     -169841.7849          +2.3447
       378     -169840.1392          +1.6457
       379     -169839.0290          +1.1102
       380     -169837.6190          +1.4101
       381     -169836.4232          +1.1958
       382     -169835.7031          +0.7201
       383     -169834.8613          +0.8418
       384

MultinomialHMM(algorithm='viterbi', init_params='ste', n_components=16,
               n_iter=500, params='ste',
               random_state=<mtrand.RandomState object at 0x7fc82809f480>,
               startprob_prior=1.0, tol=0.01, transmat_prior=1.0, verbose=True)

In [67]:
symbols, _states = hmm.sample(30)

In [68]:
' '.join([words[x[0]] for x in symbols])

' said dream !  and it is , and soooop .  that !  really form dinah she do up some which .  it comes to it'

Tuning options:
- how many states is optimal?
- are there better ways to clean the data?
- explore viterbi vs map algorithms