In [2]:
from collections import Counter
from math import log

In [3]:
with open('data/dict.txt.small', encoding='utf-8') as f:
    n = 0
    for line in f:
        lines = line.split(' ')
        print(lines)
        n += 1
        if n > 10:
            break

['的', '3188252', 'uj\n']
['了', '883634', 'ul\n']
['是', '796991', 'v\n']
['在', '727915', 'p\n']
['和', '555815', 'c\n']
['有', '423765', 'v\n']
['他', '401339', 'r\n']
['不', '360331', 'd\n']
['我', '328841', 'r\n']
['人', '313209', 'n\n']
['也', '307851', 'd\n']


In [47]:
hmm_model = {i: Counter(i) for i in 'SBME'}
with open('data/dict.txt.small', encoding='utf-8') as f:
    for line in f:
        lines = line.strip('\n').split(' ')
        if len(lines[0]) == 1:
            hmm_model['S'][lines[0]] += int(lines[1])
        else:
            hmm_model['B'][lines[0][0]] += int(lines[1])
            hmm_model['E'][lines[0][-1]] += int(lines[1])
        for m in lines[0][1:-1]:
            hmm_model['M'][m] += int(lines[1])

In [48]:
log_total = {i: log(sum(hmm_model[i].values())) for i in 'SBME'}
log_total

{'S': 17.214580210949702,
 'B': 17.298268018586587,
 'M': 15.597015630115711,
 'E': 17.298268018586587}

### 转移矩阵

In [52]:
P={'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
 'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
 'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
 'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}
trans_p = P
trans_p

{'B': {'E': -0.51082562376599, 'M': -0.916290731874155},
 'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
 'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
 'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}

### 发射矩阵

In [53]:
emit_p = {i: {t: log(j[t]+1)-log_total[i] for t in j.keys()} for i, j in hmm_model.items()}
emit_p

{'S': {'S': -16.52143303038976,
  '的': -2.239596535105928,
  '了': -3.5227808505225173,
  '是': -3.625980290868881,
  '在': -3.7166392750435797,
  '和': -3.9863876277399353,
  '有': -4.25764351587363,
  '他': -4.312015983609193,
  '不': -4.419799103280834,
  '我': -4.511247539716967,
  '人': -4.559951039880415,
  '也': -4.577205783960993,
  '为': -4.616624274034768,
  '就': -4.696902688798984,
  '这': -4.739274636636278,
  '上': -4.753470076338498,
  '年': -4.791140666882773,
  '中': -4.812973677153439,
  '你': -4.848994147951583,
  '说': -4.914025000721399,
  '一': -4.923105399181942,
  '到': -4.982148050200671,
  '都': -4.9946983530024855,
  '等': -5.029041960392378,
  '着': -5.067276098335187,
  '对': -5.088227408552763,
  '来': -5.2223074054806045,
  '与': -5.225513739025345,
  '地': -5.22826934138733,
  '还': -5.250203401034611,
  '要': -5.253245097545996,
  '又': -5.291202096360269,
  '大': -5.336317428962088,
  '而': -5.342345275455504,
  '之': -5.3583629583026,
  '道': -5.361290094642916,
  '以': -5.393383590862

### 初始概率

In [59]:
start_p={'B': -0.26268660809250016,
 'E': -3.14e+100,
 'M': -3.14e+100,
 'S': -1.4652633398537678}

### 状态转移集合，比如B状态前只可能是E或S状态

In [54]:
PrevStatus = {
    'B': 'ES',
    'M': 'MB',
    'S': 'SE',
    'E': 'BM'
}

In [61]:
MIN_FLOAT = -3.14e100
def jieba_viterbi(obs, states, start_p, trans_p, emit_p):
    V = [{}]  # 状态概率矩阵  
    path = {}
    for y in states:  # 初始化状态概率
        V[0][y] = start_p[y] + emit_p[y].get(obs[0], MIN_FLOAT)
        path[y] = [y] # 记录路径
    for t in range(1, len(obs)):
        V.append({})
        newpath = {}
        for y in states:
            em_p = emit_p[y].get(obs[t], MIN_FLOAT)
            # t时刻状态为y的最大概率(从t-1时刻中选择到达时刻t且状态为y的状态y0)
            (prob, state) = max([(V[t - 1][y0] + trans_p[y0].get(y, MIN_FLOAT) + em_p, y0) for y0 in PrevStatus[y]])
            V[t][y] = prob
            newpath[y] = path[state] + [y] # 只保存概率最大的一种路径 
        path = newpath 
    # 求出最后一个字哪一种状态的对应概率最大，最后一个字只可能是两种情况：E(结尾)和S(独立词)  
    (prob, state) = max((V[len(obs) - 1][y], y) for y in 'ES')

    return (prob, path[state])

In [66]:
def hmm_cut(s):
    prob, tags = jieba_viterbi(s, 'SBME', start_p, trans_p, emit_p)
    print(tags)
    words = [s[0]]
    for i in range(1, len(s)):
        if tags[i] in ['B', 'S']:
            words.append(s[i])
        else:
            words[-1] += s[i]
    return words

In [67]:
text = '小明硕士毕业于中国科学院计算所'
' '.join(hmm_cut(text))

['B', 'E', 'B', 'E', 'B', 'M', 'E', 'B', 'E', 'B', 'M', 'E', 'B', 'E', 'S']


'小明 硕士 毕业于 中国 科学院 计算 所'