In [1]:
from collections import Counter
from math import log

In [2]:
import random
with open('data/dict.txt.small', encoding='utf-8') as f:
    for line in f:
        lines = line.split(' ')
        print(lines)
        n += 1
        if n > 10:
            break

['的', '3188252', 'uj\n']
['了', '883634', 'ul\n']
['是', '796991', 'v\n']
['在', '727915', 'p\n']
['和', '555815', 'c\n']
['有', '423765', 'v\n']
['他', '401339', 'r\n']
['不', '360331', 'd\n']
['我', '328841', 'r\n']
['人', '313209', 'n\n']
['也', '307851', 'd\n']


#### 常用的中文分词可以用字标注法和HMM模型来处理，字标注法即对一个中文序列每个字做“sbme”的标注，s为single，b为begin，m为middle，e为end。这样中文分词就转化为对一个文本序列的每个token做sbme的分类，进而根据该标注来分词。dict.txt为结巴分词的词典

### 根据词库字典，统计每个字属于sbme的频率

In [3]:
hmm_model = {i: Counter(i) for i in 'sbme'}
with open('data/dict.txt.small', encoding='utf-8') as f:
    for line in f:
        lines = line.strip('\n').split(' ')
        if len(lines[0]) == 1:
            hmm_model['s'][lines[0]] += int(lines[1])
        else:
            hmm_model['b'][lines[0][0]] += int(lines[1])
            hmm_model['e'][lines[0][-1]] += int(lines[1])
        for m in lines[0][1:-1]:
            hmm_model['m'][m] += int(lines[1])

In [4]:
hmm_model

{'s': Counter({'s': 1,
          '的': 3188252,
          '了': 883634,
          '是': 796991,
          '在': 727915,
          '和': 555815,
          '有': 423765,
          '他': 401339,
          '不': 360331,
          '我': 328841,
          '人': 313209,
          '也': 307851,
          '为': 295952,
          '就': 273122,
          '这': 261791,
          '上': 258101,
          '年': 248559,
          '中': 243191,
          '你': 234587,
          '说': 219817,
          '一': 217830,
          '到': 205341,
          '都': 202780,
          '等': 195934,
          '着': 188584,
          '对': 184674,
          '来': 161501,
          '与': 160984,
          '地': 160541,
          '还': 157058,
          '要': 156581,
          '又': 150749,
          '大': 144099,
          '而': 143233,
          '之': 140957,
          '道': 140545,
          '以': 136106,
          '得': 134479,
          '她': 134035,
          '个': 125538,
          '后': 124793,
          '去': 123402,
          '将': 122305,
          

### 计算sbme所有字的频率和并取对数

In [5]:
log_total = {i: log(sum(hmm_model[i].values())) for i in 'sbme'}
log_total

{'s': 17.214580210949702,
 'b': 17.298268018586587,
 'm': 15.597015630115711,
 'e': 17.298268018586587}

### 转移矩阵trans,计算各标签的转移概率，P(O(k)|O(k-1))

In [6]:
trans = {'ss': 0.3,
         'sb': 0.7,
         'bm': 0.3,
         'be': 0.7,
         'mm': 0.3,
         'me': 0.7,
         'es': 0.3,
         'eb': 0.7
         }

trans = {i: log(j) for i, j in trans.items()}
trans

{'ss': -1.2039728043259361,
 'sb': -0.35667494393873245,
 'bm': -1.2039728043259361,
 'be': -0.35667494393873245,
 'mm': -1.2039728043259361,
 'me': -0.35667494393873245,
 'es': -1.2039728043259361,
 'eb': -0.35667494393873245}

### 根据结巴分词的状态转移矩阵，value值为概率对数

In [28]:
P={'B': {'E': -0.510825623765990, 'M': -0.916290731874155},
 'E': {'B': -0.5897149736854513, 'S': -0.8085250474669937},
 'M': {'E': -0.33344856811948514, 'M': -1.2603623820268226},
 'S': {'B': -0.7211965654669841, 'S': -0.6658631448798212}}
trans = {}
for key, values in P.items():
    for k,v in values.items():
        trans[(key+k).lower()] = v 
trans

{'be': -0.51082562376599,
 'bm': -0.916290731874155,
 'eb': -0.5897149736854513,
 'es': -0.8085250474669937,
 'me': -0.33344856811948514,
 'mm': -1.2603623820268226,
 'sb': -0.7211965654669841,
 'ss': -0.6658631448798212}

### 初始概率，第一个词只可能是b或者s

In [39]:
start_p={'b': -0.26268660809250016,
 'e': -3.14e+100,
 'm': -3.14e+100,
 's': -1.4652633398537678}

### viterbi算法， 动态规划算法实现，得到最优路径，即得到概率最大的sbme组合

In [67]:
def viterbi(start_p, nodes, trans):
    # paths以字典的方式缓存状态序列，以及概率值，初始位置只有start_p
    paths = start_p  
    # 从第二个词开始计算可能的最大概率路径
    for l in range(1, len(nodes)): 
        paths_ = paths # paths_缓存上一刻的状态
        print('{}:'.format(l-1), paths_)
        paths = {}
        # 在l时刻，即第l个词的位置，分别对s,b,m,e四种状态值分析,计算在该状态值中最大概率路径，并保存到缓存paths中
        for i in nodes[l]: 
            nows = {} # 当前时刻状态值为i的所有可能路径缓存
            # j为所有之前时刻缓存的路径
            for j in paths_: 
                # j[-1]为j路径最后一个状态值，比如j为'bess'，判断s+i是否是可能组合，比如se就不可能
                if j[-1]+i in trans:  
                    # l时刻下状态值i的路径概率 = 之前路径概率 + token到i的发射概率 + j[-1]i的转移概率 
                    nows[j+i] = paths_[j]+nodes[l][i]+trans[j[-1]+i]
            # 选取nows中概率最大的路径
            prob_i, path_i = max((v, k) for k,v in nows.items())
            paths[path_i] = prob_i
    print('{}:'.format(len(nodes)-1), paths)
    # 求出最后一个字哪一种状态的对应概率最大，最后一个字只可能是两种情况：e(结尾)和s(独立词)
    prob, states = max((v, k) for k,v in paths.items() if k[-1] in 'es')
    return prob, states

### HMM模型，对输入句子做分词

In [71]:
def hmm_cut(s):
    # nodes 为输入语句s中每个token分别为sbme的概率
    nodes = [{i: log(j[t]+1)-log_total[i] for i, j in hmm_model.items()} for t in s]
    _, tags = viterbi(start_p, nodes, trans)
    print('tags: ', tags)
    words = [s[0]]
    for i in range(1, len(s)):
        if tags[i] in ['b', 's']:
            words.append(s[i])
        else:
            words[-1] += s[i]
    return words

In [72]:
text = '华为手机深得大家的喜欢'
print(' '.join(hmm_cut(text)))

0: {'b': -0.26268660809250016, 'e': -3.14e+100, 'm': -3.14e+100, 's': -1.4652633398537678}
1: {'ss': -6.7477507587683565, 'sb': -8.514824192919416, 'bm': -7.803021856684376, 'be': -5.779768469237251}
2: {'bes': -13.546372927012078, 'beb': -12.721481505265722, 'bmm': -16.329054443981185, 'bme': -14.38750591207887}
3: {'bess': -22.58204745531191, 'besb': -20.100674738295005, 'bebm': -20.053773324661236, 'bebe': -19.39785648454214}
4: {'bebes': -28.147928301644228, 'bebeb': -26.84139484673273, 'besbm': -29.56559469156299, 'bebme': -28.614871482566265}
5: {'bebess': -34.21920088939125, 'bebesb': -35.64550863881571, 'bebebm': -33.966633148962686, 'bebebe': -33.14651295553467}
6: {'bebebes': -39.29135543196375, 'bebebeb': -38.32672117128855, 'bebebmm': -38.95179717057812, 'bebebme': -39.73898048704186}
7: {'bebebess': -46.54991062378507, 'bebebesb': -46.739964735908735, 'bebebebm': -44.26298344994734, 'bebebebe': -43.90148032305975}
8: {'bebebebes': -46.94960190563267, 'bebebebeb': -52.88266

In [57]:
text = '王五的老师经常夸奖他'
print(' '.join(hmm_cut(text)))

{'b': -0.26268660809250016, 'e': -3.14e+100, 'm': -3.14e+100, 's': -1.4652633398537678}
{'ss': -9.68645865775611, 'sb': -8.818124129286224, 'bm': -7.463457085791514, 'be': -9.059281246920886}
{'bes': -12.107402829493807, 'beb': -18.040464355602982, 'bmm': -15.843214681637635, 'bme': -14.541237693934628}
{'bess': -19.570816701526986, 'besb': -18.922711473359414, 'bmmm': -24.280791848372584, 'bmme': -24.25305766055385}
{'besss': -28.04247874670649, 'bessb': -27.348786960140785, 'besbm': -27.02774705115723, 'besbe': -26.154353202032688}
{'besbes': -34.22313521821076, 'besbeb': -32.33307943547145, 'bessbm': -34.17204654544735, 'besbme': -33.44614438955326}
{'besbmes': -41.993242777599676, 'besbmeb': -40.8388334763909, 'besbebm': -38.619004790510836, 'besbebe': -39.17485448546041}
{'besbebes': -50.4283177670246, 'besbebeb': -49.23161726312815, 'besbebmm': -50.7056981781877, 'besbebme': -50.98803118831202}
{'besbebess': -60.262211765571045, 'besbebesb': -59.92063882880876, 'besbebebm': -59.5

###  nodes为对于每个token，分别为's'、'b'、'm'、'e'的概率

In [13]:
text = ''
nodes = [{i:log(j[t]+1)-log_total[i] for i,j in hmm_model.items()} for t in text]
nodes

[{'s': -8.048505255271534,
  'b': -6.351258143700308,
  'm': -9.045935295072308,
  'e': -9.842391331094763},
 {'s': -6.181223980094909,
  'b': -7.0809190818153755,
  'm': -6.919576089557378,
  'e': -6.572075401806883},
 {'s': -3.625980290868881,
  'b': -7.296429459703505,
  'm': -7.5718263082248765,
  'e': -5.009385727657326},
 {'s': -4.923105399181942,
  'b': -3.6294806261874406,
  'm': -4.350454418914257,
  'e': -6.029722509594453},
 {'s': -5.474208464698561,
  'b': -7.1515990209997415,
  'm': -6.541809755089519,
  'e': -4.569919553697922},
 {'s': -5.77914072487132,
  'b': -6.813067765936818,
  'm': -7.550466272832633,
  'e': -6.5204582189988045},
 {'s': -11.834682857409241,
  'b': -7.4401440078019885,
  'm': -7.577402835715445,
  'e': -8.922177668148349},
 {'s': -7.357660294229909,
  'b': -7.9043568704160165,
  'm': -5.646024454935617,
  'e': -4.804200849422587}]

In [14]:
hmm_model['s']['李']

9566

In [15]:
hmm_model['b']['李']

56783

In [16]:
hmm_model['m']['李']

699

In [17]:
hmm_model['e']['李']

1729

In [61]:
paths = {'bebebebesss': -66.66337116442051, 'bebebebessb': -64.60666567893038, 'bebebebesbm': -65.49347468402344, 'bebebebesbe': -63.772287447683404}
(prob, state) = max((v, k) for k,v in paths.items() if k[-1] in 'es')
print((prob, state))

(-63.772287447683404, 'bebebebesbe')
