In [40]:
from collections import defaultdict

import numpy as np
import pandas as pd
import jieba

from scipy.sparse import csr_matrix, csr_array

In [28]:
import numpy as np

def viterbi(y, A, B, Pi=None):
    """
    Return the MAP estimate of state trajectory of Hidden Markov Model.

    Parameters
    ----------
    y : array (T,)
        Observation state sequence. int dtype.
    A : array (K, K)
        State transition matrix. See HiddenMarkovModel.state_transition  for
        details.
    B : array (K, M)
        Emission matrix. See HiddenMarkovModel.emission for details.
    Pi: optional, (K,)
        Initial state probabilities: Pi[i] is the probability x[0] == i. If
        None, uniform initial distribution is assumed (Pi[:] == 1/K).

    Returns
    -------
    x : array (T,)
        Maximum a posteriori probability estimate of hidden state trajectory,
        conditioned on observation sequence y under the model parameters A, B,
        Pi.
    T1: array (K, T)
        the probability of the most likely path so far
    T2: array (K, T)
        the x_j-1 of the most likely path so far
    """
    # Cardinality of the state space
    K = A.shape[0]
    # Initialize the priors with default (uniform dist) if not given by caller
    Pi = Pi if Pi is not None else np.full(K, 1 / K)
    T = len(y)
    T1 = np.empty((K, T), 'd')
    T2 = np.empty((K, T), 'B')

    # Initilaize the tracking tables from first observation
    T1[:, 0] = Pi * B[:, y[0]]
    T2[:, 0] = 0

    # Iterate throught the observations updating the tracking tables
    for i in range(1, T):
        T1[:, i] = np.max(T1[:, i - 1] * A.T * B[np.newaxis, :, y[i]].T, 1)
        T2[:, i] = np.argmax(T1[:, i - 1] * A.T, 1)

    # Build the output, optimal model trajectory
    x = np.empty(T, 'B')
    x[-1] = np.argmax(T1[:, T - 1])
    for i in reversed(range(1, T)):
        x[i - 1] = T2[x[i], i]

    return x, T1, T2

In [19]:
df = pd.read_csv("./ok_data_level3-4/ok_data_level3.csv").iloc[:-3]

In [45]:
states = ["s", "m", "e", "suf"]

In [29]:
df.head()

Unnamed: 0,id,pid,deep,name,pinyin_prefix,pinyin,ext_id,ext_name,suffix
0,11,0,0,北京,b,bei jing,110000000000,北京市,市
1,1101,11,1,北京,b,bei jing,110100000000,北京市,市
2,110101,1101,2,东城,d,dong cheng,110101000000,东城区,区
3,110102,1101,2,西城,x,xi cheng,110102000000,西城区,区
4,110105,1101,2,朝阳,c,chao yang,110105000000,朝阳区,区


In [26]:
df["suffix"] = df.apply(lambda row: row["ext_name"].split(row["name"])[1], axis=1)

In [87]:
emission_freq = defaultdict(int)
transition_matrix = np.zeros((len(states), len(states) + 1))

token_dict = dict()
token_increment = 0

for name in df["name"]:
    for i, token in enumerate(name):
        if i == 0:
            prev_state = None
            state = "s"
        elif i == len(name) - 1:
            prev_state = "m" if len(name) > 2 else "s"
            state = "e"
        else:
            prev_state = "s" if i == 1 else "m"
            state = "m"

        state_index = states.index(state)

        if prev_state:
            prev_state_index = states.index(prev_state)
            transition_matrix[prev_state_index, state_index] += 1

        if token in token_dict:
            token_index = token_dict[token]
        else:
            token_increment += 1
            token_dict[token] = token_increment
            token_index = token_increment

        emission_freq[(state_index, token_index)] += 1

for suffix in df["suffix"]:
    for i, token in enumerate(suffix):
        state = "suf"
        state_index = states.index(state)
        
        if token in token_dict:
            token_index = token_dict[token]
        else:
            token_increment += 1
            token_dict[token] = token_increment
            token_index = token_increment

        emission_freq[(state_index, token_index)] += 1

transition_matrix[2, 3] = df.shape[0]
transition_matrix[2, 0] = 0.5 * df.shape[0]
transition_matrix[2, 4] = 0.5 * df.shape[0]
transition_matrix[3, 0] = df.shape[0]
transition_matrix[3, 3] = df.shape[0]
transition_matrix[3, 4] = df.shape[0]

transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)
transition_matrix = transition_matrix[:, :-1]

initial_probs = np.array([1, 0, 0, 0])

In [88]:
state_indices = []
token_indices = []
frequencies = []

for (si, ti), freq in emission_freq.items():
    state_indices.append(si)
    token_indices.append(ti)
    frequencies.append(freq)

emission_matrix = csr_matrix((frequencies, (state_indices, token_indices))).toarray()

In [89]:
def infer(text, token_dict, transition_matrix, emission_matrix, initial_probs=None):
    indices = []
    for c in text:
        indices.append(token_dict[c])
    
    return viterbi(indices, A=transition_matrix, B=emission_matrix, Pi=initial_probs)
        

In [99]:
infer("广西市海淀土家族苗族自治县", token_dict, transition_matrix, emission_matrix, initial_probs)

(array([0, 2, 3, 0, 2, 3, 3, 3, 3, 3, 3, 3, 3], dtype=uint8),
 array([[2.10000000e+01, 0.00000000e+00, 1.10984191e+03, 2.83724918e+06,
         0.00000000e+00, 2.56320558e+06, 0.00000000e+00, 0.00000000e+00,
         2.48061340e+09, 0.00000000e+00, 1.55121025e+12, 4.03314664e+13,
         0.00000000e+00],
        [0.00000000e+00, 4.05668685e+00, 0.00000000e+00, 2.14394337e+02,
         0.00000000e+00, 0.00000000e+00, 2.72331776e+06, 3.78632283e+06,
         0.00000000e+00, 9.58387785e+08, 1.66559959e+09, 7.49139786e+11,
         0.00000000e+00],
        [0.00000000e+00, 7.39894606e+02, 7.67528167e+01, 2.90766975e+04,
         2.56320558e+06, 0.00000000e+00, 4.63126247e+06, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 6.25267867e+08, 4.20414686e+12,
         4.62736432e+15],
        [0.00000000e+00, 0.00000000e+00, 2.65992111e+05, 0.00000000e+00,
         0.00000000e+00, 1.66608363e+07, 5.55361209e+07, 3.72092010e+09,
         3.47285876e+10, 2.32681537e+12, 1.20994399e+14, 

In [48]:
emission_matrix

<4x1296 sparse matrix of type '<class 'numpy.intc'>'
	with 1987 stored elements in Compressed Sparse Row format>

In [12]:
import jieba
import jieba.posseg as pseg
# words = pseg.cut("我爱北京天安门") #jieba默认模式
# jieba.enable_paddle() #启动paddle模式。 0.40版之后开始支持，早期版本不支持
words = pseg.cut("北京海淀") #paddle模式
for word, flag in words:
    print('%s %s' % (word, flag))


北京 ns
海淀 ns
