# 词性标注
- 词性标注`(Part-Of-Speech tagging, POS tagging)`，判断句子中单词的词性：谓词、虚词、代词、感叹词等
- 本质上属于分类问题，将句子中的单词按词性分类
  
   
- 因此需要词性标注好的语料库，其中给定句子$s=w_1w_2...w_n$及对应的词性 $t=z_1z_2...z_n$

In [1]:
# 语料样本
open('../datasets/pos_tagging_data.txt','r').readline()

'Newsweek/NNP\n'

# 构建`N-gram`语言模型
- 基于语料库，构建`N-gram`模型

In [2]:
# 创建字典，便于将文本数值化
tag2id, id2tag = {}, {}
word2id, id2word = {}, {}

for line in open('../datasets/pos_tagging_data.txt', 'r'):
    items = line.split('/')
    word, tag = items[0], items[1].rstrip()

    if word not in word2id:
        word2id[word] = len(word2id)
        id2word[len(id2word)] = word
    if tag not in tag2id:
        tag2id[tag] = len(tag2id)
        id2tag[len(tag2id)] = tag

M = len(word2id)  # 词典的大小
N = len(tag2id)  # 词性的种类

In [3]:
print(M,N)

18978 54


In [4]:
import numpy as np

In [6]:
pi = np.zeros(N)  # 每个词性出现在句首的概率
A = np.zeros((N, M))  # A[i][j],给定 tag i,出现单词 j 的概率
B = np.zeros((N, N))  # B[i][j],词性为 tag i 时，其后单词的词性为 tag j 的概率

prev_tag = ""
for line in open('../datasets/pos_tagging_data.txt', 'r'):
    items = line.split('/')
    wordId, tagId = word2id[items[0]], tag2id[items[1].rstrip()]

    if prev_tag == "":  # 判断句子的开始
        pi[tagId] += 1
        A[tagId][wordId] += 1
    else:
        A[tagId][wordId] += 1
        B[tag2id[prev_tag]][tagId] += 1

    if items[0] == ".":
        prev_tag = ""
    else:
        prev_tag = items[1].rstrip()

# 转化成概率
pi = pi / sum(pi)
for i in range(N):
    A[i] /= sum(A[i])
    B[i] /= sum(B[i])

In [7]:
pi

array([1.81324111e-01, 0.00000000e+00, 1.00049407e-02, 3.33498024e-03,
       3.95256917e-03, 3.68083004e-02, 1.11660079e-01, 3.66847826e-02,
       6.17588933e-04, 3.81669960e-02, 8.76976285e-03, 5.18774704e-02,
       6.02766798e-02, 2.47035573e-04, 2.17267787e-01, 0.00000000e+00,
       1.48221344e-03, 6.05237154e-03, 8.64624506e-04, 2.47035573e-04,
       0.00000000e+00, 4.73073123e-02, 0.00000000e+00, 7.16403162e-03,
       1.72924901e-03, 2.09980237e-03, 7.53458498e-02, 6.36116601e-02,
       2.59387352e-03, 1.85276680e-03, 5.92885375e-03, 1.97628458e-03,
       2.84090909e-03, 0.00000000e+00, 0.00000000e+00, 2.71739130e-03,
       5.92885375e-03, 5.92885375e-03, 9.88142292e-04, 3.70553360e-04,
       1.23517787e-04, 0.00000000e+00, 0.00000000e+00, 1.85276680e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00])

# 维特比算法求解最优标注

In [8]:
def log_(v):
    if v==0:
        return np.log(v+0.000001)
    return np.log(v)

In [9]:
from math import log


def viterbi(x, pi, A, B):  # x 为输入句子，"I like playing soccer"
    x = [word2id[word] for word in x.split(" ")]
    T = len(x)

    dp = np.zeros((T, N))  # 默认浮点数
    ptr = np.array([[0 for x in range(N)] for y in range(T)])  # 整数

    for j in range(N):
        dp[0][j] = log_(pi[j]) + log_(A[j][x[0]])  # 需要添加平滑项

    for i in range(1, T):
        for j in range(N):
            dp[i][j] = float('-inf')
            for k in range(N):
                score = dp[i - 1][k] + log_(B[k][j]) + log_(A[j][x[i]])
                if score > dp[i][j]:
                    dp[i][j] = score
                    ptr[i][j] = k

    # decoding：找出最好的 tag sequence
    best_seq = [0] * T
    # step1：找出最后一个单词的词性
    best_seq[T - 1] = np.argmax(dp[T - 1])

    # step2：从后向前循环依次找出每个单词的词性
    for i in range(T - 2, -1, -1):
        best_seq[i] = ptr[i + 1][best_seq[i + 1]]

    return [id2tag[id] for id in best_seq]

In [10]:
x = "I like play soccer"
viterbi(x,pi,A,B)

['CC', 'WDT', 'TO', 'VB']