# POS tagger with Viterbi algorithm

References:
* [1] [한국어 형태소 분석기와 품사 태거 구현](https://github.com/gritmind/morph_and_pos_analyzer_korean)
* [2] [A deep dive into part-of-speech tagging using the Viterbi algorithm](https://medium.freecodecamp.org/a-deep-dive-into-part-of-speech-tagging-using-viterbi-algorithm-17c8de32e8bc)
* [3] [Viterbi matrix for calculating the best POS tag sequence of a HMM POS tagger](https://www.youtube.com/watch?v=_568XqOByTs)

In [38]:
#####################################################################################################
""" Load Resources  and Input Data """
#####################################################################################################
# Resources (i.e. transition probabilities, output probabilities)
# Input Data (i.e. multiple sentences to be tagged)

# resources for test
# Transition probabilities, keyed by (previous_tag, current_tag).
trans_prob_dict = {
    ('Noun', 'Noun'): 1 * 10 ** -1,
    ('Verb', 'Noun'): 4 * 10 ** -1,
    ('Verb', 'Adv'): 4 * 10 ** -1,
    ('Noun', 'Adv'): 1 * 10 ** -1,
    ('Noun', 'Verb'): 3 * 10 ** -1,
    ('Verb', 'Verb'): 1 * 10 ** -1,
    ('<s>', 'Noun'): 2 * 10 ** -1,
    ('<s>', 'Verb'): 3 * 10 ** -1,
    ('Adj', '</s>'): 1 * 10 ** -1,
}
# Output (emission) probabilities, keyed by (tag, word).
output_prob_dict = {
    ('Verb', 'learning'): 3 * 10 ** -3,
    ('Verb', 'changes'): 4 * 10 ** -3,
    ('Adv', 'thoroughly'): 2 * 10 ** -3,
    ('Noun', 'learning'): 1 * 10 ** -3,
    ('Noun', 'changes'): 3 * 10 ** -3,
}

# input data for test
input_word_list = ["learning changes thoroughly", "learning changes", "learning"]


# extract more information from resources
# Every tag that appears on either side of a transition, each listed once.
# The list is sorted so that tag indices (and therefore tie-breaking between
# equally probable tags) are identical on every run; iterating a plain set of
# strings can yield a different order from one interpreter run to the next.
set_tag_list = sorted({tag for transition in trans_prob_dict for tag in transition})
len_set_tag_list = len(set_tag_list)

In [39]:
#####################################################################################################
""" Viterbi Algorithm """
#####################################################################################################
# The joint probability is defined by the Hidden Markov Model and is composed of
# transition probabilities and output probabilities.
# The transition and output probabilities for every word/tag combination must be
# obtained in advance.
# Fallback probability used in place of any (tag, tag) or (tag, word) pair that is
# missing from the probability dictionaries.
smoothing = 1e-9

def ExtractMax_Prob_Index(total_prob_state, i, j, trans_prob_dict, output_prob_state):
    """Return the best joint probability for tag ``j`` at word ``i`` and its predecessor.

    For every candidate previous tag ``k`` the product
    ``total_prob_state[i-1][k] * P(tag_j | tag_k) * output_prob_state[i][j]``
    is evaluated, and the largest product is selected.

    Args:
        total_prob_state: 2D list (word axis x tag axis) of joint probabilities;
            row ``i-1`` must already be filled in.
        i: Index of the current word (row) in the state tables.
        j: Index of the current tag (column) in ``set_tag_list``.
        trans_prob_dict: Mapping ``(previous_tag, current_tag) -> probability``.
            Missing pairs fall back to the module-level ``smoothing`` value.
        output_prob_state: 2D list (word axis x tag axis) of output probabilities.

    Returns:
        A ``(max_prob, argmax_idx)`` tuple: the highest joint probability and the
        index of the previous tag that produced it. On ties the lowest index wins.
    """
    cur_tag = set_tag_list[j]
    cur_output_prob = output_prob_state[i][j]
    prev_total_probs = total_prob_state[i - 1]

    ## Joint probability bottom-up process - compute the joint probability of
    ## reaching the current 2D-state from every previous tag.
    # NOTE: probabilities are multiplied directly (mind the indices); a log-sum
    # formulation is worth considering.
    candidates = [
        prev_total_probs[k]
        * trans_prob_dict.get((prev_tag, cur_tag), smoothing)
        * cur_output_prob
        for k, prev_tag in enumerate(set_tag_list)
    ]

    # Pick the previous tag-axis state index with the highest probability.
    max_prob = max(candidates)
    argmax_idx = candidates.index(max_prob)
    return max_prob, argmax_idx


def ViterbiAlgorithm(splited_sent):
    """Tag a tokenised sentence with the Viterbi algorithm and print the tags.

    The function reads the module-level ``set_tag_list``, ``trans_prob_dict``,
    ``output_prob_dict`` and ``smoothing``. Any transition or output probability
    missing from the dictionaries is replaced by ``smoothing``.

    Args:
        splited_sent: List of word strings (one sentence, already split).

    Returns:
        None. The chosen tag sequence is printed as ``Output:\\t <tags>``, with the
        tags separated by single spaces. An empty sentence prints an empty
        tag sequence.

    Raises:
        ValueError: If ``'<s>'`` is not present in ``set_tag_list`` (raised by
            ``list.index``) and the sentence is non-empty.
    """
    num_words = len(splited_sent)
    if num_words == 0:
        # Nothing to tag; without this guard the final-row lookup below would
        # fail with an IndexError.
        print('Output:\t', '')
        return

    num_tags = len(set_tag_list)

    ## STEP0 - Preparation (build a 2D-state space of [x=word, y=tag])
    # Two of the three tables are allocated here: joint probabilities and the
    # previous-tag index needed for backtracking. The output-probability table
    # is built directly in STEP1.
    total_prob_state = [[0] * num_tags for _ in range(num_words)]
    total_prev_state_idx = [[-1] * num_tags for _ in range(num_words)]

    ## STEP1 - Fill the output-probability 2D-state for every (word, tag) pair.
    output_prob_state = [
        [output_prob_dict.get((tag, word), smoothing) for tag in set_tag_list]
        for word in splited_sent
    ]

    ## STEP2 - Joint probability bottom-up process (dynamic programming)
    # Word axis: notion of time; tag axis: notion of space.
    # First word: the previous tag is always <s>, and the previous joint
    # probability is 1 because no earlier state exists.
    start_tag_idx = set_tag_list.index('<s>')
    for j, tag in enumerate(set_tag_list):
        start_trans_prob = trans_prob_dict.get(('<s>', tag), smoothing)
        total_prob_state[0][j] = start_trans_prob * output_prob_state[0][j]
        total_prev_state_idx[0][j] = start_tag_idx

    # Remaining words: keep, for every tag, the best probability and the
    # previous tag index that achieved it - the core of the dynamic programme.
    for i in range(1, num_words):
        for j in range(num_tags):
            total_prob_state[i][j], total_prev_state_idx[i][j] = ExtractMax_Prob_Index(
                total_prob_state, i, j, trans_prob_dict, output_prob_state
            )

    ## STEP3 - Preparing for backtracking
    # Select the tag of the last word that maximises the probability once the
    # transition into </s> is included (no output probability follows the end).
    last_total_probs = total_prob_state[num_words - 1]
    final_probs = [
        last_total_probs[j] * trans_prob_dict.get((tag, '</s>'), smoothing)
        for j, tag in enumerate(set_tag_list)
    ]
    end_state_idx = final_probs.index(max(final_probs))

    ## STEP4 - Executing Backtracking
    # Walk from the last word to the first, following the stored previous-tag
    # indices. The back-pointer of the first word leads to <s>, which is not
    # part of the output.
    pos_sent = ['none_tag'] * num_words
    tag_idx = end_state_idx
    for i in range(num_words - 1, -1, -1):
        pos_sent[i] = set_tag_list[tag_idx]
        tag_idx = total_prev_state_idx[i][tag_idx]

    print('Output:\t', ' '.join(pos_sent))

    
#####################################################################################################
""" Main """
#####################################################################################################
# Tag every test sentence in turn: echo the raw sentence, let the tagger print
# its tag sequence, then print a separator line.
for sentence in input_word_list:
    print('Input:\t', sentence)
    words = sentence.split()
    ViterbiAlgorithm(words)
    print('-----------------------------')


Input:	 learning changes thoroughly
Output:	 Verb Verb Adv
-----------------------------
Input:	 learning changes
Output:	 Verb Noun
-----------------------------
Input:	 learning
Output:	 Verb
-----------------------------


The results above match those shown in reference [3].