In [43]:
import numpy as np
import nltk
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pprint
from nltk.corpus import brown
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import OrderedDict, deque

# POS tagging


<img src="https://blog.aaronccwong.com/assets/images/bigram-hmm/pos-title.jpg" alt="topic_modeling" style="width: 620px;"/>


Prepare the texts to tag

In [10]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sergei\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
with open('wiki_lingvo.txt', mode='r', encoding='utf-8') as file:
    text = file.readlines()
    
len(text)

2000

In [12]:
text[0]

'<p><b>Leipon</b>, or <b>Pityilu</b>, is an Austronesian language spoken on Hauwai, Ndrilo, and Pityilu islands, just off Manus Island in Papua New Guinea. </p>\n'

In [41]:
#delete html tags
text = [BeautifulSoup(t, 'lxml').text for t in text]

In [42]:
text[4]

"Judeo-Yemeni Arabic (also known as Judeo-Yemeni and Yemenite Judeo-Arabic) is a variety of Arabic spoken by Jews living or formerly living in Yemen. The language is quite different from mainstream Yemeni Arabic, and is written in the Hebrew alphabet. The cities of Sana'a, Aden, al-Bayda, and Habban District and the villages in their districts each have (or had) their own dialect.The vast majority of Yemenite Jews have relocated to Israel and have shifted to Modern Hebrew as their first language. In 1995, Israel was home to 50,000 speakers of Judeo-Yemeni in 1995, while 1,000 remained in Yemen.  According to Yemeni rabbi al-Marhabi, most of these have since left for the United States. As of  2010, fewer than 300 Jews were believed to remain in Yemen.\n"

Tagging model

In [15]:
nltk.download("brown")
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Sergei\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Sergei\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [20]:
brown_tagged_sents = brown.tagged_sents(tagset="universal")
brown_tagged_sents[0]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ("Atlanta's", 'NOUN'),
 ('recent', 'ADJ'),
 ('primary', 'NOUN'),
 ('election', 'NOUN'),
 ('produced', 'VERB'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ("''", '.'),
 ('that', 'ADP'),
 ('any', 'DET'),
 ('irregularities', 'NOUN'),
 ('took', 'VERB'),
 ('place', 'NOUN'),
 ('.', '.')]

In [21]:
brown_tagged_words = brown.tagged_words(tagset='universal')
brown_tagged_words = list(map(lambda x: (x[0].lower(), x[1]), brown_tagged_words))

In [22]:
brown_tagged_words[0]

('the', 'DET')

In [25]:
tags = [tag for (word, tag) in brown_tagged_words]
words = [word for (word, tag) in brown_tagged_words]

tag_num = pd.Series(nltk.FreqDist(tags)).sort_values(ascending=False)
word_num = pd.Series(nltk.FreqDist(words)).sort_values(ascending=False)

In [26]:
brown_tagged_sents = brown.tagged_sents(tagset="universal")
my_brown_tagged_sents = []

for sent in brown_tagged_sents:
    my_brown_tagged_sents.append(list(map(lambda x: (x[0].lower(),x[1]), sent)))

my_brown_tagged_sents = np.array(my_brown_tagged_sents)

In [29]:
train_sents, test_sents = train_test_split(my_brown_tagged_sents, test_size=0.1, shuffle=True)

Lets train simple HMM model

In [38]:
class HiddenMarkovModel:    
    def __init__(self):
    
        pass
        
    def fit(self, train_tokens_tags_list):
        tags = [tag for sent in train_tokens_tags_list
                    for (word, tag) in sent]
        words = [word for sent in train_tokens_tags_list
                      for (word, tag) in sent]
        
        tag_num = pd.Series(nltk.FreqDist(tags)).sort_index()
        word_num = pd.Series(nltk.FreqDist(words)).sort_values(ascending=False)
         
        self.tags = tag_num.index
        self.words = word_num.index
        
        A = pd.DataFrame({'{}'.format(tag) : [0] * len(tag_num) for tag in tag_num.index}, index=tag_num.index)
        B = pd.DataFrame({'{}'.format(tag) : [0] * len(word_num) for tag in tag_num.index}, index=word_num.index)
        
        for sent in train_tokens_tags_list:
            for i in range(len(sent)):
                B.loc[sent[i][0], sent[i][1]] += 1
                if len(sent) - 1 != i:
                    A.loc[sent[i][1], sent[i + 1][1]] += 1
                    
        A = A.divide(A.sum(axis=1), axis=0)        
        B = B / np.sum(B, axis=0)        
        self.A = A
        self.B = B
        
        return self
        
    
    def predict(self, test_tokens_list):
        predict_tags = OrderedDict({i : np.array([]) for i in range(len(test_tokens_list))})
        
        for i_sent in range(len(test_tokens_list)):
            
            current_sent = test_tokens_list[i_sent]
            len_sent = len(current_sent)
            
            q = np.zeros(shape=(len_sent + 1, len(self.tags)))
            q[0] = 1
            back_point = np.zeros(shape=(len_sent + 1, len(self.tags)))
            
            for t in range(len_sent):
                
                if current_sent[t] not in self.words:
                    current_sent[t] = 'time' #most popular word in corpus
                    
                for i_s in range(len(self.tags)):                    
                    s = self.tags[i_s]
                    
                    q[t + 1][i_s] = np.max(q[t,:] *
                        self.A.loc[:, s] * 
                        self.B.loc[current_sent[t], s])
                    
                    back_point[t + 1][i_s] = (q[t,:] *
                        self.A.loc[:, s] * 
                        self.B.loc[current_sent[t], s]).reset_index()[s].idxmax()
                    
            back_point = back_point.astype('int')
            
            back_tag = deque()
            current_tag = np.argmax(q[len_sent])
            for t in range(len_sent, 0, -1):
                back_tag.appendleft(self.tags[current_tag])
                current_tag = back_point[t, current_tag]
             
            predict_tags[i_sent] = np.array(back_tag)
        
        
        return predict_tags   

In [31]:
markov_model = HiddenMarkovModel()
markov_model.fit(train_sents)

<__main__.HiddenMarkovModel at 0x1dc39367408>

In [47]:
from nltk import word_tokenize
word_tokenize(text[0])

['Leipon',
 ',',
 'or',
 'Pityilu',
 ',',
 'is',
 'an',
 'Austronesian',
 'language',
 'spoken',
 'on',
 'Hauwai',
 ',',
 'Ndrilo',
 ',',
 'and',
 'Pityilu',
 'islands',
 ',',
 'just',
 'off',
 'Manus',
 'Island',
 'in',
 'Papua',
 'New',
 'Guinea',
 '.']

In [59]:
#import OrderedDict
pop_word = 'time'
pred = markov_model.predict([word_tokenize(text[0])])

In [66]:
list(zip(word_tokenize(text[0]), pred[0]))

[('Leipon', 'NOUN'),
 (',', '.'),
 ('or', 'CONJ'),
 ('Pityilu', 'NOUN'),
 (',', '.'),
 ('is', 'VERB'),
 ('an', 'DET'),
 ('Austronesian', 'NOUN'),
 ('language', 'NOUN'),
 ('spoken', 'VERB'),
 ('on', 'ADP'),
 ('Hauwai', 'NOUN'),
 (',', '.'),
 ('Ndrilo', 'NOUN'),
 (',', '.'),
 ('and', 'CONJ'),
 ('Pityilu', 'NOUN'),
 ('islands', 'NOUN'),
 (',', '.'),
 ('just', 'ADV'),
 ('off', 'ADP'),
 ('Manus', 'NOUN'),
 ('Island', 'NOUN'),
 ('in', 'ADP'),
 ('Papua', 'NOUN'),
 ('New', 'NOUN'),
 ('Guinea', 'NOUN'),
 ('.', '.')]

Another example

In [68]:
list(zip(word_tokenize(text[66]), markov_model.predict([word_tokenize(text[66])])[0]))

[('Classical', 'NOUN'),
 ('Arabic', 'NOUN'),
 ('is', 'VERB'),
 ('the', 'DET'),
 ('form', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('Arabic', 'NOUN'),
 ('language', 'NOUN'),
 ('used', 'VERB'),
 ('in', 'ADP'),
 ('Umayyad', 'NOUN'),
 ('and', 'CONJ'),
 ('Abbasid', 'NOUN'),
 ('literary', 'ADJ'),
 ('texts', 'NOUN'),
 ('from', 'ADP'),
 ('the', 'DET'),
 ('7th', 'ADJ'),
 ('century', 'NOUN'),
 ('AD', 'NOUN'),
 ('to', 'ADP'),
 ('the', 'DET'),
 ('9th', 'ADJ'),
 ('century', 'NOUN'),
 ('AD', 'NOUN'),
 ('.', '.'),
 ('The', 'NOUN'),
 ('orthography', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('Qurʾān', 'NOUN'),
 ('was', 'VERB'),
 ('not', 'ADV'),
 ('developed', 'VERB'),
 ('for', 'ADP'),
 ('the', 'DET'),
 ('standardized', 'VERB'),
 ('form', 'NOUN'),
 ('of', 'ADP'),
 ('Classical', 'NOUN'),
 ('Arabic', 'NOUN'),
 ('.', '.'),
 ('Modern', 'NOUN'),
 ('Standard', 'NOUN'),
 ('Arabic', 'NOUN'),
 ('(', '.'),
 ('MSA', 'NOUN'),
 (')', '.'),
 ('is', 'VERB'),
 ('its', 'DET'),
 ('direct', 'ADJ'),
 ('descendant', 'NO