In [35]:
import pandas as pd
import numpy as np
from src.preprocess.text import SentenceGetter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from tqdm.notebook import tqdm

In [2]:
# Load the NER corpus from CSV.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
ner_dataset = pd.read_csv("/Users/Mikhail_Bulgakov/GitRepo/pos_ner_task/data/ner_dataset.csv", delimiter=',', encoding='unicode_escape')
# Forward-fill missing cells with the value from the row above.
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is deprecated
# in recent pandas releases and emits a FutureWarning.
ner_dataset = ner_dataset.ffill()

In [3]:
# Preview the first 30 rows of the forward-filled dataset.
ner_dataset.head(30)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
5,Sentence: 1,through,IN,O
6,Sentence: 1,London,NNP,B-geo
7,Sentence: 1,to,TO,O
8,Sentence: 1,protest,VB,O
9,Sentence: 1,the,DT,O


Creating the initial-probability, transition, and emission matrices

In [4]:
# Wrap the dataset in the project's SentenceGetter helper; its get_full_data()
# result is what gets split into train/test below.
sg = SentenceGetter(ner_dataset)

In [5]:
# Hold out 20% of the items returned by get_full_data() for evaluation.
# A fixed random_state makes the shuffle deterministic, so the split (and every
# number derived from it) is reproducible after a kernel restart.
train_data, test_data = train_test_split(sg.get_full_data(), test_size=0.2, random_state=42)

In [6]:
# Report the size of each split, one number per line (train first, then test).
print(len(train_data), len(test_data), sep="\n")

38367
9592


In [7]:
# Hidden states: the distinct NER tags, in order of first appearance.
states = ner_dataset["Tag"].unique().tolist()
# Observations: the distinct words, also in order of first appearance.
# unique() is used instead of list(set(...)) because set iteration order for
# strings can change between interpreter runs (hash randomisation), which would
# silently reshuffle the word -> column mapping of the emission matrix.
observations = ner_dataset["Word"].unique().tolist()

In [8]:
# Number of hidden states and number of distinct observations.
states_num, observations_num = len(states), len(observations)

In [9]:
# Zero-filled tables:
#   init_prob         -> 1 x states
#   transition_matrix -> states x states
#   emission_matrix   -> states x observations
init_prob = np.zeros((1, states_num))
transition_matrix = np.zeros((states_num, states_num))
emission_matrix = np.zeros((states_num, observations_num))

In [10]:
# Lookup tables mapping each state / observation to its position in the
# corresponding list, i.e. its row / column index in the matrices.
states_to_idx = dict(zip(states, range(len(states))))
observations_to_idx = dict(zip(observations, range(len(observations))))

In [11]:
# Display the initial-probability row vector.
init_prob

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.]])

In [12]:
# Shape check: one row, one column per state.
init_prob.shape

(1, 17)

0.0

Initial-probability matrix

In [14]:
# Show the first training sentence to inspect the structure of one item.
print(train_data[0])

[('The', 'DT', 'O'), ('Islamic', 'JJ', 'O'), ('countries', 'NNS', 'O'), ('called', 'VBN', 'O'), ('on', 'IN', 'O'), ('the', 'DT', 'O'), ('council', 'NN', 'O'), ('to', 'TO', 'O'), ('take', 'VB', 'O'), ('action', 'NN', 'O'), ('on', 'IN', 'O'), ('what', 'WP', 'O'), ('they', 'PRP', 'O'), ('called', 'VBD', 'O'), ('"', '``', 'O'), ('gross', 'JJ', 'O'), ('violations', 'NNS', 'O'), ('of', 'IN', 'O'), ('human', 'JJ', 'O'), ('rights', 'NNS', 'O'), ('by', 'IN', 'O'), ('Israel', 'NNP', 'B-geo'), ('in', 'IN', 'O'), ('Lebanon', 'NNP', 'B-geo'), ('.', '.', 'O'), ('"', '``', 'O')]


In [54]:
# Shape check: one row, one column per state.
init_prob.shape

(1, 17)

Transition matrix

In [55]:
# Shape check: states x states.
transition_matrix.shape

(17, 17)

Emission matrix

In [56]:
# Shape check: states x observations.
emission_matrix.shape

(17, 35178)

In [204]:
# Toy model small enough to verify by hand: three hidden states and five
# observed symbols plus an extra "Unk" entry (presumably a placeholder for
# unseen observations).
# NOTE: this rebinds the same variable names used for the full dataset above.
states = ['A', 'B', 'C']
observ = ['x', 'y', 'z', 'u', 'v'] + ["Unk"]

states_num, observations_num = len(states), len(observ)

# Zero-filled tables: 1 x states, states x states, states x observations.
init_prob = np.zeros((1, states_num))
transition_matrix = np.zeros((states_num, states_num))
emission_matrix = np.zeros((states_num, observations_num))

# Label -> row index and symbol -> column index.
states_to_idx = dict(zip(states, range(states_num)))
observations_to_idx = dict(zip(observ, range(observations_num)))

In [205]:
# Toy training corpus: three sentences, each a list of (symbol, "", state)
# triples. The middle slot is left empty here.
train_data = [
    [(symbol, "", state) for symbol, state in sentence]
    for sentence in (
        [('x', 'A'), ('y', 'B'), ('u', 'A')],
        [('v', 'A'), ('y', 'B'), ('z', 'B'), ('x', 'A'), ('u', 'C')],
        [('u', 'C'), ('x', 'B')],
    )
]

In [206]:
# Estimate the model parameters by counting over the tagged sentences.
# Each sentence is a list of 3-tuples; index 0 is the observed word and
# index 2 is its state (tag).

# Start from fresh count tables so that re-running this cell does not add new
# counts on top of the already-normalised probabilities from a previous run.
init_prob = np.zeros(shape=(1, states_num))
transition_matrix = np.zeros(shape=(states_num, states_num))
emission_matrix = np.zeros(shape=(states_num, observations_num))

for sent in tqdm(train_data):
    if not sent:
        # An empty sentence has no first tag and contributes no counts.
        continue

    tag_ids = [states_to_idx[token[2]] for token in sent]
    word_ids = [observations_to_idx[token[0]] for token in sent]

    # The first tag of the sentence feeds the initial-state counts.
    init_prob[0, tag_ids[0]] += 1
    # Every adjacent tag pair is one observed transition.
    for prev_tag, next_tag in zip(tag_ids, tag_ids[1:]):
        transition_matrix[prev_tag, next_tag] += 1
    # Every token (including the last one) is one observed emission.
    for tag_id, word_id in zip(tag_ids, word_ids):
        emission_matrix[tag_id, word_id] += 1

# Add-one smoothing on emissions so no (state, observation) pair has zero probability.
emission_matrix += 1
init_prob = init_prob/init_prob.sum()

# Turn counts into per-row (per-state) probability distributions.
# NOTE(review): transitions and initial counts are not smoothed; a state with no
# outgoing transitions would presumably keep an all-zero row - confirm this is acceptable.
transition_matrix = normalize(transition_matrix, axis=1, norm='l1')
emission_matrix = normalize(emission_matrix, axis=1, norm='l1')

  0%|          | 0/3 [00:00<?, ?it/s]

In [207]:
# Inspect the emission matrix (rows = states, columns = observations).
emission_matrix

array([[0.3  , 0.1  , 0.1  , 0.2  , 0.2  , 0.1  ],
       [0.2  , 0.3  , 0.2  , 0.1  , 0.1  , 0.1  ],
       [0.125, 0.125, 0.125, 0.375, 0.125, 0.125]])

In [202]:
# Scratch check: per-observation (column) totals of the emission matrix, offset
# by the number of observations.
# NOTE(review): this cell's execution count (202) precedes the cell that builds
# the toy model (204), so the displayed output comes from an earlier kernel state.
emission_matrix.sum(axis=0)+len(observ)

array([12., 11., 10., 12., 10.,  9.])

In [203]:
# Scratch experiment: divide every entry by (its column total + number of observations).
# NOTE(review): this normalises per observation (axis=0), whereas the counting
# cell normalises per state (axis=1) - confirm which one is intended.
emission_matrix/(emission_matrix.sum(axis=0)+len(observ))

array([[0.25      , 0.09090909, 0.1       , 0.16666667, 0.2       ,
        0.11111111],
       [0.16666667, 0.27272727, 0.2       , 0.08333333, 0.1       ,
        0.11111111],
       [0.08333333, 0.09090909, 0.1       , 0.25      , 0.1       ,
        0.11111111]])

In [73]:
# Shape check: one row, one column per state.
init_prob.shape

(1, 3)

In [74]:
# Shape check: states x states.
transition_matrix.shape

(3, 3)

In [75]:
# Shape check: states x observations.
emission_matrix.shape

(3, 5)

In [76]:
# Sanity check: the initial distribution should sum to 1.
init_prob.sum()

1.0

In [77]:
# Sanity check: row sums of the transition matrix (one value per state).
transition_matrix.sum(axis=1)

array([1., 1., 1.])

In [78]:
# Sanity check: row sums of the emission matrix (one value per state).
emission_matrix.sum(axis=1)

array([1., 1., 1.])