In [1]:
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

# Preprocessing of data

Here I will aim to form an array of arrays of tuples with each word and the following punctuation mark.

Example of single sentence:
[('I', 'SPACE'),
('love', 'SPACE'),
('kittens', 'COMMA'),
('and', 'SPACE').
('you', 'QUESTIONMARK')]

In [2]:
from os import listdir
from os.path import isfile, join

datapath = 'target-sentences-tokenized'
onlyfiles = [join(datapath, f) for f in listdir(datapath) if isfile(join(datapath, f))]

In [3]:
data = []
punctuations = {',', '.', '!', '?'}
ignore = {'"', "'", '-', '–', '—', ':', ';'}
for file in onlyfiles:
    with open(file, 'r') as f:
        lines = f.readlines()
    for l in lines:
        words = l.split()
        curr_sent = []
        i = 0
        while i < len(words) - 1:
            current_pair = [words[i].lower()]
            if words[i].isnumeric():
                current_pair = ['NUM']
            if words[i] in ignore:
                i += 1
                continue
            if words[i + 1] in punctuations:
                current_pair.append(words[i + 1])
                i += 1
            elif words[i + 1] in ignore:
                current_pair.append(' ')
                i += 1
            else:
                current_pair.append(' ')
            curr_sent.append(tuple(current_pair))
            i += 1
        data.append(curr_sent)

In [4]:
data[:10]

[[('я', ' '),
  ('високий', ','),
  ('з', ' '),
  ('чорною', ' '),
  ('бородою', ' '),
  ('й', ' '),
  ('ненавиджу', ' '),
  ('публічність', '.')],
 [('я', ' '),
  ('не', ' '),
  ('маю', ' '),
  ('взагалі', ' '),
  ('грошей', ','),
  ('не', ' '),
  ('люблю', ' '),
  ('вівсянку', ','),
  ('а', ' '),
  ('єдине', ' '),
  ('прагнення', ' '),
  ('мого', ' '),
  ('життя', ' '),
  ('померти', ' '),
  ('в', ' '),
  ('достатку', '.')],
 [('ставлення', ' '),
  ('до', ' '),
  ('інших', ' '),
  ('холодне', ' '),
  ('й', ' '),
  ('безсердечне', '.')],
 [('я', ' '),
  ('не', ' '),
  ('піклуюсь', ' '),
  ('про', ' '),
  ('моїх', ' '),
  ('колег', ','),
  ('а', ' '),
  ('також', ' '),
  ('я', ' '),
  ('ще', ' '),
  ('не', ' '),
  ('дав', ' '),
  ('жодної', ' '),
  ('копійки', ' '),
  ('жебраку', ' '),
  ('чи', ' '),
  ('на', ' '),
  ('милостиню', '.')],
 [('що', ' '),
  ('ж', ','),
  ('мій', ' '),
  ('любий', ' '),
  ('лікарю', ','),
  ('се', ' '),
  ('справжній', ' '),
  ('опис', ' '),
  ('мене', ','

In [5]:
len(data)

29760

---------------

In [6]:
# split data into training and validation set in the ratio 90:10
train_set, test_set = train_test_split(data, train_size=0.9, test_size=0.1, random_state=101)

# create list of train and test tagged words
train_tagged_words = [tup for sent in train_set for tup in sent]
test_tagged_words = [tup for sent in test_set for tup in sent]
print(len(train_tagged_words))
print(len(test_tagged_words))

319254
35346


In [7]:
# check some tagged words.
train_tagged_words[:5]

[('щось', ' '),
 ('схоже', ' '),
 ('стосується', ' '),
 ('й', ' '),
 ('перекладу', '.')]

In [8]:
#use set datatype to check how many unique tags are present in training data
tags = {tag for word, tag in train_tagged_words}
print(len(tags))
print(tags)

# check total words in vocabulary
vocab = {word for word, tag in train_tagged_words}

5
{'!', ',', ' ', '?', '.'}


In [9]:
# compute Emission Probability
def word_given_tag(word, tag, train_bag = train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1]==tag]
    count_tag = len(tag_list)#total number of times the passed tag occurred in train_bag
    w_given_tag_list = [pair[0] for pair in tag_list if pair[0]==word]
#now calculate the total number of times the passed word occurred as the passed tag.
    count_w_given_tag = len(w_given_tag_list)


    return (count_w_given_tag, count_tag)

# compute  Transition Probability
def t2_given_t1(t2, t1, train_bag=train_tagged_words):
    tags = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tags if t==t1])
    count_t2_t1 = 0
    for index in range(len(tags)-1):
        if tags[index]==t1 and tags[index+1] == t2:
            count_t2_t1 += 1
    return (count_t2_t1, count_t1)

# creating t x t transition matrix of tags, t= no of tags
# Matrix(i, j) represents P(jth tag after the ith tag)

tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)):
        tags_matrix[i, j] = t2_given_t1(t2, t1)[0]/t2_given_t1(t2, t1)[1]

print(tags_matrix)

[[0.01500259 0.09622349 0.8556648  0.00206932 0.0305225 ]
 [0.00537951 0.0825138  0.8946513  0.00319408 0.0142613 ]
 [0.00559264 0.11710528 0.7946261  0.00704974 0.07562622]
 [0.01344861 0.08981749 0.8650336  0.00576369 0.0259366 ]
 [0.01138602 0.09963968 0.8527504  0.00614941 0.03007447]]


In [10]:
# convert the matrix to a df for better readability
# the table is same as the transition table shown in section 3 of article
tags_df = pd.DataFrame(tags_matrix, columns=list(tags), index=list(tags))
display(tags_df)

Unnamed: 0,!,",",Unnamed: 3,?,.
!,0.015003,0.096223,0.855665,0.002069,0.030523
",",0.00538,0.082514,0.894651,0.003194,0.014261
,0.005593,0.117105,0.794626,0.00705,0.075626
?,0.013449,0.089817,0.865034,0.005764,0.025937
.,0.011386,0.09964,0.85275,0.006149,0.030074


In [11]:
def Viterbi(words, train_bag=train_tagged_words):
    state = []
    T = list(set([pair[1] for pair in train_bag]))

    for key, word in enumerate(words):
        # initialise list of probability column for a given observation
        p = []
        for tag in T:
            if key == 0:
                transition_p = tags_df.loc['.', tag] + tags_df.loc['?', tag] + tags_df.loc['!', tag]
            else:
                transition_p = tags_df.loc[state[-1], tag]

            # compute emission and state probabilities
            emission_p = word_given_tag(words[key], tag)[0]/word_given_tag(words[key], tag)[1]
            state_probability = emission_p * transition_p
            p.append(state_probability)

        pmax = max(p)
        # getting state for which probability is maximum
        state_max = T[p.index(pmax)]
        state.append(state_max)
    return list(zip(words, state))


In [12]:
# Let's test our Viterbi algorithm on a few sample sentences of test dataset
random.seed(1234)      #define a random seed to get same sentences when run multiple times

# choose random 10 numbers
rndom = [random.randint(1, len(test_set)) for x in range(10)]

# list of 10 sents on which we test the model
test_run = [test_set[i] for i in rndom]

# list of tagged words
test_run_base = [tup for sent in test_run for tup in sent]

# list of untagged words
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

# Here We will only test 10 sentences to check the accuracy
# as testing the whole training set takes huge amount of time
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end-start

print("Time taken in seconds: ", difference)

# accuracy
check = [i for i, j in zip(tagged_seq, test_run_base) if i == j]

accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ', accuracy * 100)

Time taken in seconds:  14.385432004928589
Viterbi Algorithm Accuracy:  69.92481203007519


In [13]:
tagged_seq

[('щодо', ' '),
 ('дсту', '!'),
 ('дуже', ' '),
 ('важко', ' '),
 ('його', ' '),
 ('притягти', ' '),
 ('до', ' '),
 ('цієї', ' '),
 ('статті', ' '),
 ('член', ' '),
 ('правління', ' '),
 ('пятнажко', '!'),
 ('запитав', ' '),
 ('у', ' '),
 ('сидячого', '!'),
 ('прибулий', ' '),
 ('NUM', ' '),
 ('тепер', ' '),
 ('ще', ' '),
 ('декілька', ' '),
 ('разів', ' '),
 ('повторіть', '.'),
 ('свою', ' '),
 ('брехню', ','),
 ('про', ' '),
 ('відсутність', ' '),
 ('цього', ' '),
 ('набряку', '!'),
 ('у', ' '),
 ('моєму', ' '),
 ('лівому', '!'),
 ('боці', ' '),
 ('і', ' '),
 ('думаю', ','),
 ('я', ' '),
 ('можу', ' '),
 ('вважатись', '!'),
 ('вилікуваним', '!'),
 ('щоб', ' '),
 ('ласувати', '!'),
 ('сосисками', ','),
 ('і', ' '),
 ('гречаними', '!'),
 ('тортами', '!'),
 ('»', ' '),
 ('у', ' '),
 ('цьому', ' '),
 ('підвалі', ' '),
 ('вже', ' '),
 ('давно', ' '),
 ('живе', ' '),
 ('інша', ' '),
 ('людина', ' '),
 ('і', ' '),
 ('взагалі', ' '),
 ('не', ' '),
 ('буває', ' '),
 ('так', ' '),
 ('щоб', ' '

In [None]:
# Code to test all the test sentences
# (takes alot of time to run s0 we won't run it here)
# tagging the test sentences()
test_tagged_words = [tup for sent in test_set for tup in sent]
test_untagged_words = [tup[0] for sent in test_set for tup in sent]


print('Starting testing')
start = time.time()
tagged_seq = Viterbi(test_untagged_words)
end = time.time()
difference = end-start

print("Time taken in seconds: ", difference)

# accuracy
check = [i for i, j in zip(test_tagged_words, test_untagged_words) if i == j]

accuracy = len(check) / len(tagged_seq)
print('Viterbi Algorithm Accuracy: ', accuracy * 100)


Starting testing
