<a href="https://colab.research.google.com/github/aithaprasad/NLP_Kreyol_Segmentation/blob/main/Kreyol_With_HMM_Supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv
with open('kreyol_segmentation_train.tsv', encoding="utf-8") as file:
  f = csv.reader(file, delimiter="\t")
  train_data = []
  for line in f:
    train_data.append(line)

In [2]:
train_data
print(len(train_data))

12812


In [3]:
res = []
for word_division in train_data:
  split_word = word_division[1].split('-')
  temp = []
  for letter in split_word:
    temp.append((letter[0], 'B'))
    if len(letter) > 1:
      for item in letter[1:]:
        temp.append((item, 'I'))
  res.append(temp)

In [4]:
res[0]

[('k', 'B'),
 ('o', 'B'),
 ('n', 'I'),
 ('s', 'B'),
 ('i', 'B'),
 ('l', 'B'),
 ('t', 'B'),
 ('a', 'B'),
 ('n', 'I')]

In [5]:
letter_tag_count = dict()
B_count, I_count = 0, 0 
unique_letters = []
for train in res:
  for letter_tag in train:
    letter, tag = letter_tag[0], letter_tag[1]
    
    if tag == 'B': B_count += 1
    else: I_count += 1

    if letter not in letter_tag_count:
      letter_tag_count[letter] = {tag : 1}
      unique_letters.append(letter)
    else:
      if tag not in letter_tag_count[letter]: letter_tag_count[letter][tag] = 1
      else: letter_tag_count[letter][tag] += 1

In [6]:
print(len(letter_tag_count))
print(len(unique_letters))
print(B_count)
print(I_count)

51
51
79696
7608


In [7]:
total_count_per_tag = {'B' : B_count, 'I': I_count}
for letter in unique_letters:
  new_dict = {'B' : 0, 'I': 0}
  tags = ['B', 'I']
  for tag in tags:
    if tag in letter_tag_count[letter]:
      new_dict[tag] = letter_tag_count[letter][tag] / total_count_per_tag[tag] 
  letter_tag_count[letter] = new_dict

In [8]:
word_context_tag_count = {'B' : {'B': 0, 'I': 0}, 'I' : {'B': 0, 'I': 0}}

for i in range(1, len(res)):
  for j in range(len(res[i])):
    if j == 0: continue
    letter, tag = res[i][j][0], res[i][j][1]
    prev_letter, prev_tag = res[i][j - 1][0], res[i][j - 1][1]
    word_context_tag_count[prev_tag][tag] += 1

In [9]:
word_context_tag_count

{'B': {'B': 61975, 'I': 7598}, 'I': {'B': 4903, 'I': 8}}

In [10]:
for tag in ['B', 'I']:
  total_tag_sum = sum(word_context_tag_count[tag].values())
  for next_tag in ['B', 'I']:
    word_context_tag_count[tag][next_tag] = (1 + word_context_tag_count[tag][next_tag]) / total_tag_sum

In [11]:
word_context_tag_count

{'B': {'B': 0.8908053411524586, 'I': 0.10922340563149498},
 'I': {'B': 0.9985746283852576, 'I': 0.0018326206475259622}}

In [12]:
def viterbi(sentence):
  state = []
  tags = ['B', 'I']
  for key, word in enumerate(sentence):
    p = []
    for tag in tags:
      emission_p = 0
      if key == 0: transmission_prob = word_context_tag_count['B'][tag]
      else: transmission_prob = word_context_tag_count[state[-1]][tag]
      if word in letter_tag_count.keys(): emission_p = letter_tag_count[word][tag]
  
      state_prop = emission_p * transmission_prob
      p.append(state_prop)
    max_value=max(p)
    value=tags[p.index(max_value)]
    state.append(value)
  return list(zip(sentence, state)), state

In [13]:
viterbi('dechouke')

([('d', 'B'),
  ('e', 'B'),
  ('c', 'B'),
  ('h', 'I'),
  ('o', 'B'),
  ('u', 'I'),
  ('k', 'B'),
  ('e', 'B')],
 ['B', 'B', 'B', 'I', 'B', 'I', 'B', 'B'])

In [24]:
def get_graphene(word):
  b_i_tags = viterbi(word)[1]
  ans = [word[0]]
  for i in range(1, len(word)):
    if b_i_tags[i] == 'B': ans.append('-')
    ans.append(word[i])
  return "".join(ans)

In [25]:
get_graphene('dechouke')

'd-e-ch-ou-k-e'

In [33]:
def get_all_graphenes(file):
  all_graphenes = []
  test_file = open(file, 'r')
  for line in test_file:
    word = line.rstrip("\n")
    all_graphenes.append([word, get_graphene(word)])
  return all_graphenes

In [34]:
graphenes_test = get_all_graphenes('kreyol_test.txt')

In [36]:
graphenes_test[0]

['Oradye', 'O-r-a-d-y-e']

In [41]:
import csv

with open('super_hmm.tsv', 'wt') as out_file:
    tsv_writer = csv.writer(out_file, delimiter='\t')
    for graphene in graphenes_test:
      tsv_writer.writerow(graphene)

In [43]:
def verify_results(file, num_of_lines):
  test_file = open(file, 'r')
  total = 0
  for line in test_file:
    total += 1
  return total == num_of_lines

In [44]:
verify_results('super_hmm.tsv', 1427)

True