In [67]:
import pandas as pd
import numpy as np

from collections import defaultdict
import string

In [68]:
punct = set(string.punctuation)

noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
verb_suffix = ["ate", "ify", "ise", "ize"]
adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
adv_suffix = ["ward", "wards", "wise"]

In [69]:
def assign_unk(token):

  if any(char.isdigit() for char in token):
    return '--unk_digit--'

  elif any(char in punct for char in token):
    return '--unk_punct--'

  elif any(char.isupper() for char in token):
    return '--unk__upper--'

  elif any(token.endswith(suffix) for suffix in noun_suffix):
    return '--unk_noun--'

  elif any(token.endswith(suffix) for suffix in verb_suffix):
    return '--unk_verb--'

  elif any(token.endswith(suffix) for suffix in adj_suffix):
    return '--unk_adj--'

  elif any(token.endswith(suffix) for suffix in adv_suffix):
    return '--unk_adv--'

  return '--unk--'

In [70]:
def preprocess(vocab, file_path):

  original, preprocessed = [], []

  with open(file_path, 'r') as f:

    for idx, word in enumerate(f):

      if not word.split():
        original.append(word.strip())
        word = '--n--'
        preprocessed.append(word)

      elif word.strip() not in vocab:
        original.append(word.strip())
        word = assign_unk(word.strip())
        preprocessed.append(word)

      else:
        original.append(word.strip())
        preprocessed.append(word.strip())

  return original, preprocessed

In [71]:
def get_word_tag(vocab, word_tag):

  if not word_tag.split():
    word = '--n--'
    tag = '--s--'

    return word, tag

  else:
    word, tag = word_tag.split()
    if word not in vocab:
      word = assign_unk(word)

    return word, tag

  return None

In [72]:
with open('/content/WSJ_02-21.pos', 'r') as f:
  training_corpus = f.readlines()

training_corpus[:5]

['In\tIN\n', 'an\tDT\n', 'Oct.\tNNP\n', '19\tCD\n', 'review\tNN\n']

In [73]:
with open('/content/hmm_vocab.txt', 'r') as f:
  vocab_l = f.read().split('\n')

In [74]:
# vocab_l[:50], vocab_l[-50:]

In [75]:
vocab = {}

for i, word in enumerate(sorted(vocab_l)):
  vocab[word] = i

In [76]:
# vocab

In [77]:
with open('/content/WSJ_24.pos', 'r') as f:
  test_corpus = f.readlines()

In [78]:
# test_corpus

In [79]:
# with open('/content/test.words', 'r') as f:
#   for idx, word in enumerate(f):
#     print(' | ', word, ' | ', word.split())

In [80]:
_, preprocessed = preprocess(vocab, '/content/test.words')

In [81]:
# preprocessed

In [82]:
for word_tag in training_corpus:
  word = word_tag.split()[0]
  tag = word_tag.split()[1]
  print(word)
  print(tag)
  break

In
IN


In [83]:
def create_dictionaries(vocab, training_corpus):
  tag_counts = defaultdict(int)
  transition_counts = defaultdict(int)
  emission_counts = defaultdict(int)

  prev_tag = '--s--'
  iter = 0

  for word_tag in training_corpus:
    iter += 1
    if iter % 10000 == 0:
      print(f'Words traversed = {iter}')

    word, tag = get_word_tag(vocab, word_tag)

    transition_counts[(prev_tag, tag)] += 1
    emission_counts[(tag, word)] += 1
    tag_counts[tag] += 1

    prev_tag = tag

  return transition_counts, emission_counts, tag_counts

In [84]:
transition_counts, emission_counts, tag_counts = create_dictionaries(training_corpus, vocab)

AttributeError: 'dict' object has no attribute 'split'

In [None]:
for word_tag in training_corpus:
  word, tag = word_tag.split()[0], word_tag.split()[1]
  print(word)
  print(tag)
  break