In [3]:
from collections import defaultdict, Counter

In [4]:
fn = r"train.tsv"
handle  = open(fn, "r")
tokens = list()

for line in handle:
  entry = line.strip().split("\t")
  tokens.append( ( entry[0], entry[1]) )

In [5]:
#frequencies of all the mutations
frequency = Counter([t for (w,t) in tokens])
frequency

Counter({'N': 8584022, 'S': 973469, 'U': 327110, 'H': 80504, 'T': 34895})

In [6]:
def flat2sent(sents, end = "<S>"):
  """This function turns flat array (1d) of sentences into list of sentences (2d).

    End indicates start of sentence.
  Args:
      sents (): 1d array of sentences
      end (str, optional): start of sentence symbol. Defaults to "<S>".

  Returns:
      _type_: 2d array (list of sentences)
  """
  list_of_sent = list()
  sent = []
  for char in sents:
    sent.append(char)
    if char[0] == end:
      list_of_sent.append(sent)
      sent = []
  return list_of_sent


In [7]:
#train-split for sentences
train_index = int([ind for (ind, token) in enumerate(tokens) if token[0] == "<S>"][-1]*0.8)
sentTrain = flat2sent ( tokens[:train_index] )
sentTest = flat2sent( tokens[train_index:] )


In [8]:
# map of n preceding words to mutations
LAGGED_PRECEDE_MUTATE = defaultdict(lambda: defaultdict(int))
def lag_to_create_n_grams(sent: list, ngram=1):
    sent = [("<IGNORE>", "N") for _ in range(ngram)] + sent #padding in front
    for ind, piece in enumerate(sent):
        if ind < ngram:
            continue
        for n in range(1,ngram+1):

            prev_phrase = ' '.join( [w[0] for w in sent[ind-n: ind]] )
            if "<IGNORE>" in prev_phrase:
                continue
            tag = piece[1]
            LAGGED_PRECEDE_MUTATE[prev_phrase][tag] += 1
            LAGGED_PRECEDE_MUTATE[prev_phrase]["occurence"] += 1
            #if (prev_phrase == ""):
            #    print(ind, n, prev_phrase, tag, piece)


In [9]:
# example: The weather is nice with tag D N V A with ngram = 2
# {The: {N: 1}, weather: {V: 1}, The weather: {V : 1}, is: {A: 1}, weather is: {A: 1}
#lag_to_create_n_grams([("The", "D"), ("weather", "N"), ("is", "V"), ("nice", "A")], ngram=2)

for sent in sentTrain:
    lag_to_create_n_grams(sent,3)
#LAGGED_PRECEDE_MUTATE


In [10]:
# building the transition matrix
def prob_tag_givenWord(t,w):
    try:
        return LAGGED_PRECEDE_MUTATE[w][t]/LAGGED_PRECEDE_MUTATE[w]["occurence"]
    except ZeroDivisionError:
        return 0


In [11]:
prob_tag_givenWord("T","bean")
print(LAGGED_PRECEDE_MUTATE["bean"]["occurence"])

2343


In [12]:
train_index

7999999

In [13]:
TAG_TO_WORD =defaultdict(lambda: defaultdict(int))
INITIAL_DISTRIBUTION_PROB = defaultdict(int)
#if we were to generate a mutation, how many of it is word w_i
def build_emission(sent: list):
    for ind, piece in enumerate(sent):

        word = piece[0]
        tag = piece[1]

        TAG_TO_WORD[tag][word] += 1
        TAG_TO_WORD[tag]["occurence"] += 1
        if ind == 0:
            INITIAL_DISTRIBUTION_PROB[tag] +=1


        #if (prev_phrase == ""):
        #    print(ind, n, prev_phrase, tag, piece)
for sent in sentTrain:
    build_emission(sent)
    


In [14]:
def get_initial_distribution_prob(initial_count):
    total = sum(initial_count[i] for i in initial_count)
    print(total)
    return dict([(k,initial_count[k]/total) for k in initial_count])


In [15]:
INITIAL_DISTRIBUTION_PROB = get_initial_distribution_prob(
    INITIAL_DISTRIBUTION_PROB)


316834


In [28]:
def argmax(Vit_matrix, tag, ind, phrase):
    ans = -1
    for t in ["N", "S", "U", "T", "H"]:
        temp = Vit_matrix[t][ind-1]*LAGGED_PRECEDE_MUTATE[phrase][t]
        if temp > ans:
            ans = temp
        print(t, end=' ')
    return ans

def viterbi(sent):
    Vit_matrix = defaultdict(lambda: defaultdict(float)) #V[state][word]
    for t in ["N","S","U","T","H"]:
        # initial probability distribution * emission
        print(INITIAL_DISTRIBUTION_PROB[t])
        print(TAG_TO_WORD[t][sent[0]])
        Vit_matrix[t][0] = INITIAL_DISTRIBUTION_PROB[t] * TAG_TO_WORD[t][sent[0]]
    for i in range(1, len(sent)):
        for t in ["N", "S", "U", "T", "H"]:
            phrase = sent[i-1] #TODO: dynamically chose the prhase could be 2 word precede, 1 word precede
            val = argmax(Vit_matrix, t, i, phrase)
            Vit_matrix[t][i] = val*TAG_TO_WORD[t][sent[i]] # transimition_matrix *emission probability
    return Vit_matrix


In [29]:
viterbi(sentTrain[241])
sentTrain[241]

0.901251128351124
0
0.09597770441303648
0
0.0016728002676480427
0
7.259321916208487e-05
0
0.0010257737490294602
0
N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H N S U T H 

[('cosaint', 'N'), ('uiscí', 'N'), ('nádúrtha', 'N'), ('.', 'N'), ('<S>', 'N')]

In [None]:
# argument sent is a list of [token,label] pairs; return number of correctly predicted labels
def predict_from_scratch(sent, model=None):
  correct = 0
  for token in sent:
    guess = random.choice(['S','U','T','H','N'])
    if guess == token[1]:
      correct += 1
  return correct

In [None]:
# argument sent is a list of [token,label] pairs; return number of correctly predicted labels
def predict_anything_goes(sent):
  correct = 0
  for token in sent:
    guess = 'N'
    if guess == token[1]:
      correct += 1
  return correct

In [None]:
#sentence is a list of tuples(x,y)
def evaluate():
  total = 0
  correct_from_scratch = 0
  correct_anything_goes = 0
  testfile = open('test.tsv', 'r')
  sentence = []
  for line in testfile:
    total += 1
    pieces = line.rstrip("\n").split("\t")
    if pieces[0]=='<S>':
      correct_from_scratch += predict_from_scratch(sentence)
      correct_anything_goes += predict_anything_goes(sentence)
      sentence = []
    else:
      sentence.append(pieces)
  correct_from_scratch += predict_from_scratch(sentence)
  correct_anything_goes += predict_anything_goes(sentence)
  return (correct_from_scratch/total, correct_anything_goes/total)

In [None]:
evaluate()

(0.191809, 0.819613)