In [1]:
from collections import defaultdict

In [2]:
def flat2sent(sents, end = "<S>"):
  """Group a flat (1d) token sequence into a list of sentences (2d).

  A sentence is closed as soon as a token whose first field equals ``end``
  is seen; that token is kept as the last element of the sentence.
  Tokens that follow the final ``end`` marker are not returned.

  Args:
      sents: 1d iterable of tokens; each token is indexable and its
          element 0 is compared against ``end``.
      end (str, optional): end-of-sentence marker. Defaults to "<S>".

  Returns:
      list: list of sentences, each sentence being a list of tokens.
  """
  grouped = []
  current = []
  for token in sents:
    current.append(token)
    if token[0] != end:
      continue
    # marker reached: store the finished sentence and start a new one
    grouped.append(current)
    current = []
  return grouped

In [3]:
# Load the train/test token files and group the tokens into sentences.
def read_tokens(path):
    """Read a tab-separated file and return one tuple of fields per line.

    Args:
        path (str): path of the file to read.

    Returns:
        list: one tuple per line, holding the stripped line split on tabs.
    """
    # the context manager closes the handle even if reading fails
    with open(path) as handle:
        return [tuple(line.strip().split("\t")) for line in handle]

tokens_train = read_tokens('train.tsv')
tokens_test = read_tokens('test.tsv')
sentTrain = flat2sent(tokens_train)
sentTest = flat2sent(tokens_test)


In [4]:
# Counts keyed by the phrase of preceding words: phrase -> {tag: count, "occurence": total}
LAGGED_PRECEDE_MUTATE = defaultdict(lambda: defaultdict(int))
def lag_to_create_n_grams(sent: list, ngram=1):
    """Count which tag follows each preceding phrase of 1..``ngram`` words.

    The sentence is front-padded with ``ngram`` "<IGNORE>" tokens; any
    context phrase containing "<IGNORE>" is skipped. Counts are added to
    the module-level ``LAGGED_PRECEDE_MUTATE`` table.

    Args:
        sent (list): list of tokens; element 0 is the word, element 1 the tag.
        ngram (int, optional): longest context length in words. Defaults to 1.
    """
    padded = [("<IGNORE>", "N")] * ngram + sent  # padding in front
    for pos in range(ngram, len(padded)):
        for width in range(1, ngram + 1):
            context = ' '.join(tok[0] for tok in padded[pos - width: pos])
            if "<IGNORE>" in context:
                # the window reaches into the padding
                continue
            tag = padded[pos][1]
            counts = LAGGED_PRECEDE_MUTATE[context]
            counts[tag] += 1
            counts["occurence"] += 1


In [5]:
# Example: "The weather is nice" tagged D N V A with ngram = 2 gives
# {The: {N: 1}, weather: {V: 1}, The weather: {V: 1}, is: {A: 1}, weather is: {A: 1}}
# (every inner dict additionally holds an "occurence" total).

# Start from an empty table so that re-running this cell does not add the
# training counts a second time.
LAGGED_PRECEDE_MUTATE.clear()

# Collect counts for one- and two-word contexts over the training sentences.
for sent in sentTrain:
    lag_to_create_n_grams(sent,2)


In [6]:
# helper that turns a nested count table into a conditional probability
def prob_from_count_dict(count_dict,k1,k2):
    """Return count_dict[k1][k2] / count_dict[k1]["occurence"], or 0 if unknown.

    The lookup is read-only: missing keys are never inserted, so querying a
    ``defaultdict`` count table does not grow it with zero-count entries.

    Args:
        count_dict: mapping k1 -> {k2: count, "occurence": total}.
        k1: outer key (the conditioning event).
        k2: inner key (the event whose probability is wanted).

    Returns:
        The relative frequency of ``k2`` under ``k1``; 0 when ``k1`` is
        missing or its "occurence" total is zero.
    """
    row = count_dict.get(k1)
    if not row:
        return 0
    total = row.get("occurence", 0)
    if not total:
        # same result the ZeroDivisionError fallback used to give
        return 0
    return row.get(k2, 0) / total

In [7]:
# Emission counts: tag -> {word: count, "occurence": total tokens carrying that tag}
TAG_TO_WORD_COUNT = defaultdict(lambda: defaultdict(int))
# How often each tag opens a sentence; the key "all" holds the number of sentences seen
INITIAL_DISTRIBUTION_COUNT = defaultdict(int)
def build_emission(sent: list):
    """Add one sentence to the emission and initial-tag count tables.

    For every token the (tag, word) pair is counted in ``TAG_TO_WORD_COUNT``;
    the tag of the first token is also counted in
    ``INITIAL_DISTRIBUTION_COUNT``.

    Args:
        sent (list): list of tokens; element 0 is the word, element 1 the tag.
    """
    for position, token in enumerate(sent):
        word, tag = token[0], token[1]

        per_tag = TAG_TO_WORD_COUNT[tag]
        per_tag[word] += 1
        per_tag["occurence"] += 1

        if position != 0:
            continue
        # only the sentence-initial token feeds the start distribution
        INITIAL_DISTRIBUTION_COUNT[tag] += 1
        INITIAL_DISTRIBUTION_COUNT["all"] += 1
# Accumulate the emission and initial-tag counts over every training sentence.
for sent in sentTrain:
    build_emission(sent)


In [8]:
def argmax(Vit_matrix, ind, phrase):
    """Find the best-scoring tag at position ``ind - 1`` given ``phrase``.

    For each tag the score is ``Vit_matrix[tag][ind-1]`` multiplied by the
    relative frequency of that tag after ``phrase`` in
    ``LAGGED_PRECEDE_MUTATE``. On ties the earliest tag in the list wins.

    Args:
        Vit_matrix: nested mapping indexed as ``Vit_matrix[tag][position]``.
        ind (int): current position; column ``ind - 1`` is read.
        phrase (str): preceding phrase used as key into the count table.

    Returns:
        tuple: ``(best_score, best_tag)``; ``(-1, None)`` if no score
        exceeds -1.
    """
    best_score, best_tag = -1, None
    for candidate in ("N", "S", "U", "T", "H"):
        previous = Vit_matrix[candidate][ind - 1]
        score = previous * prob_from_count_dict(LAGGED_PRECEDE_MUTATE, phrase, candidate)
        if score > best_score:
            best_score, best_tag = score, candidate
    return best_score, best_tag

def viterbi(sent):
    """Predict one tag per token of ``sent`` with a Viterbi-style recursion.

    Position 0 is scored as (initial tag frequency) * (emission frequency).
    Every later position multiplies the best score carried over from the
    previous column (see ``argmax``) by the emission frequency of the
    current word under each tag. The final tag is the best-scoring one in
    the last column; earlier tags are read back from the stored
    back-pointers.

    Args:
        sent: list of tokens; element 0 of each token is the word.

    Returns:
        list: predicted tags, one per token; ``[]`` for an empty sentence.

    Raises:
        ZeroDivisionError: if ``INITIAL_DISTRIBUTION_COUNT["all"]`` is 0,
            i.e. no initial-tag counts have been collected.
    """
    if not sent:
        # nothing to tag; indexing sent[0] below would fail
        return []

    tags = ("N", "S", "U", "T", "H")
    n_tokens = len(sent)
    best_tags = defaultdict(lambda: defaultdict(str))
    Vit_matrix = defaultdict(lambda: defaultdict(float)) #V[state][position]

    for t in tags:
        # initial probability distribution * emission
        start_state_prob = INITIAL_DISTRIBUTION_COUNT[t]/INITIAL_DISTRIBUTION_COUNT["all"]
        emission = prob_from_count_dict(TAG_TO_WORD_COUNT, t, sent[0][0])
        Vit_matrix[t][0] = start_state_prob * emission

    for i in range(1, n_tokens):
        phrase = sent[i-1][0] #TODO: dynamically choose the phrase: could be 2 words preceding or 1 word preceding
        # The carried-over score and back-pointer depend only on the previous
        # column and the preceding word, not on the current tag, so they are
        # computed once per position instead of once per tag.
        val, tag = argmax(Vit_matrix, i, phrase)
        for t in tags:
            emission = prob_from_count_dict(TAG_TO_WORD_COUNT, t, sent[i][0])
            Vit_matrix[t][i] = val*emission # carried-over score * emission probability
            best_tags[t][i] = tag

    # pick the best tag in the last column (earliest tag wins ties)
    best_ending = None
    best_max = -1
    for tag in tags:
        if Vit_matrix[tag][n_tokens - 1] > best_max:
            best_max = Vit_matrix[tag][n_tokens - 1]
            best_ending = tag

    # follow the back-pointers from the last position to the first
    seq = [best_ending]
    for i in reversed(range(1, n_tokens)):
        seq.append(best_tags[seq[-1]][i])
    return seq[::-1]

    


In [9]:

def eval_scratch(sent):
    """Count how many tokens of ``sent`` get their gold tag from ``viterbi``.

    Args:
        sent: list of tokens; element 1 of each token is the gold tag.

    Returns:
        int: number of positions where the predicted tag equals the gold
        tag (a count, not a ratio).
    """
    predicted = viterbi(sent)
    return sum(int(token[1] == predicted[pos]) for pos, token in enumerate(sent))
#TAG_TO_WORD_COUNT["N"]["cosaint"]/TAG_TO_WORD_COUNT["N"]["occurence"]
#print( prob_from_count_dict(TAG_TO_WORD_COUNT, "T", "cosaint") )
#LAGGED_PRECEDE_MUTATE



In [12]:
#sum(eval_scratch(sent)for sent in sentTrain)/sum(len(sent) for sent in sentTrain)

In [11]:
# Token-level accuracy on the test set: correctly tagged tokens / all tokens
correct_tags = sum(eval_scratch(sent) for sent in sentTest)
total_tags = sum(len(sent) for sent in sentTest)
correct_tags / total_tags

0.882078