In [None]:
import numpy as np
import pandas as pd
from reformat import *
from architecture.WordSegPreProcessing import *

# Training file used throughout the notebook.
fn = "train.tsv"
# Read the file, then unpack the two values file_to_table returns; they are
# used below as the inputs (x) and labels (y) of the train/test split.
raw_contents = read_file(fn)
x, y = file_to_table(raw_contents)


## Prepping Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the examples for evaluation. The split is seeded so that
# "Restart Kernel -> Run All" produces the same partition (and therefore
# comparable F1 scores) on every run; without random_state the shuffle, and
# every number reported below, changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)
# The preprocessor is constructed from the training portion only.
preprocessor = WordSegPreProcessing(X_train, y_train, segment_to_tag)

In [None]:
# Replace the raw training split with the versions stored on the preprocessor.
X_train = preprocessor.x
y_train = preprocessor.y
# Run the held-out split through the preprocessor's extract_pairs.
# NOTE: this rebinds X_test / y_test, so the cell should be run once per split.
test_pairs = preprocessor.extract_pairs(X_test, y_test)
X_test, y_test = test_pairs


## Supervised NGramTagger with Backoff

In [None]:
from architecture.NgramSupervisedTagger import NGramSupervisedTagger

# Tagger built on the training pairs with ngram_choice=2.
ng = NGramSupervisedTagger(X_train, y_train, ngram_choice=2)
ng.create_n_gram_tagger()
# Per-tag F1 on the held-out pairs; display only the score for the "I" tag.
ngram_f1_by_tag = ng.f1_by_tags(X_test, y_test)
ngram_f1_by_tag["I"]

## Supervised HMM

In [None]:
from architecture.HMMSupervisedTagger import HMMSupervisedTagger

# Supervised HMM trained directly on the training pairs.
hmm = HMMSupervisedTagger(X_train, y_train)
hmm.train()
# Per-tag F1 on the held-out pairs; display only the score for the "I" tag.
hmm_f1_by_tag = hmm.f1_by_tags(X_test, y_test)
hmm_f1_by_tag["I"]


This is the same supervised HMM tagger as above, but instead of using only the provided characters as observations, it uses some manually engineered features (probably not the best ones). The feature definitions can be found in `architecture.WordSegPreProcessing.apply_features`.

In [None]:
from architecture.HMMSupervisedTagger import HMMSupervisedTagger

# Train the HMM on the feature-transformed training inputs.
train_features = preprocessor.generate_features(X_train)
hmm2 = HMMSupervisedTagger(train_features, y_train)
hmm2.train()
# Apply the same feature transformation to the held-out inputs before scoring;
# display only the F1 score for the "I" tag.
test_features = preprocessor.generate_features(X_test)
hmm2.f1_by_tags(test_features, y_test)["I"]


In [None]:
# Spot check: decode one held-out example (index 3) with the feature-based HMM.
sample_features = preprocessor.apply_features(X_test[3])
hmm2.tagger.best_path_simple(sample_features)

## Unsupervised

In [None]:
from architecture.utils import *

# Reload the full dataset (x / y are rebound here), then use the first 650
# examples as the labelled "feeder" set.
x, y = file_to_table(read_file(fn))
feed_inputs, feed_labels = x[:650], y[:650]
feedX, feedY = preprocessor.extract_pairs(feed_inputs, feed_labels)


In [None]:
# Everything from example 650 onwards (i.e. not in the feeder set) is the test set.
remaining_inputs, remaining_labels = x[650:], y[650:]
testX2, testY2 = preprocessor.extract_pairs(remaining_inputs, remaining_labels)


In [None]:
# Convert every sequence through the preprocessor's let2index (inputs) and
# tag2index (labels) helpers.
# NOTE: the four names are rebound in place, so re-running this cell would
# apply the conversion to already-converted data.
feedX = [preprocessor.let2index(letters) for letters in feedX]
feedY = [preprocessor.tag2index(tags) for tags in feedY]
testX2 = [preprocessor.let2index(letters) for letters in testX2]
testY2 = [preprocessor.tag2index(tags) for tags in testY2]


In [None]:
# Initialise the transition/emission matrices from a supervised HMM that is
# trained on the feeder set.

import nltk
import numpy as np

n_tags = len(preprocessor.index_tag)
n_symbols = len(preprocessor.index_vocab)

# Entries stay at zero wherever the supervised tagger reports no sample.
init_tmat = np.zeros((n_tags, n_tags))
init_emission = np.zeros((n_tags, n_symbols))

trainer = nltk.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(tuple_xy4nltk(feedX, feedY))

# Copy the fitted probabilities out of the tagger's internal (underscore-
# prefixed) distributions. States and samples are used directly as row/column
# indices, which relies on feedX / feedY already being index-encoded.
for state in tagger._transitions.keys():
    transition_dist = tagger._transitions[state]
    for next_state in transition_dist.samples():
        init_tmat[state][next_state] = transition_dist.prob(next_state)
    output_dist = tagger._outputs[state]
    for symbol in output_dist.samples():
        init_emission[state][symbol] = output_dist.prob(symbol)

# Start distribution: uniform over two states.
# NOTE(review): the original remark on this line was "because always start
# with B", which a uniform [0.5, 0.5] does not encode - confirm whether the
# uniform prior is intentional (the prediction loop later forces the first
# tag to "B" itself).
init_state_distrib = np.array([0.5, 0.5])

#init_tmat


In [None]:
# Apply a 0.05 smoothing shift: in the fitted matrix "I" cannot transition to
# itself, but heuristically we know that is not true. Move 0.05 of probability
# from the I -> B transition to the I -> I transition.
# NOTE: the update is in place (+= / -=), so re-running this cell shifts the
# probabilities again.
i_idx = preprocessor.index_tag["I"]
b_idx = preprocessor.index_tag["B"]
init_tmat[i_idx][i_idx] += 0.05
init_tmat[i_idx][b_idx] -= 0.05
print(init_tmat)


In [None]:
from architecture.Unsupervised import UnSupervised
# Build the unsupervised model from the initial transition matrix, emission
# matrix and start-state distribution prepared in the cells above.
unsupervised = UnSupervised(init_tmat, init_emission, init_state_distrib)


In [None]:
# For every test sequence: run Baum-Welch on that sequence (second argument
# 10 - presumably the iteration count; confirm against UnSupervised.baum_welch),
# then decode it with Viterbi using the returned parameters.
y_predicted = []
for sequence in testX2:
    tmat, emission, start, end = unsupervised.baum_welch(sequence, 10)
    # Viterbi receives the start probabilities as an {index: probability} dict.
    start_lookup = dict(enumerate(start))
    resultant = unsupervised.viterbi(sequence, emission, tmat, start_lookup)
    # Overwrite the first decoded tag with "B", whatever the decoder chose.
    resultant[0] = preprocessor.index_tag["B"]
    y_predicted.append(resultant)


In [None]:
# Per-tag F1 of the unsupervised predictions against the test labels; the
# labels are index-encoded here, so look up the score via the index of "I".
unsupervised_f1_by_tag = f1_by_tags(y_predicted, testY2)
unsupervised_f1_by_tag[preprocessor.index_tag["I"]]
