# Hidden Markov Model on Conll (2003) dataset. 

In [1]:
from seqlearn.hmm import MultinomialHMM

In [2]:
import nltk
from sklearn.metrics import classification_report, confusion_matrix
import sklearn


In [3]:
import sys
def read_data(filename):
    X_train, y_train, lengths  = [], [], []
    count = 0
    with open(filename) as f:
        count_words = 0
        for line in f:
            count += 1
            if count < 3:
                continue                 
            words = line.strip().split(" ")
            if len(words) < 2:
                lengths.append(count_words)
                count_words = 0
            elif len(words) == 4:
                X_train.append(words[0])
                count_words +=1                
                y_train.append(words[3])
    #             print(tmp)
            else:
                print(count)
                sys.exit(0)
    return X_train, y_train, lengths



In [4]:
X_train, y_train, lengths = read_data("eng.train")

In [5]:
len(X_train)

204566

In [6]:
len(y_train)

204566

In [7]:
len(lengths)

14985

In [8]:
def dumb_tokenizer(t):
    return t.split()

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(tokenizer=dumb_tokenizer, stop_words='english',max_df=0.9, min_df=1, binary=True)
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape

(204566, 20738)

In [10]:
hmm_model = MultinomialHMM(alpha=0.01, decode='viterbi')

In [12]:
hmm_model.fit(X_train_counts, y_train, lengths)

MultinomialHMM(alpha=0.01, decode='viterbi')

In [13]:
X_test, y_test, lengths_test = read_data("eng.testb")

In [14]:
len(X_test)

46665

In [15]:
X_test_counts = count_vect.transform(X_test)

In [16]:
X_test_counts.shape

(46665, 20738)

In [17]:
y_pred = hmm_model.predict(X_test_counts)

In [18]:
len(y_pred)

46665

In [19]:
len(y_test)

46665

In [23]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from collections import Counter
from itertools import chain
from sklearn.preprocessing import LabelBinarizer
def F1_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))

    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}

    return f1_score(y_true, y_pred, labels = ['B-PER','I-PER','B-LOC','I-LOC','B-ORG','I-ORG','B-MISC','I-MISC'], average='weighted')

In [24]:
print(F1_classification_report(y_test, y_pred))

0.6181451002789499


#### Conclusion: Thus, we see that performing HMM classification on Conll dataset gives us the accuracy of 61.81%. The reason behind getting such low accuracy is that HMM does not use any features or postags for understanding the context, while classifying the sentences. Our next approach is to classify the same dataset using pycrfsuite which is conditional random field classification, and improve the accuracy.  