*The following is a referenced implementation of a CRF (conditional random fields) based discriminative markov model for Part of Speech tagging*<br/>
*Several improvements are possible: word preprocessing, handling OOV words etc.*<br/>
*The current state of the art transformers do POS tagging with an accuracy of ~98%*<br/>
*I was able to acheive a decent matching (96.7% on 20% held out test corpus) as well with this statistical model*<br/>

In [2]:
import nltk

In [3]:
dataset = nltk.corpus.treebank
# tagged_sents = list(dataset.tagged_sents(tagset="universal"))
tagged_sents = list(dataset.tagged_sents())

In [4]:
# build the vocab
word_tag_list = [pair for sent in tagged_sents for pair in sent]
print(f"The word_tag_list is: {len(word_tag_list)} long")
vocab = list(set([pair[0] for pair in word_tag_list]))
vocab_size = len(vocab)
tags = list(set([pair[1] for pair in word_tag_list]))
num_tags = len(tags)
word_to_index = {word:i for i, word in enumerate(vocab)}
index_to_word = {i:word for i, word in enumerate(vocab)}
tag_to_index = {tag:i for i, tag in enumerate(tags)}
tag_to_index = {i:tag for tag, i in enumerate(tags)}
print(f"Vocab_size: {vocab_size}, the number of tags: {num_tags}")

The word_tag_list is: 100676 long
Vocab_size: 12408, the number of tags: 46


**[reference](https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system)**

In [5]:
print(word_to_index["will"], index_to_word[word_to_index["will"]])

9093 will


In [6]:
print(tags)

['WP$', '-LRB-', 'RBR', 'NNS', 'IN', '$', 'VBD', '-NONE-', 'CC', 'PRP$', 'JJ', 'RBS', 'SYM', 'VBZ', 'NN', 'RB', 'WDT', ',', 'JJS', "''", 'TO', ':', '#', 'CD', 'VBN', 'DT', 'LS', 'JJR', 'MD', 'VBG', 'RP', '-RRB-', 'PRP', 'EX', 'VB', 'POS', '``', 'WP', '.', 'PDT', 'UH', 'FW', 'NNPS', 'VBP', 'WRB', 'NNP']


In [7]:
n = len(tagged_sents)
import random
random.shuffle(tagged_sents)
train_data = tagged_sents[:int(0.8*n)]
test_data = tagged_sents[int(0.8*n):]
print(len(train_data), len(test_data))

3131 783


In [8]:
train_data[0]

[('I', 'PRP'),
 ('would', 'MD'),
 ('be', 'VB'),
 ('very', 'RB'),
 ('surprised', 'VBN'),
 ('if', 'IN'),
 ('his', 'PRP$'),
 ('departure', 'NN'),
 ('signals', 'VBZ'),
 ('any', 'DT'),
 ('change', 'NN'),
 ('in', 'IN'),
 ('strategy', 'NN'),
 ('or', 'CC'),
 ('change', 'NN'),
 ('in', 'IN'),
 ('profit', 'NN'),
 ('expectations', 'NNS'),
 ('.', '.'),
 ("''", "''")]

**Developing features for out text** <br/>
Features can be: <br/>
1. Capital Letters {generally proper noun, beginning of sentences} <br/>
2. First word {determiners, and propositions} <br/>
3. Last word <br/>
4. Number and alphabets <br/>
5. Contains hyphen <br/>
6. Prefixes and suffiexes <br/>
etc..

In [9]:
import re
def get_word_features(sentence, index):
  """
    Parameters:
      ~sentence: [list of tuples: (word, tag)]~
      sentence: [list of words]
      index: position of word in the list (sentence)
    Returns:
      the feature dict
  """
#   word = sentence[index][0]
#   tag = sentence[index][1]
  word = sentence[index]
  features = {
    'bias': 1.0,
    'BOS': int(index==0),
    'EOS':int(index==len(sentence)-1),
    'is_complete_capital': int(word.upper()==word),
    'prev_word':'' if index==0 else sentence[index-1],
    'next_word':'' if index==len(sentence)-1 else sentence[index+1],
    'word.lower()': word.lower(),
    'suffix_4': word[-4:], # suffixes
    'suffix_3': word[-3:], 
    'suffix_2': word[-2:],
    'prefix_1': word[0],
    'prefix_2': word[:2],
    'prefix_3': word[:3],
    'prefix_4': word[:4],
    'word.isupper()': word.isupper(),
    'word.istitle()': word.istitle(),
    'word.isdigit()': word.isdigit(),
    'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',word)))),
#     'postag': tag,
    'word_has_hyphen': 1 if '-' in word else 0,
  }
  return features

In [10]:
def sent_from_tagged_sent(tagged_sent):
  return [word for word, _ in tagged_sent]

In [11]:
def get_dataset(tagged_sentences):
  """
    Parameters:
      tagged_sentences: [list of tuples: (word, tag)]
    Returns:
      X: [feature set]
      y: [target tags]
  """
  X, y = [], []
  for sentence in tagged_sentences:
    X.append([get_word_features(sent_from_tagged_sent(sentence), index) for index in range(len(sentence))])
    y.append([tag for _, tag in sentence]) 
    
  return X, y

In [12]:
X, y = get_dataset([train_data[0]])
X, y

([[{'bias': 1.0,
    'BOS': 1,
    'EOS': 0,
    'is_complete_capital': 1,
    'prev_word': '',
    'next_word': 'would',
    'word.lower()': 'i',
    'suffix_4': 'I',
    'suffix_3': 'I',
    'suffix_2': 'I',
    'prefix_1': 'I',
    'prefix_2': 'I',
    'prefix_3': 'I',
    'prefix_4': 'I',
    'word.isupper()': True,
    'word.istitle()': True,
    'word.isdigit()': False,
    'is_alphanumeric': 0,
    'word_has_hyphen': 0},
   {'bias': 1.0,
    'BOS': 0,
    'EOS': 0,
    'is_complete_capital': 0,
    'prev_word': 'I',
    'next_word': 'be',
    'word.lower()': 'would',
    'suffix_4': 'ould',
    'suffix_3': 'uld',
    'suffix_2': 'ld',
    'prefix_1': 'w',
    'prefix_2': 'wo',
    'prefix_3': 'wou',
    'prefix_4': 'woul',
    'word.isupper()': False,
    'word.istitle()': False,
    'word.isdigit()': False,
    'is_alphanumeric': 0,
    'word_has_hyphen': 0},
   {'bias': 1.0,
    'BOS': 0,
    'EOS': 0,
    'is_complete_capital': 0,
    'prev_word': 'would',
    'next_word': 'v

In [13]:
X_train, y_train = get_dataset(train_data)
X_test, y_test = get_dataset(test_data)

In [14]:
import sklearn_crfsuite

crf = sklearn_crfsuite.CRF(algorithm="lbfgs", c1=0.01, c2=0.1, max_iterations=150, all_possible_transitions=True)

In [15]:
crf.fit(X_train, y_train)

In [16]:
from sklearn_crfsuite import metrics
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(y_test, y_pred, labels=crf.classes_, digits=3))

              precision    recall  f1-score   support

         PRP      0.997     0.991     0.994       317
          MD      1.000     0.995     0.997       182
          VB      0.942     0.944     0.943       481
          RB      0.930     0.901     0.915       545
         VBN      0.912     0.908     0.910       434
          IN      0.969     0.988     0.979      1988
        PRP$      0.994     0.994     0.994       171
          NN      0.945     0.960     0.952      2639
         VBZ      0.983     0.926     0.954       432
          DT      0.994     0.991     0.993      1607
          CC      0.998     0.995     0.997       440
         NNS      0.963     0.975     0.969      1160
           .      1.000     1.000     1.000       767
          ''      1.000     1.000     1.000       148
         NNP      0.963     0.983     0.973      1875
         VBD      0.948     0.939     0.943       603
      -NONE-      1.000     1.000     1.000      1271
          TO      1.000    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
