In [1]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

# Show the scikit-learn version this notebook is running against.
print(sklearn.__version__)

0.18.1


In [17]:
import _pickle as pickle

import numpy as np
from pycrfsuite import Tagger
from flexcrf_tp.models.linear_chain import (_feat_fun_values,
                                            _compute_all_potentials,
                                            _forward_score,
                                            _backward_score,
                                            _partition_fun_value,
                                            _posterior_score)

from flexcrf_tp.crfsuite2flexcrf import convert_data_to_flexcrf

In [20]:
# Fetch the CoNLL-2002 corpus through NLTK's downloader.
nltk.download('conll2002')
# NOTE(review): the return value of this call is discarded and, not being the
# last expression of the cell, it is not displayed either.
nltk.corpus.conll2002.fileids()
# Materialise the IOB-annotated sentences of the 'esp.train' and 'esp.testb'
# files as plain lists so they can be iterated over several times.
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))


def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a list of strings.

    The description covers the token itself (lower-cased form, 3- and
    2-character suffixes, casing / digit flags, POS tag and its 2-character
    prefix), followed by a smaller set of features for the previous token and
    then for the next token. When a neighbour does not exist, the marker
    ``'BOS'`` (no previous token) or ``'EOS'`` (no next token) is emitted
    instead.

    Parameters
    ----------
    sent : sequence
        Sentence whose items expose the word at index 0 and the POS tag at
        index 1; any further fields are not read here.
    i : int
        Position of the token to describe.

    Returns
    -------
    list of str
        Feature strings, always ordered as: current token, left context,
        right context.
    """
    token, tag = sent[i][0], sent[i][1]
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + tag,
        'postag[:2]=' + tag[:2],
    ]

    # Left context, or the beginning-of-sentence marker when there is none.
    if not i > 0:
        feats.append('BOS')
    else:
        prev_token, prev_tag = sent[i-1][0], sent[i-1][1]
        feats += [
            '-1:word.lower=' + prev_token.lower(),
            '-1:word.istitle=%s' % prev_token.istitle(),
            '-1:word.isupper=%s' % prev_token.isupper(),
            '-1:postag=' + prev_tag,
            '-1:postag[:2]=' + prev_tag[:2],
        ]

    # Right context, or the end-of-sentence marker when there is none.
    if not i < len(sent)-1:
        feats.append('EOS')
    else:
        next_token, next_tag = sent[i+1][0], sent[i+1][1]
        feats += [
            '+1:word.lower=' + next_token.lower(),
            '+1:word.istitle=%s' % next_token.istitle(),
            '+1:word.isupper=%s' % next_token.isupper(),
            '+1:postag=' + next_tag,
            '+1:postag[:2]=' + next_tag[:2],
        ]

    return feats


def sent2features(sent):
    """Return one feature list per token of ``sent``, in sentence order."""
    positions = range(len(sent))
    return list(map(lambda pos: word2features(sent, pos), positions))

def sent2labels(sent):
    """Return the label (third field) of every ``(token, postag, label)`` item."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first field) of every ``(token, postag, label)`` item."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

# Turn every sentence into its per-token feature lists (X) and the matching
# sequence of labels (y), for both the training and the test split.
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

# Persist the test split. The context manager guarantees the file handle is
# flushed and closed, even if pickling raises part-way through.
with open('CRFSUITE_TEST_DATA_FILE', 'wb') as test_data_file:
    pickle.dump({'X': X_test, 'y': y_test}, test_data_file)

[nltk_data] Downloading package conll2002 to
[nltk_data]     /Users/auredt7892/nltk_data...
[nltk_data]   Package conll2002 is already up-to-date!


In [21]:
# Display the feature lists extracted for the first training sentence.
X_train[0]

[['bias',
  'word.lower=melbourne',
  'word[-3:]=rne',
  'word[-2:]=ne',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'postag=NP',
  'postag[:2]=NP',
  'BOS',
  '+1:word.lower=(',
  '+1:word.istitle=False',
  '+1:word.isupper=False',
  '+1:postag=Fpa',
  '+1:postag[:2]=Fp'],
 ['bias',
  'word.lower=(',
  'word[-3:]=(',
  'word[-2:]=(',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'postag=Fpa',
  'postag[:2]=Fp',
  '-1:word.lower=melbourne',
  '-1:word.istitle=True',
  '-1:word.isupper=False',
  '-1:postag=NP',
  '-1:postag[:2]=NP',
  '+1:word.lower=australia',
  '+1:word.istitle=True',
  '+1:word.isupper=False',
  '+1:postag=NP',
  '+1:postag[:2]=NP'],
 ['bias',
  'word.lower=australia',
  'word[-3:]=lia',
  'word[-2:]=ia',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'postag=NP',
  'postag[:2]=NP',
  '-1:word.lower=(',
  '-1:word.istitle=False',
  '-1:word.isupper=False',
  '-1:postag=Fpa',
  '-1:post

In [None]:
def viterbi_decoder(m_xy, n=None, log_version=True):
    r"""
    Performs MAP inference, determining $y = \argmax_y P(y|x)$, using the
    Viterbi algorithm.

    Parameters
    ----------
    m_xy : ndarray, shape (n_obs, n_labels, n_labels)
        Values of log-potentials ($\log M_i(y_{i-1}, y_i, x)$)
        computed based on feature functions f_xy and/or user-defined potentials
        `psi_xy`. At t=0, m_xy[0, 0, :] contains values of $\log M_1(y_0, y_1)$
        with $y_0$ the fixed initial state. The first label axis indexes the
        previous label, the second one the current label.

    n : integer, default=None
        Time position up to which to decode the optimal sequence; if not
        specified (default) the score is computed for the whole sequence.
        NOTE(review): interpreted here as the 0-based index of the last
        decoded position, inclusive (so the default is ``n_obs - 1``) --
        confirm against the other scoring helpers of the package.

    log_version : boolean, default=True
        Domain in which path scores are accumulated. If True (default) the
        log-potentials are summed (max-sum); if False they are exponentiated
        and multiplied (max-product). `m_xy` is assumed to hold log-potentials
        in both cases, so both settings select the same path unless the
        exponentials under- or overflow.
        NOTE(review): semantics inferred from the parameter name -- confirm.

    Returns
    -------
    y_pred : ndarray of int, shape (n + 1,)
        Predicted optimal sequence of labels, i.e. shape (n_obs,) when `n` is
        left to its default. Ties are broken in favour of the lowest label
        index.

    Raises
    ------
    ValueError
        If `m_xy` is not of shape (n_obs, n_labels, n_labels) or if `n` does
        not designate a valid position.

    """
    potentials = np.asarray(m_xy, dtype=float)
    if potentials.ndim != 3 or potentials.shape[1] != potentials.shape[2]:
        raise ValueError(
            "m_xy must have shape (n_obs, n_labels, n_labels), got %s"
            % (potentials.shape,))
    n_obs, n_labels = potentials.shape[0], potentials.shape[1]

    # Nothing to decode for an empty sequence.
    if n is None and n_obs == 0:
        return np.empty(0, dtype=int)

    last = n_obs - 1 if n is None else int(n)
    if not 0 <= last < n_obs:
        raise ValueError(
            "n must be in [0, %d], got %r" % (n_obs - 1, n))

    if log_version:
        combine = np.add
    else:
        # Max-product on the actual potentials M_i = exp(m_xy[i]).
        potentials = np.exp(potentials)
        combine = np.multiply

    # best[j]: score of the best partial path ending with label j at the
    # current position; initialised from the fixed initial state y_0 = 0.
    best = potentials[0, 0, :].copy()
    # backptr[t, j]: label at t-1 on the best path that ends with j at t.
    backptr = np.zeros((last + 1, n_labels), dtype=int)

    for t in range(1, last + 1):
        # candidates[i, j]: best path ending in i at t-1, extended by i -> j.
        candidates = combine(best[:, np.newaxis], potentials[t])
        backptr[t] = np.argmax(candidates, axis=0)
        best = np.max(candidates, axis=0)

    # Backtrack from the best final label.
    y_pred = np.empty(last + 1, dtype=int)
    y_pred[last] = np.argmax(best)
    for t in range(last, 0, -1):
        y_pred[t - 1] = backptr[t, y_pred[t]]

    return y_pred
