In [2]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

In [7]:
# Open the Reuters corpus as UTF-8 text and build a BeautifulSoup tree from it
with codecs.open("../datasets/CRF/reuters.xml", mode="r", encoding="utf-8") as corpus_file:
    soup = bs(corpus_file, features="html5lib")

In [22]:
# Build one list of (word, label) pairs per <document> element of the corpus
docs = []
for elem in soup.find_all("document"):
    texts = []
    # Walk the direct children of the element under "textwithnamedentities"
    for c in elem.find("textwithnamedentities").children:
        # isinstance() is the idiomatic type test (it also accepts Tag
        # subclasses); children that are not tags are skipped.
        if not isinstance(c, Tag):
            continue
        # "N": part of a named entity, "I": irrelevant word
        label = "N" if c.name == "namedentityintext" else "I"
        # Split on single spaces; consecutive spaces yield empty strings,
        # which are dropped here.
        for w in c.text.split(" "):
            if w:
                texts.append((w, label))
    docs.append(texts)

In [11]:
# Number of documents parsed from the corpus (call form works on Python 2 and 3)
print(len(docs))

128


In [23]:
# First document as a list of (word, label) pairs (call form works on Python 2 and 3)
print(docs[0])

[(u'Paxar', 'N'), (u'Corp', 'N'), (u'said', 'I'), (u'it', 'I'), (u'has', 'I'), (u'acquired', 'I'), (u'Thermo-Print', 'N'), (u'GmbH', 'N'), (u'of', 'I'), (u'Lohn', 'N'), (u',', 'I'), (u'West', 'N'), (u'Germany', 'N'), (u',', 'I'), (u'a', 'I'), (u'distributor', 'I'), (u'of', 'I'), (u'Paxar', 'N'), (u'products,', 'I'), (u'for', 'I'), (u'undisclosed', 'I'), (u'terms.', 'I')]


In [25]:
import nltk
# Augment every (word, label) pair with the POS tag of the word
data = []
for doc in docs:

    # Obtain the list of tokens in the document
    tokens = [token for token, _label in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, its POS tag, and its label. The word comes from the
    # original document; only the tag is taken from the tagger output.
    data.append([
        (word, pos, label)
        for (word, label), (_tagged_word, pos) in zip(doc, tagged)
    ])

The output of the above process will be a list of documents, each of which is a list of tuples with the word, its POS tag and its label:

In [26]:
# Display the first document: a list of (word, POS tag, label) tuples
data[0]

[(u'Paxar', 'NNP', 'N'),
 (u'Corp', 'NNP', 'N'),
 (u'said', 'VBD', 'I'),
 (u'it', 'PRP', 'I'),
 (u'has', 'VBZ', 'I'),
 (u'acquired', 'VBN', 'I'),
 (u'Thermo-Print', 'NNP', 'N'),
 (u'GmbH', 'NNP', 'N'),
 (u'of', 'IN', 'I'),
 (u'Lohn', 'NNP', 'N'),
 (u',', ',', 'I'),
 (u'West', 'NNP', 'N'),
 (u'Germany', 'NNP', 'N'),
 (u',', ',', 'I'),
 (u'a', 'DT', 'I'),
 (u'distributor', 'NN', 'I'),
 (u'of', 'IN', 'I'),
 (u'Paxar', 'NNP', 'N'),
 (u'products,', 'NN', 'I'),
 (u'for', 'IN', 'I'),
 (u'undisclosed', 'JJ', 'I'),
 (u'terms.', 'NN', 'I')]

In [27]:
def word2features(doc, i):
    """Describe the token at position ``i`` of ``doc`` as a list of feature strings.

    ``doc`` is indexed as ``doc[k][0]`` for the word and ``doc[k][1]`` for its
    POS tag. The result always starts with the features of the token itself,
    followed by either the features of the previous token or the marker
    ``'BOS'``, followed by either the features of the next token or the
    marker ``'EOS'``.
    """
    token, tag = doc[i][0], doc[i][1]

    # Features every token gets, regardless of its position
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'postag=' + tag
    ]

    # Left context: nothing precedes the first token, so flag the
    # 'beginning of a document' instead
    if i <= 0:
        feats.append('BOS')
    else:
        prev_token, prev_tag = doc[i-1][0], doc[i-1][1]
        feats += [
            '-1:word.lower=' + prev_token.lower(),
            '-1:word.istitle=%s' % prev_token.istitle(),
            '-1:word.isupper=%s' % prev_token.isupper(),
            '-1:word.isdigit=%s' % prev_token.isdigit(),
            '-1:postag=' + prev_tag
        ]

    # Right context: nothing follows the last token, so flag the
    # 'end of a document' instead
    if i >= len(doc)-1:
        feats.append('EOS')
    else:
        next_token, next_tag = doc[i+1][0], doc[i+1][1]
        feats += [
            '+1:word.lower=' + next_token.lower(),
            '+1:word.istitle=%s' % next_token.istitle(),
            '+1:word.isupper=%s' % next_token.isupper(),
            '+1:word.isdigit=%s' % next_token.isdigit(),
            '+1:postag=' + next_tag
        ]

    return feats

In [30]:
from sklearn.model_selection import train_test_split

# Build the per-token feature lists for a whole document
def extract_features(doc):
    """Return one ``word2features`` result per position of ``doc``, in order."""
    per_token = []
    for position in range(len(doc)):
        per_token.append(word2features(doc, position))
    return per_token

# Collect the label of every token in a document
def get_labels(doc):
    """Return the label of each ``(token, postag, label)`` triple in ``doc``, in order."""
    collected = []
    for _token, _postag, tag_label in doc:
        collected.append(tag_label)
    return collected

# One feature sequence and one label sequence per document
X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

# Hold out 20% of the documents for testing. The shuffle seed is fixed so
# that the split, and every result computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [33]:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=True)

# Feed every training document (feature sequence + label sequence) to the trainer
for doc_features, doc_labels in zip(X_train, y_train):
    trainer.append(doc_features, doc_labels)

# Hyper-parameters of the model
trainer.set_params({
    'c1': 0.1,                # coefficient for L1 penalty
    'c2': 0.01,               # coefficient for L2 penalty
    'max_iterations': 200,    # maximum number of iterations
    # whether to include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

# Run the training; the file name passed here is where the model
# is saved once training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13117
Seconds required: 0.045

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5427.568082
Feature norm: 1.000000
Error norm: 5991.676403
Active features: 12673
Line search trials: 1
Line search step: 0.000045
Seconds required for this iteration: 0.012

***** Iteration #2 *****
Loss: 4349.991006
Feature norm: 0.842893
Error norm: 5369.050130
Active features: 12779
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #3 *****
Loss: 3831.990000
Feature norm: 0.820710
Error norm: 11722.892541
Active features: 8715
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

***** Iteration #49 *****
Loss: 222.453567
Feature norm: 41.063764
Error norm: 12.851035
Active features: 2289
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #50 *****
Loss: 221.331294
Feature norm: 41.170921
Error norm: 11.063810
Active features: 2208
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #51 *****
Loss: 220.403704
Feature norm: 41.429101
Error norm: 26.444005
Active features: 2172
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #52 *****
Loss: 219.686809
Feature norm: 41.604241
Error norm: 15.829741
Active features: 2152
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #53 *****
Loss: 219.080724
Feature norm: 41.855410
Error norm: 8.592471
Active features: 2138
Line search trials: 1
Line search step: 1.000000
Seconds required for this

***** Iteration #114 *****
Loss: 211.514373
Feature norm: 44.141030
Error norm: 3.705594
Active features: 1650
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.012

***** Iteration #115 *****
Loss: 211.507466
Feature norm: 44.147832
Error norm: 7.411465
Active features: 1650
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.013

***** Iteration #116 *****
Loss: 211.484703
Feature norm: 44.160367
Error norm: 2.988694
Active features: 1650
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.013

***** Iteration #117 *****
Loss: 211.477864
Feature norm: 44.167882
Error norm: 6.736515
Active features: 1645
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.013

***** Iteration #118 *****
Loss: 211.476812
Feature norm: 44.184197
Error norm: 10.435710
Active features: 1645
Line search trials: 1
Line search step: 1.000000
Seconds required for th

***** Iteration #175 *****
Loss: 210.775858
Feature norm: 44.413373
Error norm: 1.201778
Active features: 1600
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #176 *****
Loss: 210.771059
Feature norm: 44.412756
Error norm: 2.298704
Active features: 1594
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #177 *****
Loss: 210.764690
Feature norm: 44.408309
Error norm: 1.226555
Active features: 1590
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #178 *****
Loss: 210.759169
Feature norm: 44.403666
Error norm: 2.397825
Active features: 1587
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.007

***** Iteration #179 *****
Loss: 210.753325
Feature norm: 44.400312
Error norm: 1.605628
Active features: 1589
Line search trials: 1
Line search step: 1.000000
Seconds required for thi

In [34]:
# Load the trained model and tag every test document
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at one document of the testing set (fixed index)
i = 12

# Each token's feature list holds 'word.lower=<token>' at index 1; split on
# the first '=' only, so a token that itself contains '=' is not truncated.
sample_words = [feats[1].split("=", 1)[1] for feats in X_test[i]]

# Print "<lower-cased word> (<predicted label>)". The loop names are chosen
# so that the global label list `y` is not overwritten.
for word, predicted_tag in zip(sample_words, y_pred[i]):
    print("%s (%s)" % (word, predicted_tag))


synergen (N)
inc (N)
said (I)
it (I)
has (I)
filed (I)
for (I)
an (I)
offering (I)
of (I)
1,500,000 (I)
common (I)
shares (I)
through (I)
underwriters (I)
led (I)
by (I)
alex. (N)
brown (N)
inc (N)
absb (N)
, (I)
hambrecht (N)
and (I)
quist (N)
inc (N)
and (I)
boettcher (N)
and (N)
co (N)
inc (N)
. (I)
it (I)
said (I)
proceeds (I)
will (I)
be (I)
used (I)
to (I)
fund (I)
research (I)
and (I)
testing (I)
and (I)
for (I)
other (I)
general (I)
corporate (I)
purposes. (I)
reuter (N)


In [35]:
import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Derive the class names (ordered by index) and their indices from the
# mapping above, so the names shown in the report cannot drift from it.
target_names = sorted(labels, key=labels.get)
class_indices = [labels[name] for name in target_names]

# Print out the classification report. Passing `labels` explicitly keeps
# the rows aligned with `target_names` even if a class is absent from a split.
print(classification_report(
    truths, predictions,
    labels=class_indices,
    target_names=target_names))

             precision    recall  f1-score   support

          I       0.98      0.99      0.98      3595
          N       0.90      0.81      0.85       413

avg / total       0.97      0.97      0.97      4008

