In [1]:
data = [(['Linux', 'is', 'the', 'best', 'OS'], ['OS','IR','IR','IR','IR']),
(['Ubuntu', 'is', 'my', 'favourite', 'OS'], ['OS','IR','IR','IR','IR'])]

In [2]:
corpus = []
for (doc, tags) in data:
    doc_tag = []
    for word, tag in zip(doc,tags):
        doc_tag.append((word, tag))
    corpus.append(doc_tag)
print(corpus)

[[('Linux', 'OS'), ('is', 'IR'), ('the', 'IR'), ('best', 'IR'), ('OS', 'IR')], [('Ubuntu', 'OS'), ('is', 'IR'), ('my', 'IR'), ('favourite', 'IR'), ('OS', 'IR')]]


In [4]:
def doc2features(doc, i):
    word = doc[i][0]
    
    # Features from current word
    features={
        'word.word': word,
    }
    # Features from previous word
    if i > 0:
        prevword = doc[i-1][0]
        features['word.prevword'] = prevword
    else:
        features['BOS'] = True # Special "Beginning of Sequence" tag
        
    # Features from next word
    if i < len(doc)-1:
        nextword = doc[i+1][0]
        features['word.nextword'] = nextword
    else:
        features['EOS'] = True # Special "End of Sequence" tag
    return features

def extract_features(doc):
    return [doc2features(doc, i) for i in range(len(doc))]
X = [extract_features(doc) for doc in corpus]
print(X)

[[{'word.nextword': 'is', 'BOS': True, 'word.word': 'Linux'}, {'word.nextword': 'the', 'word.prevword': 'Linux', 'word.word': 'is'}, {'word.nextword': 'best', 'word.prevword': 'is', 'word.word': 'the'}, {'word.nextword': 'OS', 'word.prevword': 'the', 'word.word': 'best'}, {'EOS': True, 'word.prevword': 'best', 'word.word': 'OS'}], [{'word.nextword': 'is', 'BOS': True, 'word.word': 'Ubuntu'}, {'word.nextword': 'my', 'word.prevword': 'Ubuntu', 'word.word': 'is'}, {'word.nextword': 'favourite', 'word.prevword': 'is', 'word.word': 'my'}, {'word.nextword': 'OS', 'word.prevword': 'my', 'word.word': 'favourite'}, {'EOS': True, 'word.prevword': 'favourite', 'word.word': 'OS'}]]


In [5]:
def get_labels(doc):
    return [tag for (token,tag) in doc]
y = [get_labels(doc) for doc in corpus]
print(y)

[['OS', 'IR', 'IR', 'IR', 'IR'], ['OS', 'IR', 'IR', 'IR', 'IR']]


In [7]:
import sklearn_crfsuite
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)
crf.fit(X, y);

In [9]:
test = [['CentOS', 'is', 'my', 'favourite', 'OS']]
X_test = extract_features(test)
print(crf.predict_single(X_test))

['OS']
