Данные для обучения возьмем из проекта [opencorpora.org](http://opencorpora.org/?page=downloads)
Скачаем размеченный корпус со снятой омонимией без UNKN как `oc.xml`

Разберем XML:

In [1]:
from xml.etree import ElementTree as ET

with open("/home/katty/Загрузки/os.xml", mode="r", encoding="utf8") as f:
    doc = ET.parse(f)

In [2]:
all_sentences = []

for text in doc.getroot():
    paragraphs = text.find('paragraphs')
    for paragraph in paragraphs:
        for sentence in paragraph:
            tokens = sentence.find('tokens')

            new_sentence = []
            all_sentences.append(new_sentence)
            for token in tokens:
                word = token.get('text')
                g = token.find('tfr').find('v').find('l').find('g')
                pos = g.get('v')
                new_sentence.append((word, pos))

In [3]:
len(all_sentences)

10971

In [19]:
train_set = all_sentences[:round(0.8*len(all_sentences))]
test_set =  all_sentences[round(0.8*len(all_sentences)):round(0.9*len(all_sentences))]
valid_set = all_sentences[round(0.9*len(all_sentences)):]

Необходимо научиться превращать слова в набор признаков:

In [20]:

def get_features(sentence, word_index):
    word, pos = sentence[word_index]

    features = {
        '[-2:]': word[-2:],
        '[-3:]': word[-3:],
        '[-1:]': word[-1:]
    }
    if word_index == 0:
        features['START'] = True
    elif word_index == len(sentence) -1:
        features['END'] = True
    else:
        features["PREV"] = sentence[word_index-1][1]

    return features

In [21]:
all_features = [
    [get_features(sentence, ind) for ind in range(len(sentence))]
    for sentence in all_sentences
]

In [22]:
train_features = all_features[:round(0.8*len(all_features))]
test_features =  all_features[round(0.8*len(all_features)):round(0.9*len(all_features))]
valid_features = all_features[round(0.9*len(all_features)):]

In [23]:
all_pos = [
    [pos for word, pos in sentence]
    for sentence in all_sentences
]

train_pos = all_pos[:round(0.8*len(all_pos))]
test_pos=  all_pos[round(0.8*len(all_pos)):round(0.9*len(all_pos))]
valid_pos = all_pos[round(0.9*len(all_pos)):]

# Теперь модель обучена!

In [24]:
%%time

import sklearn_crfsuite

algorithms = ["lbfgs"]
c1_vals = [0.05, 0.075, 0.1, 0.15, 0.15, 0.2, 0.25]
#c2_vals = [1.05, 1.075, 1.1, 1.15, 0.95, 0.8, 1.25]
max_iterations_vals = [100, 200, 300, 400, 500]
best_params = ["", 0, 0]
max_arg = 0


for algo in algorithms:
    for c1_val in c1_vals:
            for max_iterations_val in max_iterations_vals:
                crf = sklearn_crfsuite.CRF(
                    algorithm=algo,
                    c1=c1_val,
                    c2 = 1,
                    max_iterations=max_iterations_val,
                    all_possible_transitions=True
                )
                crf.fit(train_features, train_pos)

                results = []
                for sent in test_set:
                    test_sentence = sent
                    test_features = [
                        [get_features(test_sentence, ind) for ind in range(len(test_sentence))]
                    ]

                    results.append(crf.predict(test_features)[0])
                y_true = []
                y_pred = []

                for i in range(len(results)):
                    for elem in test_pos[i]:
                        y_true.append(elem)
                    for elem in results[i]:
                        y_pred.append(elem)

                from sklearn import metrics

                current_arg = metrics.f1_score(y_true, y_pred, labels=['NOUN',  'VERB',  "ADJF", "ADJS", "COMP","INFN","PRTF","PRTS","GRND","NUMR","ADVB","NPRO","PRED","PREP","CONJ","PRCL","INTJ"], average="weighted")
                if current_arg > max_arg:
                    max_arg = current_arg
                    best_params = [algo, c1_val, max_iterations_val]

print(max_arg, best_params)

0.9072135471096163 ['lbfgs', 0.05, 200]
CPU times: user 6min 27s, sys: 340 ms, total: 6min 27s
Wall time: 6min 28s


In [25]:
from sklearn import metrics
print(max_arg, best_params)
crf = sklearn_crfsuite.CRF(
    algorithm=best_params[0],
    c1=best_params[1],
    c2 = 1,
    max_iterations=best_params[2],
    all_possible_transitions=True
)

crf.fit(train_features, train_pos)


results = []
for sent in valid_set:
    test_sentence = sent
    test_features = [
        [get_features(test_sentence, ind) for ind in range(len(test_sentence))]
    ]

    results.append(crf.predict(test_features)[0])
y_true = []
y_pred = []

for i in range(len(results)):
    for elem in valid_pos[i]:
        y_true.append(elem)
    for elem in results[i]:
        y_pred.append(elem)


print(metrics.classification_report(y_true, y_pred, labels=['NOUN',  'VERB',  "ADJF", "ADJS", "COMP","INFN","PRTS","ADVB","NPRO","PRED","PREP","CONJ","PRCL","INTJ"], digits=3))

0.9072135471096163 ['lbfgs', 0.05, 200]
              precision    recall  f1-score   support

        NOUN      0.923     0.951     0.937      1582
        VERB      0.920     0.938     0.929       480
        ADJF      0.843     0.939     0.888       445
        ADJS      0.933     0.467     0.622        30
        COMP      1.000     0.500     0.667        18
        INFN      0.967     0.956     0.961        91
        PRTS      0.938     0.789     0.857        19
        ADVB      0.688     0.782     0.732       110
        NPRO      0.951     0.901     0.925       192
        PRED      1.000     1.000     1.000        20
        PREP      0.973     0.979     0.976       328
        CONJ      0.849     0.616     0.714        73
        PRCL      0.989     0.898     0.941        98
        INTJ      0.036     0.100     0.053        10

   micro avg      0.905     0.924     0.914      3496
   macro avg      0.858     0.773     0.800      3496
weighted avg      0.911     0.924     0.