In [97]:
from conllu import parse_incr
from sklearn_crfsuite import CRF, metrics
import os

# Подготовка данных

Для работы был выбран корпус SynTagRus https://github.com/UniversalDependencies/UD_Russian-SynTagRus/tree/master

In [98]:
stop_toks = ["SYM", "PUNCT", "X"]

Изначально хотел привести данные в формат `предыдущий токен, текущий токен, следующий токен, POS-теги из pymorphy2`, но в таком случае разметка всех данных с помощью pymorphy2 проходила очень долго. Вместо этого принял решение использовать косвенные признаки: приставки и окончания слов, а также их длина.

In [99]:
def get_features(sent) -> list:

    new_sent = list()
    tags = list
    for i in range(len(sent)):
        if sent[i]["upos"] not in stop_toks:
            new_sent.append(sent[i])

    temp = dict()
    res = list()
    for i in range(len(new_sent)):
        temp = dict()
        temp["current"] = new_sent[i]["lemma"]
        if i - 1 > 0 and i + 1 < len(new_sent):
            temp["previous"] = new_sent[i - 1]["lemma"]
            temp["next"] = new_sent[i + 1]["lemma"]
        elif i - 1 <= 0 and i + 1 < len(new_sent):
            temp["previous"] = ''
            temp["next"] = new_sent[i + 1]["lemma"]
        elif i - 1 > 0 and i + 1 >= len(new_sent):
            temp["previous"] = new_sent[i - 1]["lemma"]
            temp["next"] = ''
        else:
            temp["previous"] = ''
            temp["next"] = ''
        
        temp["length"] = len(new_sent[i]["lemma"])
        temp["prefix[:4]"] = new_sent[i]["lemma"][:4]
        temp["prefix[:3]"] = new_sent[i]["lemma"][:3]
        temp["prefix[:2]"] = new_sent[i]["lemma"][:2]
        temp["postfix[-3:]"] = new_sent[i]["lemma"][-3:]
        temp["postfix[-2:]"] = new_sent[i]["lemma"][-2:]
        temp["postfix[-4:]"] = new_sent[i]["lemma"][-4:]

        res.append(temp)
    
    return res

In [100]:
def get_labels(sent) -> list:
    res = list()
    for word in sent:
        if word["upos"] not in stop_toks:
            res.append(word["upos"])
    
    return res

In [101]:
def get_data(ipath: str):
    X = []
    y = []
    with open(ipath, 'r', encoding="utf-8") as data_file:
        for tokenlist in parse_incr(data_file):
            X.append(get_features(tokenlist))
            y.append(get_labels(tokenlist))
    
    return X, y

In [102]:
train_path = os.path.join(os.getcwd(), "deep", "UD_Russian-SynTagRus", "ru_syntagrus-ud-train.conllu")
test_path = os.path.join(os.getcwd(), "deep", "UD_Russian-SynTagRus", "ru_syntagrus-ud-test.conllu")
dev_path = os.path.join(os.getcwd(), "deep", "UD_Russian-SynTagRus", "ru_syntagrus-ud-dev.conllu")

In [103]:
X_train, y_train = get_data(train_path)
X_test, y_test = get_data(test_path)
X_dev, y_dev = get_data(dev_path)

# Построение модели

In [104]:
model = CRF(
    algorithm="lbfgs",
    c1 = 0.1,
    c2 = 0.1,
    max_iterations = 150,
    all_possible_states=False,
    verbose = True
)

In [106]:
model.fit(X_train, y_train, X_dev, y_dev)

loading training data to CRFsuite: 100%|██████████| 48814/48814 [00:11<00:00, 4171.40it/s]





loading dev data to CRFsuite: 100%|██████████| 6584/6584 [00:01<00:00, 4075.35it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 300009
Seconds required: 2.969

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 150
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=3.05  loss=1574932.54 active=295282 precision=0.019  recall=0.067  F1=0.030  Acc(item/seq)=0.292 0.008  feature_norm=0.50
Iter 2   time=0.99  loss=1495975.49 active=288451 precision=0.019  recall=0.067  F1=0.030  Acc(item/seq)=0.292 0.008  feature_norm=0.54
Iter 3   time=1.18  loss=1482697.52 active=295165 precision=0.019  recall=0.067  F1=0.030  Acc(item/seq)=0.292 0.008  feature_norm=0.58
Iter 4   time=1.12  loss=1459879.60 active=290466 precision=0.019  recall=0.067  F1=0.030  Acc(item/seq)=0.292 0.008  feature_norm=0.69
Iter 5   time=1.44  loss=1425155.72 

CRF(algorithm='lbfgs', all_possible_states=False, c1=0.1, c2=0.1,
    max_iterations=150, verbose=True)

In [107]:
labels = list(model.classes_)
len(labels)

15

In [108]:
y_pred = model.predict(X_test)

In [109]:
y_pred[0]

['ADP',
 'ADJ',
 'NOUN',
 'NOUN',
 'NOUN',
 'PROPN',
 'NOUN',
 'ADP',
 'PROPN',
 'VERB',
 'ADP',
 'NUM',
 'NOUN']

In [110]:
y_test[0]

['ADP',
 'ADJ',
 'NOUN',
 'NOUN',
 'NOUN',
 'PROPN',
 'NOUN',
 'ADP',
 'PROPN',
 'VERB',
 'ADP',
 'NUM',
 'NOUN']

In [111]:
print(metrics.flat_classification_report(y_test, y_pred, labels, digits=3))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        NOUN      0.997     0.996     0.997     28663
         ADJ      0.993     0.998     0.995     10992
       PROPN      0.996     0.991     0.993      4754
         AUX      0.913     0.935     0.924      1042
        VERB      0.993     0.994     0.994     13048
         ADP      0.999     0.999     0.999     11516
         ADV      0.977     0.972     0.975      5849
       CCONJ      0.955     0.976     0.966      4156
        PART      0.954     0.924     0.939      3322
        PRON      0.986     0.978     0.982      5024
         DET      0.995     0.989     0.992      2755
       SCONJ      0.909     0.946     0.927      2048
         NUM      0.976     0.983     0.980      2691
           _      0.000     0.000     0.000         0
        INTJ      1.000     0.273     0.429        11

   micro avg      0.988     0.988     0.988     95871
   macro avg      0.910     0.864     0.873     95871
weighted avg      0.988   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
