In [1]:
# !pip install transformers==3.1.0
import torch
import transformers
import itertools

In [2]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [58]:
# pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git#egg=sklearn_crfsuite
from conllu import parse
from conllu import parse_tree
import pandas as pd
from razdel import tokenize
import re
import numpy as np
from conllu.models import TokenList, Token
import random

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pickle

import scipy
import scipy.stats
from sklearn.metrics import make_scorer
# from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from collections import Counter

In [78]:
model = transformers.BertModel.from_pretrained('bert-base-multilingual-cased')
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def get_align(src, tgt_idx):

    tgt = ' '.join(list(tgt_idx.keys()))

    sent_src, sent_tgt = src.strip().split(), tgt.strip().split()
    token_src, token_tgt = [tokenizer.tokenize(word) for word in sent_src], [tokenizer.tokenize(word) for word in sent_tgt]
    wid_src, wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_src], [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
    ids_src, ids_tgt = tokenizer.prepare_for_model(list(itertools.chain(*wid_src)), return_tensors='pt', model_max_length=tokenizer.model_max_length, truncation=True)['input_ids'], tokenizer.prepare_for_model(list(itertools.chain(*wid_tgt)), return_tensors='pt', truncation=True, model_max_length=tokenizer.model_max_length)['input_ids']
    sub2word_map_src = []
    for i, word_list in enumerate(token_src):
        sub2word_map_src += [i for x in word_list]
    sub2word_map_tgt = []
    for i, word_list in enumerate(token_tgt):
        sub2word_map_tgt += [i for x in word_list]

    # alignment
    align_layer = 8
    threshold = 0.9
    model.eval()
    with torch.no_grad():
        out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
        out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
        dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))
        softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
        softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)
        softmax_inter = (softmax_srctgt > threshold)*(softmax_tgtsrc > threshold)

    align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
    align_words = set()
    for i, j in align_subwords:
        align_words.add( (sub2word_map_src[i], sub2word_map_tgt[j]) )


    if len(list(align_words)) == 0:
        return random.choice(list(tgt_idx.keys()))
    else:
        for i, j in sorted(align_words):
            return sent_tgt[j]
    

# get_align(tgt, src)

In [96]:
class Synthesized_sent:
    def __init__(self, sent_source, sent_target):
        
        self.sent_source = sent_source
        self.sent_target = sent_target
        
        self.rows_source = self._split_rows(self.sent_source)
        self.rows_target = self._split_rows(self.sent_target)
        self.sent_synt = TokenList([])
        self.synt = TokenList([])
        self.synt_target = TokenList([])
        self.query = []
        self.clausal_tags = ['root', 'parataxis', 'csubj', 'xcomp', 'ccomp', 'advcl', 'acl', 'conj']
        self.POS_tags = ['ADJ', 'ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'VERB', 'X']
        
        
    
    def _split_rows(self, sentence):
        
        """split the tree on rows using metadata info"""
        
        list_rows = [x.split() for x in re.split('\d{4}\s', sentence.metadata['text'])[1:]]
        index_word = 0
        sent_tokens = []

        for row in list_rows:
            row_heads = []
            row_id = []
            row_tokens = TokenList([])
            for word in row:
                row_tokens.append(Token(id = sentence[index_word]['id'], 
                            form = sentence[index_word]['form'], 
                            lemma = sentence[index_word]['lemma'],
                            upos = sentence[index_word]['upos'],
                            head = sentence[index_word]['head'],
                            deprel = sentence[index_word]['deprel']))
                index_word += 1

            sent_tokens.append(row_tokens)
        return sent_tokens

    
    
    def predict_deprel(self):
        
        """predict tag deprel using the pretrained model CRF"""
        
        train_sents = []
        train_sents.append((trans2format(self.sent_target.filter(id=lambda x: type(x) is int), self.sent_source)))
        X_train = [sent2features(s) for s in train_sents]
        return crf.predict(X_train)[0]

    
            
    def fill_deprel(self):
        
        """fills deprel tags in the tree"""
        
        tags = self.predict_deprel()
        for i, tok in enumerate(self.sent_target.filter(id=lambda x: type(x) is int)):
            tok['pred_deprel'] = tags[i]            

     

    def fill_synt_sent(self, word_rus):
        
        """fills syntenized kyrgyz tree self.synt_target"""

        for id_row, row in enumerate(self.rows_source):
            if list(row.filter(id = word_rus['id'])) != []:
                tgt = {}
                list_upos = {x['form']: x['id'] for x in self.rows_target[id_row].filter(upos = word_rus['upos']).filter(deprel = word_rus['deprel'])}

                index_in_row = [y['id'] for y in self.rows_target[id_row]]
                list_deprel = {x['form']: x['id'] for x in self.sent_target.filter(pred_deprel = word_rus['deprel']) if x['id'] in index_in_row}
                
                tgt.update(list_upos)
                tgt.update(list_deprel)


                if len(tgt.keys()) != 0:
                    if len(tgt.keys()) > 1:
                        aligned_word = get_align(word_rus['form'], tgt)
                    elif len(tgt.keys()) == 1:
                        aligned_word = list(tgt.keys())[0]

                    self.synt_target.append(Token(rus_deprel = word_rus['deprel'],
                                                  aligned_word_form = aligned_word,
                                                  id_in_kyr_sent = tgt[aligned_word], 
                                                  id_rus = word_rus['id'],
                                                  id_head_rus = word_rus['head']
                    ))
                    
        
    
    def parse_rus_sent(self, word):
        
        """itterates russian tree and fills synt kyrgyz tree"""
        
        if word.children != None:
            self.query += word.children

        self.fill_synt_sent(word.token)
        if self.query != []:
            
            return self.parse_rus_sent(self.query.pop(0))
        
        
    
    def head_renumerate(self):
        for tok in self.synt.filter(id=lambda x: type(x) is int):

            if tok['predicted'] == 'root':
                self.synt[tok['id']-1]['head'] = 0
            if tok['rus_head'] != '_':
                for word_match in self.synt.filter(rus_word_id = tok['rus_head']).filter(id=lambda x: type(x) is int):

                    self.synt[tok['id']-1]['head'] = word_match['id']
    

    def find_head_of_the_clause(self, token):
        for row in self.rows_target:
            list_ids = [x['id'] for x in row.filter(id=lambda x: type(x) is int)]
            if token['id'] in list_ids:
                for id_tok in list_ids:
                    if self.sent_target.filter(id=id_tok)[0]['pred_deprel'] in self.clausal_tags:
                        return id_tok
                
                
    def easy_rules(self):
        
        """rules for case, appos, cop, det"""
        
        for tok in self.sent_target.filter(id=lambda x: type(x) is int):
            if tok['pred_deprel'] == 'case':
                tok['pred_head'] = int(tok['id']) - 1
                
            if tok['pred_deprel'] == 'cop':
                tok['pred_head'] = int(tok['id']) - 1
                
            elif tok['pred_deprel'] == 'appos':
                tok['pred_head'] = int(tok['id']) - 1
                
            elif tok['pred_deprel'] == 'aux':
                tok['pred_head'] = int(tok['id']) - 1
                
            elif tok['pred_deprel'] == 'det' and tok['pred_head'] == '_':
                for row in self.rows_target: 
                    list_id_row = [x['id'] for x in row]
                    if tok['id'] in list_id_row:
                        break
                id_pred = self.sent_target.filter(id=lambda x: x in list_id_row).filter(upos = 'NOUN')
                if len(id_pred) != 0:
                    tok['pred_head'] = id_pred[0]['id']
                    
            elif tok['pred_deprel'] == 'root' and tok['pred_head'] != 0:
                tok['pred_head'] = 0
                
            elif tok['pred_deprel'] == 'punct':
                head_clause = self.find_head_of_the_clause(tok)
                
                if head_clause != None:
                    tok['pred_head'] = head_clause
                    

                    
    def add_dep_to_target(self):
        for tok in self.synt_target:
            for x in self.synt_target.filter(id_head_rus = tok['id_rus']):
                self.sent_target[x['id_in_kyr_sent'] - 1]['pred_head'] = tok['id_in_kyr_sent']
#                 self.sent_target[x['id_in_kyr_sent'] - 1]['pred_deprel'] = tok['rus_deprel']

        for i, x in enumerate(self.sent_target):
            if 'pred_head' not in self.sent_target[i]:
                self.sent_target[i]['pred_head'] = '_'
                
        self.easy_rules()
        
                
    def cal_uas(self):
        score_uas = 0
        score_las = 0
        for i, tok in enumerate(self.sent_target.filter(id=lambda x: type(x) is int)):
            if tok['pred_head'] == tok['head'] and tok['pred_deprel'] == tok['deprel']:
                score_uas += 1
            elif tok['pred_head'] == tok['head']:
                score_las += 1
        score_uas = score_uas/len(self.sent_target.filter(id=lambda x: type(x) is int))
        score_las = score_las/len(self.sent_target.filter(id=lambda x: type(x) is int))
        return score_uas, score_las

                            
    def fill_gapes(self):
        root_id = self.sent_target.to_tree().token['id']
        for x in self.sent_target.filter(pred_head = '_').filter(id=lambda x: type(x) is int):
            x['pred_head'] = root_id
            


In [80]:
def word2features(sent, i):

    word = sent[i][0]
    lemma = sent[i][1]
    postag = sent[i][2]
    set_deprel = [x['deprel'] for x in sent[i][4]]
#     print(set_deprel)
#     id_row = sent[5]
    
    rus_deprel = [x['deprel'] for x in sent[i][4].filter(upos = postag)]
    


            
    features = {
#         'bias': 1.0,
        'word.lower()': word.lower(),
        'len(word)': len(word),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'postag': postag,
        'lemma.lower()': lemma.lower(),
#         'id_row': id_row,
        'id_word_in_row': i,
        'lemma.lower() == word.lower()':lemma.lower() == word.lower()
    }
    

    feat_transl = {str(i)+'_rus_deprel': x for i, x in enumerate(rus_deprel)}
    features.update(feat_transl)

    
    if i > 1:
        word1 = sent[i-2][0]
        lemma1 = sent[i-2][1]
        postag1 = sent[i-2][2]
        features.update({
            '-2:word.lower()': word1.lower(),
            '-2:len(word)': len(word1),
            '-2:postag': postag1,
            '-2:lemma.lower()': lemma1.lower(),
            '-2:id_word_in_row': i,
            '-2:lemma.lower() == word.lower()': lemma1.lower() == word1.lower()
        })

    if i > 0:
        word1 = sent[i-1][0]
        lemma1 = sent[i-1][1]
        postag1 = sent[i-1][2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:len(word)': len(word1),
            '-1:postag': postag1,
            '-1:lemma.lower()': lemma1.lower(),
            '-1:id_word_in_row': i,
            '-1:lemma.lower() == word.lower()': lemma1.lower() == word1.lower()
        })
        
    if i < len(sent)-2:
        word1 = sent[i+2][0]
        lemma1 = sent[i+2][1]
        postag1 = sent[i+2][2]
        features.update({
            '+2:word.lower()': word1.lower(),
            '+2:len(word)': len(word1),
            '+2:postag': postag1,
            '+2:lemma.lower()' : lemma1.lower(),
            '+2:id_word_in_row': i,
            '+2:lemma.lower() == word.lower()':lemma1.lower() == word1.lower()
        })
            
            
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        lemma1 = sent[i+1][1]
        postag1 = sent[i+1][2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:len(word)': len(word1),
            '+1:postag': postag1,
            '+1:lemma.lower()' : lemma1.lower(),
            '+1:id_word_in_row': i,
            '+1:lemma.lower() == word.lower()':lemma1.lower() == word1.lower()
        })


    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, lemma, postag, label, set_deprel in sent]

def sent2tokens(sent):
    return [token for token, lemma, postag, label, set_deprel in sent]



def trans2format(sent_ky, sent_ru):
    new_sent = []   
    for word in sent_ky:
        new_sent.append((word['form'], word['lemma'], word['upos'], word['deprel'], sent_ru))
    return new_sent

In [81]:
# загрузить данные

folder = 'path/to_dir/'
with open(folder + 'kyr.conllu', 'r', encoding = 'utf-8') as f:
    file_ky = parse(f.read())


with open(folder + 'rus.conllu', 'r', encoding = 'utf-8') as f:
    file_ru = parse(f.read())

map_ky_ru = {}
for ru_sent in file_ru:
    ky_sent = [x for x in file_ky if x.metadata['sent_id'] == ru_sent.metadata['sent_id']]
    if len(ky_sent) == 1:
        map_ky_ru[ru_sent.metadata['sent_id']] = {'ru': ru_sent, 'ky':ky_sent[0]}
    else:
        pass

# map_ky_ru['5134']['ky']

In [82]:
# разбить на трейн и текст и обучить модель

train_sents = []
test_sents = []

with open('test_ids.txt', 'r', encoding = 'utf-8') as f:
    test_sents_id = sorted(f.read().split('\n'))

for i, ky_ru_sent in map_ky_ru.items():

    syntenizer = Synthesized_sent(map_ky_ru[i]['ru'], map_ky_ru[i]['ky'])
    if i in ['4059', '4647', '4698', '4858', '4902', '5064', '4729', '5452']:
        pass
    elif i not in test_sents_id:
        train_sents.append((trans2format(syntenizer.sent_target.filter(id=lambda x: type(x) is int), syntenizer.sent_source)))
    else:
        test_sents.append((trans2format(syntenizer.sent_target.filter(id=lambda x: type(x) is int), syntenizer.sent_source)))

            
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True
)

crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)


m = MultiLabelBinarizer().fit(y_test)

print("F1-score is : {:.1%}".format(f1_score(m.transform(y_test),
         m.transform(y_pred),
         average='macro')))
print(classification_report(m.transform(y_test), m.transform(y_pred), target_names=m.classes_))

labels = list(crf.classes_)

loading training data to CRFsuite: 100%|███████████████████████████████████████████| 393/393 [00:00<00:00, 3119.25it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 33468
Seconds required: 0.060

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.10  loss=16364.03 active=33059 feature_norm=0.12
Iter 2   time=0.02  loss=15688.10 active=33042 feature_norm=0.13
Iter 3   time=0.02  loss=15053.14 active=33150 feature_norm=0.21
Iter 4   time=0.02  loss=14553.51 active=33108 feature_norm=0.28
Iter 5   time=0.02  loss=13933.78 active=33226 feature_norm=0.43
Iter 6   time=0.02  loss=13325.60 active=33231 feature_norm=0.55
Iter 7   time=0.02  loss=12625.46 active=33094 feature_norm=0.76
Iter 8   time=0.02  loss=12136.37 active=33121 feature_norm=0.96
Iter 9   time=0.02  loss=11683.83 active=33244 feature_norm=1.16
Iter 10  time=

  _warn_prf(average, modifier, msg_start, len(result))


# Кроссвалидация

In [64]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True,
    keep_tempfiles=None
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

rs.fit(X_train, y_train)

    
predictions = rs.predict(X_test[0])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


loading training data to CRFsuite: 100%|███████████████████████████████████████████| 393/393 [00:00<00:00, 3144.19it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 33468
Seconds required: 0.062

L-BFGS optimization
c1: 0.078174
c2: 0.116035
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.10  loss=16363.97 active=33078 feature_norm=0.12
Iter 2   time=0.02  loss=15688.02 active=33091 feature_norm=0.13
Iter 3   time=0.02  loss=15052.98 active=33176 feature_norm=0.21
Iter 4   time=0.02  loss=14553.31 active=33106 feature_norm=0.28
Iter 5   time=0.02  loss=13933.50 active=33261 feature_norm=0.43
Iter 6   time=0.02  loss=13325.21 active=33235 feature_norm=0.55
Iter 7   time=0.02  loss=12624.90 active=33118 feature_norm=0.76
Iter 8   time=0.02  loss=12135.60 active=33157 feature_norm=0.96
Iter 9   time=0.02  loss=11682.89 active=33258 feature_norm=1.16
Iter 10  time=

In [65]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)

m = MultiLabelBinarizer().fit(y_test)

print("F1-score is : {:.1%}".format(f1_score(m.transform(y_test),
         m.transform(y_pred),
         average='macro')))
print(classification_report(m.transform(y_test), m.transform(y_pred), target_names=m.classes_))

F1-score is : 75.8%
              precision    recall  f1-score   support

           _       1.00      1.00      1.00         5
         acl       0.76      0.76      0.76        25
       advcl       0.91      0.92      0.91        52
      advmod       0.94      0.92      0.93        37
 advmod:emph       1.00      1.00      1.00         6
        amod       0.82      0.92      0.87        25
       appos       0.80      0.62      0.70        13
         aux       0.86      0.94      0.90        32
        case       0.89      0.80      0.84        10
          cc       1.00      1.00      1.00         4
       ccomp       0.92      0.61      0.73        18
    compound       0.75      0.50      0.60        18
        conj       0.71      0.51      0.60        39
         cop       0.88      1.00      0.93         7
         det       1.00      0.62      0.76        13
   discourse       1.00      1.00      1.00         4
       fixed       0.00      0.00      0.00         3
       

  _warn_prf(average, modifier, msg_start, len(result))


In [66]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
5.347810 advmod   postag:ADV
4.918523 det      postag:DET
4.916751 aux      postag:AUX
4.666810 _        postag:_
4.611634 amod     postag:ADJ
4.049232 nummod   postag:NUM
3.611629 obl      word[-2:]:га
3.419329 case     postag:ADP
3.419078 cop      lemma.lower():э
3.147244 obl      word[-2:]:го
3.126445 obj      word[-2:]:дү
3.045331 ccomp    +1:lemma.lower():де
2.908759 advcl    word[-2:]:ып
2.840948 obj      word[-2:]:ды
2.762383 _        lemma.lower():_
2.739380 obl      word[-2:]:да
2.692968 obl      word[-2:]:ге
2.598093 advcl    word[-2:]:са
2.460780 advcl    word[-2:]:ап
2.424242 advcl    word[-2:]:еп
2.415259 cc       postag:CCONJ
2.407820 vocative 0_rus_deprel:vocative
2.363789 punct    postag:PUNCT
2.344148 obl      word[-2:]:ка
2.316544 obl      word[-3:]:ден
2.283101 nmod:poss word[-2:]:ын
2.229121 acl      word[-2:]:ан
2.170252 ccomp    +2:lemma.lower():де
2.157551 nsubj    postag:PROPN
2.149154 advcl    word[-2:]:уп

Top negative:
-1.012278 parataxis -1:pos

# Перенос синтаксической разметки

In [92]:
uas_score = []
las_score = []
clausal_tags = ['root', 'parataxis', 'csubj', 'xcomp', 'ccomp', 'advcl', 'acl', 'conj']

for i in test_sents_id:

    sentenizer = Synthesized_sent(map_ky_ru[i]['ru'], map_ky_ru[i]['ky'])
    sentenizer.fill_deprel()

    root = sentenizer.sent_source.to_tree()
    sentenizer.parse_rus_sent(root)
    sentenizer.add_dep_to_target()
    sentenizer.head_renumerate()

    sentenizer.fill_gapes()
    uas, las = sentenizer.cal_uas()
    uas_score.append(uas)
    las_score.append(las)
#     print(sentenizer.sent_target.serialize())


In [95]:
# UAS and LAS scores

np.mean(uas_score), np.mean(las_score)

(0.46169246710687056, 0.07856310078045521)

# Эксперименты со строковым разделением. Обучение модели

In [67]:
def word2features(sent, i):

    word = sent[i][0]
    lemma = sent[i][1]
    postag = sent[i][2]
    set_deprel = [x['deprel'] for x in sent[i][4]]
    id_row = sent[i][5]
    rus_deprel = [x['deprel'] for x in sent[i][4].filter(upos = postag)]
            
    features = {
#         'bias': 1.0,
        'word.lower()': word.lower(),
        'len(word)': len(word),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'postag': postag,
        'lemma.lower()': lemma.lower(),
#         'id_row': id_row,
        'id_word_in_row': i,
    }
    

    feat_transl = {str(i)+'_rus_deprel': x for i, x in enumerate(rus_deprel)}
    features.update(feat_transl)
    
    if i > 1:
        word1 = sent[i-2][0]
        lemma1 = sent[i-2][1]
        postag1 = sent[i-2][2]
        features.update({
            '-2:word.lower()': word1.lower(),
            '-2:len(word)': len(word1),
            '-2:postag': postag1,
            '-2:lemma.lower()': lemma1.lower(),
            '-2:id_word_in_row': i})
    
    if i > 0:
        word1 = sent[i-1][0]
        lemma1 = sent[i-1][1]
        postag1 = sent[i-1][2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:len(word)': len(word1),
            '-1:postag': postag1,
            '-1:lemma.lower()': lemma1.lower(),
            '-1:id_word_in_row': i
#             '-1:postag[:2]': postag1[:2],
        })
#     else:
#         features['BOS'] = True

    if i < len(sent)-2:
        word1 = sent[i+2][0]
        lemma1 = sent[i+2][1]
        postag1 = sent[i+2][2]
        features.update({
            '+2:word.lower()': word1.lower(),
            '+2:len(word)': len(word1),
            '+2:postag': postag1,
            '+2:lemma.lower()' : lemma1.lower(),
            '+2:id_word_in_row': i})

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        lemma1 = sent[i+1][1]
        postag1 = sent[i+1][2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '-1:len(word)': len(word1),
            '+1:postag': postag1,
            '+1:lemma.lower()' : lemma1.lower(),
            '+1:id_word_in_row': i

        })

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, lemma, postag, label, set_deprel, id_row in sent]

def sent2tokens(sent):
    return [token for token, lemma, postag, label, set_deprel, id_row in sent]



def trans2format_rows(sent_ky, sent_ru, i_row):
    new_sent = []   

    for word in sent_ky.filter(id=lambda x: type(x) is int):
        new_sent.append((word['form'], word['lemma'], word['upos'], word['deprel'], sent_ru, i_row))
        
    return new_sent


In [70]:
train_sents = []
test_sents = []

for i, ky_ru_sent in map_ky_ru.items():
    syntenizer = Synthesized_sent(map_ky_ru[i]['ru'], map_ky_ru[i]['ky'])
    
    for i_row, row in enumerate(syntenizer.rows_target):
        if i in ['4059', '4647', '4698', '4858', '4902', '5064', '4729', '5452']:
            pass
        elif i not in test_sents_id:
#             print(i)
            train_sents.append((trans2format_rows(row, syntenizer.rows_source[i_row], i_row)))
        else:
            
            test_sents.append((trans2format_rows(row, syntenizer.rows_source[i_row], i_row)))

            
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]


crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True
)

crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)


m = MultiLabelBinarizer().fit(y_test)

print("F1-score is : {:.1%}".format(f1_score(m.transform(y_test),
         m.transform(y_pred),
         average='macro')))
print(classification_report(m.transform(y_test), m.transform(y_pred), target_names=m.classes_))

labels = list(crf.classes_)
labels

loading training data to CRFsuite: 100%|████████████████████████████████████████| 1226/1226 [00:00<00:00, 15919.34it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26412
Seconds required: 0.042

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.03  loss=18001.20 active=26115 feature_norm=1.00
Iter 2   time=0.01  loss=13659.98 active=26018 feature_norm=0.78
Iter 3   time=0.01  loss=12936.63 active=25881 feature_norm=0.69
Iter 4   time=0.01  loss=12350.09 active=26226 feature_norm=0.82
Iter 5   time=0.01  loss=11935.93 active=26121 feature_norm=0.99
Iter 6   time=0.01  loss=11265.23 active=25703 feature_norm=1.35
Iter 7   time=0.01  loss=10767.31 active=25923 feature_norm=1.78
Iter 8   time=0.01  loss=10279.56 active=26048 feature_norm=2.31
Iter 9   time=0.02  loss=9639.45  active=26066 feature_norm=3.27
Iter 10  time=

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['obl',
 'root',
 'nsubj',
 'appos',
 'punct',
 'conj',
 'advmod',
 'advcl',
 'ccomp',
 'obj',
 'amod',
 'nmod',
 'det',
 'vocative',
 'parataxis',
 'nmod:poss',
 '_',
 'aux',
 'compound',
 'advmod:emph',
 'nummod',
 'acl',
 'cc',
 'case',
 'xcomp',
 'cop',
 'fixed',
 'csubj',
 'discourse']

In [71]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True,
    keep_tempfiles=None
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

rs.fit(X_train, y_train)

    
predictions = rs.predict(X_test[0])

Fitting 3 folds for each of 50 candidates, totalling 150 fits


loading training data to CRFsuite: 100%|████████████████████████████████████████| 1226/1226 [00:00<00:00, 15521.42it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26412
Seconds required: 0.042

L-BFGS optimization
c1: 0.138868
c2: 0.014639
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.03  loss=18002.07 active=25919 feature_norm=1.00
Iter 2   time=0.01  loss=13660.57 active=26041 feature_norm=0.78
Iter 3   time=0.02  loss=12937.55 active=25854 feature_norm=0.69
Iter 4   time=0.01  loss=12351.07 active=26187 feature_norm=0.82
Iter 5   time=0.02  loss=11937.31 active=26096 feature_norm=0.99
Iter 6   time=0.01  loss=11267.43 active=25624 feature_norm=1.35
Iter 7   time=0.02  loss=10770.48 active=25836 feature_norm=1.78
Iter 8   time=0.02  loss=10284.26 active=25914 feature_norm=2.31
Iter 9   time=0.02  loss=9646.58  active=25929 feature_norm=3.26
Iter 10  time=

In [72]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)

m = MultiLabelBinarizer().fit(y_test)

print("F1-score is : {:.1%}".format(f1_score(m.transform(y_test),
         m.transform(y_pred),
         average='macro')))
print(classification_report(m.transform(y_test), m.transform(y_pred), target_names=m.classes_))

F1-score is : 68.8%
              precision    recall  f1-score   support

           _       1.00      1.00      1.00         6
         acl       0.80      0.69      0.74        35
       advcl       0.72      0.80      0.76        81
      advmod       0.93      0.88      0.90        48
 advmod:emph       1.00      1.00      1.00         8
        amod       0.72      0.93      0.81        30
       appos       0.78      0.47      0.58        15
         aux       0.87      0.83      0.85        41
        case       0.83      0.77      0.80        13
          cc       1.00      1.00      1.00         4
       ccomp       0.83      0.76      0.79        25
    compound       0.56      0.47      0.51        19
        conj       0.60      0.47      0.53        72
         cop       0.90      1.00      0.95         9
         det       1.00      0.69      0.82        13
   discourse       1.00      0.75      0.86         4
       fixed       0.00      0.00      0.00         3
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
8.493134 det      postag:DET
8.332186 _        postag:_
7.708944 aux      postag:AUX
6.954878 cop      lemma.lower():э
6.648028 advmod   postag:ADV
5.965671 case     postag:ADP
5.939333 amod     postag:ADJ
5.807218 nummod   postag:NUM
5.632895 cc       postag:CCONJ
5.318930 obl      word[-2:]:го
5.222256 punct    postag:PUNCT
4.975901 punct    0_rus_deprel:punct
4.917686 ccomp    +1:lemma.lower():де
4.915084 obl      word[-3:]:ден
4.879851 vocative 0_rus_deprel:vocative
4.848275 compound +1:lemma.lower():мылтык
4.753372 obl      word[-2:]:га
4.737401 obj      word[-2:]:дү
4.613810 compound +1:word.lower():бери
4.509783 conj     word.lower():кирейин
4.476326 parataxis lemma.lower():каш
4.420366 ccomp    word.lower():кебиңди
4.380569 obl      word[-2:]:ге
4.233163 obl      word[-2:]:да
4.148691 advcl    +1:word.lower():албай
4.139104 obj      word.lower():аны
4.132528 vocative lemma.lower():арбак
4.081854 advcl    word[-2:]:са
4.071219 compound -1:word.lower():чабдарга
3.97