In [2]:
#!/usr/bin/env python
# coding: utf8
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from tqdm.auto import tqdm

In [18]:
# Pick which on-disk checkpoint to load: the default branch loads '../../ru2_ner' via
# spacy.load; setting USE_RU2 = True loads '../../ru2_pos_dep_stemming' via ru2e instead.
USE_RU2 = False
if not USE_RU2:
    nlp = spacy.load('../../ru2_ner')
else:
    # ru2e is only needed for this branch, so it is imported lazily.
    import ru2e
    nlp = ru2e.load_ru2('../../ru2_pos_dep_stemming')
# Last expression of the cell: lists the (name, component) pairs of the loaded pipeline.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f3aa716d390>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f3a797adac8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f3a797addc8>)]

In [19]:
from spacy.gold import GoldCorpus

In [20]:
# Training/runtime settings. init_model() expands its CFG argument into
# model.begin_training(**CFG); presumably this dict is what gets passed there — confirm.
CFG = {'device': 0, 'cpu_count': 4}
# NOTE(review): TESTS is not read by any cell visible in this notebook — confirm it is still needed.
TESTS = False
# Last expression of the cell, so its return value is rendered as the cell output.
spacy.require_gpu()

True

In [16]:
# SynTagRus treebank in spaCy's JSON format: the train file is passed first, the test file second.
g = GoldCorpus(
    '../../data/UD_Russian-SynTagRus/ru_syntagrus-ud-train.json',
    '../../data/UD_Russian-SynTagRus/ru_syntagrus-ud-test.json',
)

In [7]:
from utils.corpus import Corpus, tag_morphology, iter_corpora

In [8]:
# Wrap the GoldCorpus `g` in the project's Corpus helper. evaluate_data_source() later reads
# the `.ds_test` attribute of this object. 'ru' is presumably the language code — confirm
# against utils.corpus.
# NOTE(review): this cell's execution count is lower than that of the cell defining `g`,
# so re-check that the notebook survives Restart & Run All.
SynTagRus = Corpus.from_gold('ru', g)

In [21]:
from utils.pluck import pluck, pluck_list, pluck_dict
from utils.tqdm import tqdm_batches

In [22]:
import spacy
import random
from spacy.gold import GoldParse
from spacy.scorer import Scorer
import pandas
# Render floats in displayed DataFrames (the score tables below) with 3 decimal places.
pandas.set_option('display.precision', 3) 

def init_model(model, CFG):
    """Call ``model.begin_training(**CFG)`` once if any pipe is still uninitialized.

    A pipeline component counts as uninitialized when its ``model`` attribute is
    exactly ``True`` (the placeholder spaCy v2 pipes carry before their weights
    are allocated). Components that have no ``model`` attribute at all, such as
    plain function components, are skipped instead of raising ``AttributeError``.

    Args:
        model: Object exposing ``pipeline`` (iterable of ``(name, component)``
            pairs) and ``begin_training(**kwargs)``.
        CFG: Mapping of keyword arguments forwarded to ``begin_training``.

    Returns:
        None. Prints a one-line notice naming the component that triggered
        initialization.
    """
    for name, component in model.pipeline:
        # getattr keeps the scan working for components without a `.model` attribute.
        if getattr(component, 'model', None) is True:
            print(f"Initializing model because of {name}!")
            model.begin_training(**CFG)
            # begin_training covers the whole pipeline, so one call is enough.
            break

def _evaluate(model, batches):
    """Score ``model`` against gold annotations, one batch at a time.

    Args:
        model: Pipeline exposing ``pipe(texts)``, which yields one processed
            doc per input text.
        batches: Iterable of batches; each batch is a non-empty sequence of
            ``(doc, gold)`` pairs where ``doc`` has a ``.text`` attribute.

    Returns:
        The ``scores`` attribute of the ``Scorer`` after every pair was scored.
    """
    scorer = Scorer()
    for pairs in batches:
        # Split the (doc, gold) pairs into two parallel tuples.
        source_docs, gold_parses = zip(*pairs)
        # Re-run the pipeline on the raw text so predictions come from `model`.
        raw_texts = [source.text for source in source_docs]
        predictions = model.pipe(raw_texts)
        for predicted, gold in zip(predictions, gold_parses):
            scorer.score(predicted, gold)
    return scorer.scores

def evaluate(model, dataset, limit=None, batch_size=32):
    """Evaluate ``model`` on ``dataset`` with a progress bar.

    Args:
        model: Pipeline to score; also handed to ``dataset.iter``.
        dataset: Object supporting ``iter(model, limit=...)`` and ``len()``.
        limit: Forwarded to ``dataset.iter``. When falsy, ``len(dataset)`` is
            used as the progress-bar total instead.
        batch_size: Number of examples per minibatch.

    Returns:
        Whatever ``_evaluate`` returns for the batched examples.
    """
    examples = dataset.iter(model, limit=limit)
    batched = minibatch(examples, batch_size)
    expected_total = limit or len(dataset)
    progress = tqdm_batches(batched, total=expected_total, leave=False)
    return _evaluate(model, progress)

def evaluate_data_source(model, ds, count=None, batch_size=32):
    """Evaluate ``model`` on the test split of a data source.

    Args:
        model: Pipeline to score.
        ds: Data source exposing a ``ds_test`` attribute (the test dataset).
        count: Optional cap on the number of examples, passed to ``evaluate``
            as ``limit``.
        batch_size: Minibatch size, passed through to ``evaluate``.

    Returns:
        The result of ``evaluate`` for ``ds.ds_test``.
    """
    # batch_size used to be accepted but silently dropped; forward it explicitly.
    return evaluate(model, ds.ds_test, limit=count, batch_size=batch_size)

def get_ent_scores(res, ents=None):
    """Return the per-entity-type scores restricted to a set of labels.

    Args:
        res: Score dict containing an ``'ents_per_type'`` mapping of
            label -> scores.
        ents: Container of labels to keep. When omitted, the function falls
            back to ``ds.ents`` from a module-level ``ds`` (the original
            behavior), which raises ``NameError`` if no such global exists.

    Returns:
        A dict with the entries of ``res['ents_per_type']`` whose label is in
        ``ents``.
    """
    if ents is None:
        # Legacy path: relies on a global `ds` that must have been defined by another cell.
        ents = ds.ents
    return {label: score for label, score in res['ents_per_type'].items() if label in ents}

def display_scores(list_of_scores):
    """Render score records as a DataFrame, one row per record.

    Args:
        list_of_scores: Either a single score dict or a list of score dicts;
            a lone dict is treated as a one-element list.
    """
    records = [list_of_scores] if isinstance(list_of_scores, dict) else list_of_scores
    table = pandas.DataFrame.from_records(records)
    display(table)

def display_scores_t(list_of_scores):
    """Render score records as a transposed DataFrame, one column per record.

    Args:
        list_of_scores: Either a single score dict or a list of score dicts;
            a lone dict is treated as a one-element list.
    """
    records = [list_of_scores] if isinstance(list_of_scores, dict) else list_of_scores
    table = pandas.DataFrame.from_records(records)
    # Transpose so metrics become rows and each record becomes a column.
    display(table.T)

# Score the loaded pipeline on SynTagRus' test split (evaluate_data_source reads ds.ds_test)
# with no example cap, then show the resulting score dict as a one-row table.
scores = evaluate_data_source(nlp, SynTagRus, count=None, batch_size=32)
display_scores(scores)

HBox(children=(FloatProgress(value=0.0, max=6491.0), HTML(value='')))



Unnamed: 0,uas,las,ents_p,ents_r,ents_f,ents_per_type,tags_acc,token_acc
0,88.537,84.996,0.0,0.0,0.0,{},92.932,98.161


In [17]:
# NOTE(review): this cell repeats the preceding evaluation cell verbatim (same call, same
# displayed scores) and overwrites `scores`; consider removing one of the two.
scores = evaluate_data_source(nlp, SynTagRus, count=None, batch_size=32)
display_scores(scores)

HBox(children=(FloatProgress(value=0.0, max=6491.0), HTML(value='')))



Unnamed: 0,uas,las,ents_p,ents_r,ents_f,ents_per_type,tags_acc,token_acc
0,88.537,84.996,0.0,0.0,0.0,{},92.932,98.161


In [23]:
# Qualitative check: run the pipeline on one Russian sentence and render it with
# displaCy's entity style ('ent').
spacy.displacy.render(nlp('20 декабря 2019 года на улице Советской, город Новосибирск, мы с Сашей пошли гулять'), 
                      style='ent')