In [1]:
#!/usr/bin/env python
# coding: utf8
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from tqdm.auto import tqdm

In [2]:
from spacy.gold import GoldCorpus

In [3]:
# Keyword arguments that are later splatted into begin_training / resume_training.
CFG = {'device': 1, 'cpu_count': 4}
# Switch for the optional smoke-test blocks guarded by `if TESTS or 0:` further down.
TESTS = False
# Fail early if no GPU is usable (the recorded run returned True).
spacy.require_gpu()

True

In [4]:
import gzip, json
def load_entries(fn):
    """Read a gzip-compressed JSON-lines file and return its records as a list.

    Args:
        fn: path of the archive, e.g. '../data/datasets/nerus.jsonl.gz'.

    Returns:
        A list with one decoded JSON value per line, in file order.
    """
    with gzip.open(fn, 'r') as stream:
        # tqdm only wraps the line iterator to report progress.
        return [json.loads(raw) for raw in tqdm(stream)]

In [5]:
# Gold-standard corpus built from the SynTagRus train and test JSON files.
g=GoldCorpus('../../data/UD_Russian-SynTagRus/ru_syntagrus-ud-train.json', 
             '../../data/UD_Russian-SynTagRus/ru_syntagrus-ud-test.json')

In [6]:
# Presumably lifts any cap on the number of documents the corpus yields - TODO confirm.
g.limit = None

In [7]:
# Throw-away blank Russian pipeline, used only to peek at the corpus.
nlp_ = spacy.blank('ru')
# First (doc, gold parse) pair from the training split; `d` is reused in a later demo cell.
d, gp = next(g.train_docs(nlp_))
# Show the gold tags of that first document.
gp.tags

['ADP___',
 'NUM___',
 'NOUN__Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing',
 'ADP___',
 'PROPN__Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing',
 'VERB__Aspect=Perf|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Mid',
 'ADP___',
 'NOUN__Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing',
 'ADJ__Case=Nom|Degree=Pos|Gender=Fem|Number=Sing',
 'NOUN__Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing',
 'NOUN__Animacy=Inan|Case=Gen|Gender=Fem|Number=Plur',
 'PUNCT___',
 'ADJ__Case=Nom|Degree=Pos|Gender=Masc|Number=Sing',
 'NOUN__Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing',
 'NOUN__Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing',
 'ADP___',
 'NOUN__Animacy=Inan|Case=Ins|Gender=Neut|Number=Sing',
 'ADP___',
 'NOUN__Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing',
 'NOUN__Animacy=Inan|Case=Gen|Gender=Neut|Number=Plur',
 'NOUN__Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing',
 'PUNCT___']

In [8]:
# Print only the first item yielded by train_docs, then stop.
for i in g.train_docs(nlp_):
    print(i)
    break

(С. ГРИНФИЛД , директор Королевского института Великобритании . , <spacy.gold.GoldParse object at 0x7f1f3ccb9480>)


In [9]:
from importlib import reload
import utils.corpus
# Re-import utils.corpus so edits to the helper module are picked up without restarting the kernel.
reload(utils.corpus)

<module 'utils.corpus' from '/Projects/nlp/spacy/spacy-ru/utils/corpus.py'>

In [10]:
from utils.corpus import Corpus, tag_morphology, iter_corpora

In [11]:
# Wrap the gold corpus in the project's Corpus helper for language 'ru'.
SynTagRus = Corpus.from_gold('ru', g)

In [12]:
# Sanity check: iter_corpora yields two items for a single corpus
# (presumably its train and test datasets - TODO confirm in utils.corpus).
assert len(list(iter_corpora([SynTagRus]))) == 2

In [13]:
# Sizes for the train and test splits, one line each:
# dataset length, then the sizes of the pos, dep and ner collections.
print(len(SynTagRus.ds_train), len(SynTagRus.ds_test))
print(len(SynTagRus.ds_train.pos), len(SynTagRus.ds_test.pos))
print(len(SynTagRus.ds_train.dep), len(SynTagRus.ds_test.dep))
print(len(SynTagRus.ds_train.ner), len(SynTagRus.ds_test.ner))

48814 6491
733 608
294 39
0 0


In [14]:
# Choose the starting pipeline: the ru2e model from disk, or a blank Russian pipeline.
USE_RU2 = False
if USE_RU2:
    # Imported lazily so ru2e is only required when this path is taken.
    import ru2e
    nlp = ru2e.load_ru2('../../ru2e/')
else:
    nlp = spacy.blank('ru')
# Display the components of the chosen pipeline (empty in the recorded run).
nlp.pipeline

[]

In [15]:
# Empty the tag map in place; setup_tagger below repopulates it
# (the setup cell's recorded output reports 745 entries afterwards).
nlp.vocab.morphology.tag_map.clear()
# Display it to confirm it is now empty.
nlp.vocab.morphology.tag_map

{}

In [16]:
def add_submodel(nlp, name, rebuild=True):
    """Return the pipeline component called ``name``, creating it if needed.

    When ``rebuild`` is true and a component with that name is already in the
    pipeline, it is removed first so that a new one is created in its place.

    Args:
        nlp: pipeline object exposing ``pipe_names``, ``remove_pipe``,
            ``create_pipe``, ``add_pipe`` and ``get_pipe``.
        name: component name, e.g. 'tagger'.
        rebuild: replace an existing component instead of reusing it.

    Returns:
        The component registered under ``name`` after the call.
    """
    if rebuild and name in nlp.pipe_names:
        # remove_pipe drops the component for good. disable_pipes returns a
        # handle meant to restore the component later, which is not wanted here.
        nlp.remove_pipe(name)
    if name not in nlp.pipe_names:
        print("Creating new submodel for {}...".format(name))
        nlp.add_pipe(nlp.create_pipe(name))
    return nlp.get_pipe(name)

In [17]:
def setup_ner(nlp, *corpora, rebuild=True):
    """Ensure an 'ner' component exists and register every entity label on it.

    Labels are taken from the ``ents`` attribute of each item produced by
    ``iter_corpora(corpora)``.
    """
    component = add_submodel(nlp, 'ner', rebuild=rebuild)
    every_label = (label
                   for corpus in iter_corpora(corpora)
                   for label in corpus.ents)
    for label in every_label:
        component.add_label(label)

def setup_tagger(nlp, *corpora, rebuild=True):
    """Ensure a 'tagger' component exists and register every tag on it.

    Each tag found in the ``pos`` attribute of the items produced by
    ``iter_corpora(corpora)`` is added together with ``tag_morphology(tag)``.
    The resulting number of tagger labels is printed.
    """
    tagger = add_submodel(nlp, 'tagger', rebuild=rebuild)
    for corpus in iter_corpora(corpora):
        for tag in corpus.pos:
            features = tag_morphology(tag)
            tagger.add_label(tag, features)
    label_count = len(tagger.labels)
    print(label_count)

def setup_parser(nlp, *corpora, rebuild=True):
    """Ensure a 'parser' component exists and register every dependency label.

    Labels come from the ``dep`` attribute of each item produced by
    ``iter_corpora(corpora)``. The resulting number of parser labels is printed.
    """
    parser = add_submodel(nlp, 'parser', rebuild=rebuild)
    every_label = (label
                   for corpus in iter_corpora(corpora)
                   for label in corpus.dep)
    for label in every_label:
        parser.add_label(label)
    label_count = len(parser.labels)
    print(label_count)

# Build fresh tagger and parser components and load their label sets from SynTagRus.
setup_tagger(nlp, SynTagRus, rebuild=True)
setup_parser(nlp, SynTagRus, rebuild=True)
# NER is deliberately left out of this run.
#setup_ner(nlp, SynTagRus, rebuild=True)
# Report how many entries the morphology tag map holds after tagger setup.
print(f"Morphology tag map: {len(nlp.vocab.morphology.tag_map)}")
# Display the assembled pipeline.
nlp.pipeline

Creating new submodel for tagger...
745
Creating new submodel for parser...
294
Morphology tag map: 745


[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f1f39d28278>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f1f38e309a8>)]

In [18]:
# Re-assigns the TESTS switch set near the top of the notebook; a falsy value keeps
# the `if TESTS or 0:` smoke-test blocks below from running.
TESTS = 0

In [19]:
from utils.pluck import pluck, pluck_list, pluck_dict
from utils.tqdm import tqdm_batches

In [20]:
from IPython.core.debugger import set_trace

In [21]:
import spacy
import random
from spacy.gold import GoldParse
from spacy.scorer import Scorer
import pandas
# Set pandas' display precision to 3 so the score tables stay compact.
pandas.set_option('display.precision', 3) 

def init_model(model, CFG):
    """Call ``model.begin_training(**CFG)`` once if any component needs it.

    A component "needs it" when its ``model`` attribute is literally ``True``
    (presumably spaCy's placeholder for a not-yet-allocated model - confirm).
    Only the first such component is reported; nothing happens if none is found.
    """
    nothing = object()
    culprit = next(
        (name for name, component in model.pipeline if component.model is True),
        nothing,
    )
    if culprit is not nothing:
        print(f"Initializing model because of {culprit}!")
        model.begin_training(**CFG)

def _evaluate(model, batches):
    """Score ``model`` on batches of (document, gold parse) pairs.

    For each batch the text of every document is passed through ``model.pipe``
    and each resulting doc is scored against its gold parse.

    Returns:
        The ``scores`` of the ``Scorer`` accumulated over all batches.
    """
    scorer = Scorer()
    for batch in batches:
        sources, references = zip(*batch)
        raw_texts = [source.text for source in sources]
        predictions = model.pipe(raw_texts)
        for predicted, reference in zip(predictions, references):
            scorer.score(predicted, reference)
    return scorer.scores

def evaluate(model, dataset, limit=None, batch_size=32):
    """Evaluate ``model`` on ``dataset`` and return the scorer results.

    Args:
        model: pipeline handed to ``dataset.iter`` and ``_evaluate``.
        dataset: object exposing ``iter(model, limit=...)`` and ``__len__``.
        limit: optional cap on the number of items; also the progress total.
        batch_size: batch size passed to ``minibatch``.
    """
    items = dataset.iter(model, limit=limit)
    chunks = minibatch(items, batch_size)
    # Fall back to the dataset length when no (truthy) limit was given.
    expected = limit if limit else len(dataset)
    progress = tqdm_batches(chunks, total=expected, leave=False)
    return _evaluate(model, progress)

def evaluate_data_source(model, ds, count=None, batch_size=32):
    """Evaluate ``model`` on the test split of a corpus.

    Args:
        model: pipeline to evaluate.
        ds: corpus object exposing a ``ds_test`` dataset.
        count: optional cap on the number of test items.
        batch_size: evaluation batch size, forwarded to ``evaluate``.

    Returns:
        Whatever ``evaluate`` returns for ``ds.ds_test``.
    """
    # batch_size used to be accepted but dropped, so callers could not change it.
    return evaluate(model, ds.ds_test, limit=count, batch_size=batch_size)

def get_ent_scores(res, ents=None):
    """Extract per-entity-type scores from a scorer result dict.

    Args:
        res: mapping that contains an 'ents_per_type' dict keyed by label.
        ents: optional iterable of labels to keep. When omitted, every
            entity type present in ``res`` is returned.

    Returns:
        A new dict mapping each kept label to its score entry.

    Raises:
        KeyError: if ``res`` has no 'ents_per_type' key.
    """
    per_type = res['ents_per_type']
    if ents is None:
        return dict(per_type)
    # The label filter is now an argument; the old body read an undefined
    # global `ds`, so every call raised NameError.
    wanted = set(ents)
    return {label: score for label, score in per_type.items() if label in wanted}

def display_scores(list_of_scores):
    """Render score records as a pandas table, one row per record.

    A single dict is treated as a one-element list of records.
    """
    records = [list_of_scores] if isinstance(list_of_scores, dict) else list_of_scores
    table = pandas.DataFrame.from_records(records)
    display(table)

def display_scores_t(list_of_scores):
    """Render score records as a transposed pandas table, one column per record.

    A single dict is treated as a one-element list of records.
    """
    records = [list_of_scores] if isinstance(list_of_scores, dict) else list_of_scores
    table = pandas.DataFrame.from_records(records)
    display(table.T)

# Optional smoke test of the evaluation helpers; skipped while TESTS is falsy
# (the literal `or 0` looks like a manual toggle - change it to 1 to force a run).
if TESTS or 0:
    init_model(nlp, CFG)
    scores = evaluate_data_source(nlp, SynTagRus, count=None, batch_size=32)
    display_scores(scores)

In [22]:
import numpy

def get_other_pipes(nlp, *x):
    """Return the names in ``nlp.pipe_names`` that are not listed in ``x``.

    The original pipeline order is preserved.
    """
    others = []
    for pipe_name in nlp.pipe_names:
        if pipe_name in x:
            continue
        others.append(pipe_name)
    return others

def _train_epoch(model, labels, batches):
    """Train the tagger and parser on ``batches`` and summarise the losses.

    Every other pipeline component is disabled for the duration of the pass.
    ``labels`` is accepted but not used by the current body.

    Args:
        model: pipeline to update.
        labels: unused.
        batches: iterable of batches; each batch is a sequence of
            (text-or-doc, annotation) pairs.

    Returns:
        A dict with ``'loss_<name>'`` holding log(1e-10 + loss / n_docs) for
        every entry in the losses dict, plus ``'docs'`` with the number of
        items seen.
    """
    init_model(model, CFG)
    frozen = get_other_pipes(model, 'tagger', 'parser')
    with model.disable_pipes(*frozen):
        print("Training only:", model.pipe_names)
        sgd = model.resume_training(**CFG)
        running = {}
        seen = 0
        for chunk in batches:
            inputs, targets = zip(*chunk)
            model.update(inputs, targets, drop=0.2, losses=running, sgd=sgd)
            seen += len(chunk)
        summary = {}
        for key, total in running.items():
            summary['loss_' + key] = numpy.log(1e-10 + (total / seen))
        summary['docs'] = seen
    return summary

def train_epoch(model, ds, batch_size, count=None):
    """Run one training pass over ``ds`` and return the epoch summary.

    Args:
        model: pipeline passed to ``ds.iter`` and ``_train_epoch``.
        ds: dataset exposing ``iter(model, limit=...)``, ``ner`` and ``__len__``.
        batch_size: batch size specification handed to ``minibatch`` as
            ``size`` (the notebook passes a ``compounding`` generator).
        count: optional cap on the number of documents; also used as the
            progress-bar total (falls back to ``len(ds)``).

    Returns:
        The dict produced by ``_train_epoch``.
    """
    # Honour the caller's batch_size. The old body read the module-level
    # `size_` instead, silently ignoring the argument (and raising NameError
    # when that global was not defined).
    batches = minibatch(ds.iter(model, limit=count), size=batch_size)
    return _train_epoch(model, ds.ner, tqdm_batches(batches, total=count or len(ds)))

# Optional smoke test: train on 1000 documents with a compounding batch-size
# schedule, then show the epoch summary. Skipped while TESTS is falsy.
if TESTS or 0:
    # train_epoch, as written in this cell, reads the module-level `size_`.
    size_ = compounding(1., 32., 1.001)
    meta = train_epoch(nlp, SynTagRus.ds_train, batch_size=size_, count=1000)
    display_scores(meta)

# Optional follow-up: evaluate on the full test split.
if TESTS or 0:
    display_scores(evaluate_data_source(nlp, SynTagRus))    

In [23]:
# Optional manual scoring loop over the first 1000 test items, one document at a
# time (no batching). Skipped while TESTS is falsy.
if TESTS or 0:
    scorer = spacy.scorer.Scorer()
    for doc, parse in tqdm(SynTagRus.ds_test.iter(nlp, limit=1000), total=1000):
        # Re-run the pipeline on the raw text so predictions come from `nlp`.
        doc = nlp(doc.text)
        #explacy.print_parse_info(nlp, doc.text)
        #parse.tags = [t.split('__', 1)[0] for t in parse.tags]
        #print([(dt.tag_, pt) for dt,pt in zip(doc, parse.tags)])
        scorer.score(doc, parse, verbose=False)
    display_scores(scorer.scores)
    #display_scores(evaluate_data_source(nlp, SynTagRus, count=1000))

In [24]:
import spacy
from notebooks.examples import explacy
# Optional parse visualisation for a sample sentence and for `d`, the first
# training document fetched near the top of the notebook. Skipped while TESTS is falsy.
if TESTS or 0:
    doc = nlp(u'Жизнь после двух тысяч первого года на Марсе стала сложней.')
    explacy.print_parse_info(nlp, doc.text)
    explacy.print_parse_info(nlp, d.text)

In [25]:
# Corpora that the training loop below iterates over for both training and evaluation.
CORPORA = [SynTagRus]

In [26]:
# Main training run: 60 epochs, each followed by evaluation on the test split.
g.limit = None
# One batch-size schedule object is created here and passed to every epoch.
size_ = compounding(4., 32., 1.0005)
for e in tqdm(range(60)):
    # Row of results for this epoch (1-based epoch number).
    res = {'epoch': e+1}
    for c in CORPORA:
        # Each epoch is capped at half the size of the training split.
        meta = train_epoch(nlp, c.ds_train, batch_size=size_, count=len(c.ds_train)//2)
        res.update(meta)
    for c in CORPORA:
        # Merge evaluation scores into the same row; with several corpora,
        # identical keys would overwrite each other.
        res.update(evaluate_data_source(nlp, c, count=None))
    display_scores(res)

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))

Initializing model because of tagger!
Training only: ['tagger', 'parser']


HBox(children=(FloatProgress(value=0.0, max=24407.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
import spacy.displacy
def view_example(nlp, s):
    """Print one annotated sample and render the model's entities for it.

    Args:
        nlp: callable pipeline applied to the sample text.
        s: mapping with 'raw' (the text) and 'entities', an iterable of
            (start, end, label) triples indexing into the text.
    """
    print('Text:', s['raw'])
    parsed = nlp(s['raw'])
    expected = []
    for start, end, label in s['entities']:
        expected.append((s['raw'][start:end], label, start, end))
    print("Expected:", expected)
    spacy.displacy.render(parsed, style='ent')

# NOTE(review): enable_all_entities, NERUS and KR are not defined anywhere in the
# visible part of this notebook, and this cell has no execution count. It looks like
# a leftover from an NER notebook and would raise NameError on a fresh kernel - confirm.
enable_all_entities(nlp)

# Show the first two NERUS test samples and the first three KR test samples.
for s in NERUS.ds_test[:2]:
    view_example(nlp, s)
for s in KR.ds_test[:3]:
    view_example(nlp, s)

In [84]:
from pathlib import Path
# save model to output directory
def save_model(nlp, output_dir):
    """Serialise ``nlp`` to ``output_dir``, creating the directory if needed.

    Args:
        nlp: object exposing ``to_disk(path)``.
        output_dir: target directory as ``str`` or ``Path``.
    """
    output_dir = Path(output_dir)
    # parents=True lets nested targets work; exist_ok=True removes the
    # check-then-create race of the previous exists()/mkdir() pair.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)
# Persist the trained tagger and parser pipeline (path is relative to the notebook's directory).
save_model(nlp, '../../ru2_pos_dep')

Saved model to ../../ru2_pos_dep


In [None]:
# NOTE(review): this hands the tokenizer a list of words rather than a string, and the
# cell has no execution count - confirm the tokenizer in use accepts a list.
nlp.tokenizer('приветы всем'.split())

In [None]:
# Render the entity view for a sample sentence.
# NOTE(review): the pipeline assembled above contains only 'tagger' and 'parser';
# without an NER component the entity view will presumably be empty - confirm.
spacy.displacy.render(nlp('20 декабря 2019 года на улице Советской, город Новосибирск, мы с Сашей пошли гулять'), 
                      style='ent')