In [1]:
import spacy

In [44]:
nlp = spacy.blank('ru')

In [45]:
ner = nlp.create_pipe('ner')

In [46]:
nlp.add_pipe(ner, last=True)

In [47]:
ner.add_label('codex')
ner.add_label('person')
ner.add_label('term')

In [48]:
optimizer = nlp.begin_training()

In [49]:
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse

In [50]:
import json
from pathlib import Path
examples = json.loads(Path('../data/example.json').read_text())
str(examples)[:100]

"[{'id': 3, 'text': 'Маквивальевич здравствуйте удобно разговаривать по вопросу по поводу регистрации"

In [69]:
for e in range(4):
    for batch in minibatch([e for e in examples if e['labels']], size=4):
        print([t['labels'] for t in batch])
        docs = [nlp(t['text']) for t in batch]
        goldparses = [GoldParse(d,labels=t['labels']) for d, t in zip(docs, batch)]
        losses = {}
        nlp.update(docs, goldparses, drop=0.5, losses=losses, sgd=optimizer)

[[[69, 80, 'term'], [104, 106, 'term'], [131, 140, 'term'], [147, 168, 'term'], [196, 218, 'term'], [433, 445, 'term'], [576, 670, 'codex'], [797, 804, 'term'], [858, 875, 'term'], [981, 993, 'term'], [1001, 1008, 'term'], [1058, 1065, 'term'], [1093, 1102, 'term'], [1170, 1173, 'term'], [1211, 1227, 'term'], [1455, 1467, 'term'], [1504, 1511, 'term'], [1513, 1521, 'term'], [1528, 1538, 'term'], [1655, 1672, 'term'], [0, 13, 'person'], [236, 248, 'person'], [331, 340, 'term'], [398, 407, 'term'], [565, 571, 'codex'], [515, 530, 'codex'], [885, 897, 'person'], [724, 736, 'person'], [754, 766, 'person'], [1684, 1716, 'person'], [1720, 1748, 'person'], [1749, 1795, 'person'], [1807, 1817, 'person'], [1925, 1938, 'person'], [2172, 2176, 'person'], [2197, 2206, 'term'], [2232, 2241, 'term'], [2326, 2335, 'term'], [2364, 2376, 'term'], [2444, 2454, 'person'], [2482, 2492, 'person'], [2547, 2562, 'codex'], [2583, 2591, 'term'], [2609, 2631, 'term']], [[0, 15, 'person'], [146, 154, 'person'], 

In [70]:
doc=nlp('Валентин Петрович здравствуйте! Маквивальевич здравствуйте')
doc, doc.ents

(Валентин Петрович здравствуйте! Маквивальевич здравствуйте, ())

In [71]:
len(doc.text), doc[-1].idx, doc[-1], doc[-1].idx+len(doc[-1].text)

(58, 46, здравствуйте, 58)

In [80]:
from collections import Counter
ner = nlp.get_pipe('ner')
docs = [nlp.make_doc(t['text']) for t in examples]
beams = ner.beam_parse(docs, beam_width=16)
r = []
for doc, beam in zip(docs, beams):
    entities = ner.moves.get_beam_annot(beam)
    #print(entities)
    #toks = {t.idx: t for t in doc}
    #lens = {t.idx: doc[t.i+1].idx - t.idx for t in doc[:-1]}
    #lens[doc[-1].idx] = len(doc.text) - doc[-1].idx
    words = Counter()
    for e,v in entities.items():
        estart, eend, etype = e
        if eend == estart + 1:
            #print(estart, eend, etype)
            continue
        #print(estart, eend, len(doc))
        etype = doc.vocab.strings[etype]
        
        #while estart>0 and estart not in toks:
        #    estart -= 1
        #if estart not in toks: 
        #    continue
        words[estart, eend, etype] = v # / lens[estart]

    words_items = sorted(words.items(), key=lambda x: (-x[1], x[0]))
    for (estart, eend, etype), escore in words_items:
        r.append((doc[estart:eend].text, etype, escore))
        #print(e[0], e[1], nlp.vocab.strings[e[2]], v)
    #break

In [81]:
import pandas
df=pandas.DataFrame(r, columns=['text', 'type', 'score'])
df[df.type != 'codex']

Unnamed: 0,text,type,score
1,на наседания,person,0.0625
2,на наседания,term,0.0625
4,всего доброгостране,person,0.0625
5,всего доброгостране,term,0.0625
7,на свидания,person,0.0625
8,на свидания,term,0.0625
10,перезвоним лу,person,0.0625
11,перезвоним лу,term,0.0625
13,поняла павсемчитание,person,0.0625
14,поняла павсемчитание,term,0.0625
