In [1]:
import stanza

In [2]:
import torch

In [3]:
# Check whether PyTorch can see a CUDA device before the pipeline is built.
torch.cuda.is_available()

True

In [4]:
!ls -all ../saved_models/ner

total 191648
drwxr-xr-x 2 captain captain     4096 июн 25 14:52 .
drwxr-xr-x 3 captain captain     4096 июн  5 13:12 ..
-rw-r--r-- 1 captain captain 49059548 июн 16 19:56 ru_kasha_nertagger.pt
-rw-r--r-- 1 captain captain 49059561 июн 21 13:29 ru_pampers_kasha_pure_nertagger.pt
-rw-r--r-- 1 captain captain 49058278 июн  8 13:18 ru_pampers_nertagger.pt
-rw-r--r-- 1 captain captain 49050374 июн 25 14:54 ru_pure_nertagger.pt


In [5]:
# Fetch the default stanza resources for Russian ('ru').
stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 7.37MB/s]                    
2020-06-25 15:20:51 INFO: Downloading default packages for language: ru (Russian)...
2020-06-25 15:20:52 INFO: File exists: /home/captain/stanza_resources/ru/default.zip.
2020-06-25 15:20:59 INFO: Finished downloading models and saved to /home/captain/stanza_resources.


In [7]:
# Build a Russian pipeline with only the tokenize and ner processors.
# ner_model_path points the NER processor at a locally saved tagger file.
# tokenize_pretokenized=True sets the tokenizer's 'pretokenized' option;
# presumably (per the stanza docs) string input is then only split on
# whitespace, so punctuation glued to a word stays in the same token - confirm.
nlp = stanza.Pipeline('ru', processors='tokenize,ner', tokenize_pretokenized=True, ner_model_path="../saved_models/ner/ru_pure_nertagger.pt")

2020-06-25 15:22:43 INFO: Loading these models for language: ru (Russian):
| Processor | Package                 |
---------------------------------------
| tokenize  | syntagrus               |
| ner       | ../saved_m...rtagger.pt |

2020-06-25 15:22:43 INFO: Use device: gpu
2020-06-25 15:22:43 INFO: Loading: tokenize
2020-06-25 15:22:43 INFO: Loading: ner
2020-06-25 15:22:44 INFO: Done loading processors!


In [8]:
# Print the tokenizer's config to check that the 'pretokenized' option was applied.
print(nlp.loaded_processors[1].pipeline.processors["tokenize"].config)

{'model_path': '/home/captain/stanza_resources/ru/tokenize/syntagrus.pt', 'pretokenized': True, 'lang': 'ru', 'mode': 'predict'}


In [9]:
# Smoke test: tag one product title and print every reported entity with its type.
doc = nlp("ФрутоНяня пюре десерт из вишни рябины,яблока	и смородины с 5 месяцев,90 г")
entity_lines = (
    f'entity: {entity.text}\ttype: {entity.type}'
    for sentence in doc.sentences
    for entity in sentence.ents
)
print('\n'.join(entity_lines))

entity: ФрутоНяня	type: ORG
entity: пюре	type: PRODUCT
entity: десерт	type: PRODUCT
entity: из	type: PRODUCT
entity: вишни	type: PRODUCT
entity: рябины,яблока	type: PRODUCT
entity: и	type: PRODUCT
entity: смородины	type: PRODUCT
entity: с	type: QUANTITY
entity: 5	type: QUANTITY
entity: месяцев,90	type: CARDINAL
entity: г	type: CARDINAL


In [10]:
from stanza.models.ner.data import DataLoader

In [11]:
def predict(line):
    """Run the module-level ``nlp`` pipeline on one line and collect its entities.

    Args:
        line: Text handed to ``nlp``.

    Returns:
        A ``(labels, text)`` tuple of two parallel lists holding the ``type``
        and the ``text`` of every entity, sentence by sentence.
    """
    labels, text = [], []
    for sentence in nlp(line).sentences:
        for entity in sentence.ents:
            labels.append(entity.type)
            text.append(entity.text)
    return (labels, text)
predict("ФрутоНяня пюре десерт из вишни рябины,яблока	и смородины с 5 месяцев,90 г")

(['ORG',
  'PRODUCT',
  'PRODUCT',
  'PRODUCT',
  'PRODUCT',
  'PRODUCT',
  'PRODUCT',
  'PRODUCT',
  'QUANTITY',
  'QUANTITY',
  'CARDINAL',
  'CARDINAL'],
 ['ФрутоНяня',
  'пюре',
  'десерт',
  'из',
  'вишни',
  'рябины,яблока',
  'и',
  'смородины',
  'с',
  '5',
  'месяцев,90',
  'г'])

 ### получить данные как остальные классификаторы

In [12]:
from collections import Counter

# Per-label tallies shared by calculate_match; they are module-level so that
# counts accumulate over many sentences.
counter_TP = Counter()
counter_FN = Counter()
counter_FP = Counter()

def calculate_match(true_values, pred_values):
    """Add one sentence's per-label TP/FN/FP counts to the global counters.

    Gold labels equal to "O" are dropped, then the remaining gold labels are
    compared with ``pred_values`` position by position.

    Args:
        true_values: Gold labels of the sentence, possibly containing "O".
        pred_values: Predicted labels; expected to contain no "O" entries.

    Side effects:
        Updates ``counter_TP``, ``counter_FN`` and ``counter_FP``:
        * equal labels at a position -> TP for that label;
        * different labels -> FN for the gold label, FP for the predicted one;
        * gold label with no prediction at that position -> FN;
        * prediction with no gold label at that position -> FP.
    """
    gold_labels = [label for label in true_values if label != "O"]

    for i, label in enumerate(gold_labels):
        if i >= len(pred_values):
            # The model produced fewer entities than the gold annotation.
            counter_FN[label] += 1
            continue

        pred_value = pred_values[i]
        if label == pred_value:
            counter_TP[label] += 1
        else:
            counter_FN[label] += 1
            counter_FP[pred_value] += 1

    # Predictions left over once the gold labels are exhausted used to be
    # ignored, which overstated precision; count each one as a false positive.
    for pred_value in pred_values[len(gold_labels):]:
        counter_FP[pred_value] += 1

### прочитать валидационный корпус

In [15]:
# separately: porridge ("kashi")
# NOTE(review): the path below points at the "pure" corpus, so the comment
# above looks stale - confirm.
valid_df_path="../data/ner/Russian-pure/test.bio"
test_dataset = []
# Read the corpus: one "token<TAB>label" pair per line, sentences separated by
# blank lines. UTF-8 is requested explicitly so the Cyrillic text does not
# depend on the locale's default encoding.
with open(valid_df_path, "r", encoding="utf-8") as fp:
    acc = []  # (token, label) pairs of the sentence currently being read
    for line in fp:
        if line.strip():
            tmp = line.strip().split("\t")
            acc.append((tmp[0], tmp[1]))
        elif acc:
            # A blank (or whitespace-only) line closes the current sentence;
            # repeated blank lines do not produce empty sentences.
            test_dataset.append(acc)
            acc = []
    # Keep the last sentence when the file does not end with a blank line.
    if acc:
        test_dataset.append(acc)

In [16]:
# Number of sentences read from the corpus file.
len(test_dataset)

808

In [17]:
# Split every sentence of (token, label) pairs into its label sequence and a
# single space-joined text, keeping both lists aligned with test_dataset.
test_labels = [[pair[1] for pair in sentence] for sentence in test_dataset]
test_texts = [" ".join(pair[0] for pair in sentence) for sentence in test_dataset]

# Show the first two entries of each list as a quick check.
print(test_labels[:2])
print(test_texts[:2])

[['O', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O'], ['O', 'ORG', 'ORG', 'PRODUCT', 'PRODUCT', 'O', 'CARDINAL', 'CARDINAL', 'O']]
['" ФрутоНяня пюре десерт из вишни, рябины, яблока и смородины с 5 месяцев , 90 г "', '" Егор Иваныч Пюре Говядина , 80 г "']


In [18]:
# One text per sentence: this should equal len(test_dataset).
len(test_texts)

808

### сохранить в файл транзакции

In [19]:
# Dump the test texts, one per line. The file is only written, so plain "w" is
# enough, and UTF-8 is set explicitly because the texts are Cyrillic and must
# not depend on the locale's default encoding.
with open("/tmp/pure_test.tsv", "w", encoding="utf-8") as fo:
    fo.writelines(text + "\n" for text in test_texts)

In [20]:
!head /tmp/pure_test.tsv

" ФрутоНяня пюре десерт из вишни, рябины, яблока и смородины с 5 месяцев , 90 г "
" Егор Иваныч Пюре Говядина , 80 г "
" Semper Пюре яблочно-персиковое с кашей с 6 месяцев , 12 шт по 120 г "
ПЮРЕ ПАУЧ БАБ ЛУКОШ ЯБ/МАЛ/ВИШ С5М 90Г
Пюре Спелёнок Брокколи с 4 мес . 80 г .
" Semper Пюре яблоко и банан с 6 месяцев , 12 шт по 90 г "
Пюре Fleur Alpine Organic Яблоко с 4 мес . 90 г .
Овощное пюре ' Бабушкино Лукошко ' Тыква с 5 месяцев 100г
" Bebivita пюре картофель и морковь с цыпленком , с 9 месяцев , 6 шт по 190 г "
" Hame яблоко - морковь фруктовое пюре , 190 г "


### получить предсказания

In [21]:
%%time
# Run the tagger over every test text, keeping predicted labels and entity
# texts in two lists aligned with test_texts.
pred_labels = []
pred_text = []
for text in test_texts:
    labels, entities = predict(text)
    pred_labels.append(labels)
    pred_text.append(entities)

# Total number of predicted entities across all texts.
sum(len(labels) for labels in pred_labels)

CPU times: user 6.37 s, sys: 30.2 ms, total: 6.4 s
Wall time: 6.41 s


7323

In [22]:
# 6 is the approximate wall time, in seconds, reported for the prediction loop above.
len(test_texts)/6 # throughput is roughly 130-140 transactions per second

134.66666666666666

In [23]:
# Start from empty counters so re-running this cell does not double-count.
counter_TP= Counter()
counter_FN = Counter()
counter_FP = Counter()

# Accumulate TP/FN/FP over every (gold, predicted) label sequence.
for y_test_l, y_pred_l in zip(test_labels, pred_labels):
    calculate_match(y_test_l, y_pred_l)

# NOTE(review): CARDINAL occurs in the gold labels but is not reported here -
# confirm that this is intended.
labels=["ORDINAL", "ORG", "PRODUCT", "MODEL", "QUANTITY"]
print("Label\t\tPrecision\tRecall\tF1\tTruePositive\tFalsePositiv\tFalseNegative")
for label in labels:
    # A Counter returns 0 for a label it has never seen.
    tp, fp, fn = counter_TP[label], counter_FP[label], counter_FN[label]
    # Guard each ratio against an empty denominator instead of faking it.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    # F1 is the harmonic mean 2PR / (P + R). The previous max(P + R, 1)
    # denominator gave a wrong value whenever P + R was below 1.
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print("%s\t\t%.3f\t\t%.3f\t%.3f\t%d\t\t%d\t\t%d" % (label, precision, recall, f1, tp, fp, fn))

Label		Precision	Recall	F1	TruePositive	FalsePositiv	FalseNegative
ORDINAL		0.939		0.969	0.954	31		2		1
ORG		0.994		0.992	0.993	943		6		8
PRODUCT		0.994		0.996	0.995	3006		18		12
MODEL		0.987		0.962	0.974	76		1		3
QUANTITY		0.995		0.995	0.995	1355		7		7


In [24]:
def calculate_match_debug(true_values, pred_values):
    """Same tallying as calculate_match, but print a trace line per position.

    Gold labels equal to "O" are dropped (stanza does not report "O" tokens as
    entities), then gold and predicted labels are compared position by
    position and the global counters are updated.

    Args:
        true_values: Gold labels of the sentence, possibly containing "O".
        pred_values: Predicted labels; expected to contain no "O" entries.

    Side effects:
        Updates ``counter_TP``, ``counter_FN`` and ``counter_FP`` and prints
        "i:gold == pred" for a match and "i:gold <> pred" for a mismatch, with
        "<none>" standing in for a missing gold label or prediction.
    """
    gold_labels = [label for label in true_values if label != "O"]

    for i, label in enumerate(gold_labels):
        if i >= len(pred_values):
            # Fewer predictions than gold labels: a miss for the gold label.
            print(f"{i}:{label} <> <none>")
            counter_FN[label] += 1
            continue

        pred_value = pred_values[i]
        if label == pred_value:
            print(f"{i}:{label} == {pred_value}")
            counter_TP[label] += 1
        else:
            print(f"{i}:{label} <> {pred_value}")
            counter_FN[label] += 1
            counter_FP[pred_value] += 1

    # Predictions beyond the last gold label used to be ignored; each one is a
    # false positive.
    for i, pred_value in enumerate(pred_values[len(gold_labels):], start=len(gold_labels)):
        print(f"{i}:<none> <> {pred_value}")
        counter_FP[pred_value] += 1

In [29]:
# Reset the counters so the table below reflects only the inspected sentence.
counter_TP= Counter()
counter_FN = Counter()
counter_FP = Counter()

# Index of the sentence to inspect; 11 and 15 are other indices noted by the author.
index = 9 #11, 15
print(test_dataset[index])
print(list(zip(pred_text[index], pred_labels[index])))
print("-------------------------------------------------------------------------")
calculate_match_debug(test_labels[index], pred_labels[index])
labels=["ORDINAL", "ORG", "PRODUCT", "MODEL", "QUANTITY"]
print("-------------------------------------------------------------------------")
print("Label\t\tPrecision\tRecall\tF1\tTruePositive\tFalsePositiv\tFalseNegative")
for label in labels:
    # A Counter returns 0 for a label it has never seen.
    tp, fp, fn = counter_TP[label], counter_FP[label], counter_FN[label]
    # Guard each ratio against an empty denominator instead of faking it.
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    # F1 is the harmonic mean 2PR / (P + R). The previous max(P + R, 1)
    # denominator gave a wrong value whenever P + R was below 1.
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print("%s\t\t%.3f\t\t%.3f\t%.3f\t%d\t\t%d\t\t%d" % (label, precision, recall, f1, tp, fp, fn))

[('"', 'O'), ('Hame', 'ORG'), ('яблоко', 'PRODUCT'), ('-', 'PRODUCT'), ('морковь', 'PRODUCT'), ('фруктовое', 'PRODUCT'), ('пюре', 'PRODUCT'), (',', 'O'), ('190', 'CARDINAL'), ('г', 'CARDINAL'), ('"', 'O')]
[('Hame', 'ORG'), ('яблоко', 'PRODUCT'), ('-', 'PRODUCT'), ('морковь', 'PRODUCT'), ('фруктовое', 'PRODUCT'), ('пюре', 'PRODUCT'), ('190', 'CARDINAL'), ('г', 'CARDINAL')]
-------------------------------------------------------------------------
0:ORG == ORG
1:PRODUCT == PRODUCT
2:PRODUCT == PRODUCT
3:PRODUCT == PRODUCT
4:PRODUCT == PRODUCT
5:PRODUCT == PRODUCT
6:CARDINAL == CARDINAL
7:CARDINAL == CARDINAL
-------------------------------------------------------------------------
Label		Precision	Recall	F1	TruePositive	FalsePositiv	FalseNegative
ORDINAL		0.000		0.000	0.000	0		0		0
ORG		1.000		1.000	1.000	1		0		0
PRODUCT		1.000		1.000	1.000	5		0		0
MODEL		0.000		0.000	0.000	0		0		0
QUANTITY		0.000		0.000	0.000	0		0		0


In [26]:
# Gold label sequences of the first ten sentences ("O" entries included).
print(test_labels[:10])

[['O', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O'], ['O', 'ORG', 'ORG', 'PRODUCT', 'PRODUCT', 'O', 'CARDINAL', 'CARDINAL', 'O'], ['O', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O', 'CARDINAL', 'CARDINAL', 'O'], ['PRODUCT', 'O', 'ORG', 'ORG', 'PRODUCT', 'QUANTITY', 'CARDINAL'], ['PRODUCT', 'ORG', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O'], ['O', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O', 'CARDINAL', 'CARDINAL', 'O'], ['PRODUCT', 'ORG', 'ORG', 'MODEL', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O'], ['PRODUCT', 'PRODUCT', 'O', 'ORG', 'ORG', 'O', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'CARDINAL'], ['O', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT'

In [27]:
# Predicted label sequences of the first ten sentences, for comparison with the gold labels.
print(pred_labels[:10])

[['ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL'], ['ORG', 'ORG', 'PRODUCT', 'PRODUCT', 'CARDINAL', 'CARDINAL'], ['ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'ORG', 'ORG', 'PRODUCT', 'QUANTITY', 'CARDINAL'], ['PRODUCT', 'ORG', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL'], ['ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'ORG', 'ORG', 'MODEL', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'PRODUCT', 'ORG', 'ORG', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'CARDINAL'], ['ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'CA