In [1]:
import stanza

In [2]:
import torch

In [3]:
torch.cuda.is_available()

True

In [4]:
!ls -all ../saved_models/ner

total 95832
drwxr-xr-x 2 captain captain     4096 июн 16 13:46 .
drwxr-xr-x 3 captain captain     4096 июн  5 13:12 ..
-rw-r--r-- 1 captain captain 49059548 июн 16 19:56 ru_kasha_nertagger.pt
-rw-r--r-- 1 captain captain 49058278 июн  8 13:18 ru_pampers_nertagger.pt


In [5]:
stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 5.03MB/s]                    
2020-06-16 20:49:34 INFO: Downloading default packages for language: ru (Russian)...
2020-06-16 20:49:35 INFO: File exists: /home/captain/stanza_resources/ru/default.zip.
2020-06-16 20:49:41 INFO: Finished downloading models and saved to /home/captain/stanza_resources.


In [6]:
# Russian pipeline restricted to the `tokenize` and `ner` processors. The NER weights are
# loaded from the locally saved "kasha" tagger checkpoint instead of the default package.
# NOTE(review): per the Stanza docs, tokenize_pretokenized=True makes the tokenizer split
# only on whitespace (tokens) and newlines (sentences) — confirm for the installed version.
nlp = stanza.Pipeline('ru', processors='tokenize,ner', tokenize_pretokenized=True, ner_model_path="../saved_models/ner/ru_kasha_nertagger.pt")

2020-06-16 20:49:45 INFO: Loading these models for language: ru (Russian):
| Processor | Package                 |
---------------------------------------
| tokenize  | syntagrus               |
| ner       | ../saved_m...rtagger.pt |

2020-06-16 20:49:45 INFO: Use device: gpu
2020-06-16 20:49:45 INFO: Loading: tokenize
2020-06-16 20:49:45 INFO: Loading: ner
2020-06-16 20:49:47 INFO: Done loading processors!


In [7]:
print(nlp.loaded_processors[1].pipeline.processors["tokenize"].config)

{'model_path': '/home/captain/stanza_resources/ru/tokenize/syntagrus.pt', 'pretokenized': True, 'lang': 'ru', 'mode': 'predict'}


In [8]:
doc = nlp("Каша ФЛЕР	льняная	с	кэробом	и кунжутом,	400	г")
print(*[f'entity: {ent.text}\ttype: {ent.type}' for sent in doc.sentences for ent in sent.ents], sep='\n')

entity: Каша	type: PRODUCT
entity: ФЛЕР	type: ORG
entity: льняная	type: PRODUCT
entity: с	type: PRODUCT
entity: кэробом	type: PRODUCT
entity: и	type: PRODUCT
entity: кунжутом,	type: PRODUCT
entity: 400	type: CARDINAL
entity: г	type: CARDINAL


In [9]:
from stanza.models.ner.data import DataLoader

In [10]:
def predict(line):
    """Run the module-level ``nlp`` pipeline on ``line`` and collect its entities.

    Returns a ``(labels, text)`` tuple of two parallel lists holding the
    ``type`` and the ``text`` of every entity found in ``sent.ents``, walking
    ``doc.sentences`` in order.
    """
    labels, text = [], []
    for sentence in nlp(line).sentences:
        for entity in sentence.ents:
            labels.append(entity.type)
            text.append(entity.text)
    return labels, text
predict("Каша ФЛЕР	льняная	с	кэробом	и кунжутом,	400	г")

(['PRODUCT',
  'ORG',
  'PRODUCT',
  'PRODUCT',
  'PRODUCT',
  'PRODUCT',
  'PRODUCT',
  'CARDINAL',
  'CARDINAL'],
 ['Каша', 'ФЛЕР', 'льняная', 'с', 'кэробом', 'и', 'кунжутом,', '400', 'г'])

 ### получить данные как остальные классификаторы

In [11]:
from collections import Counter

# Per-label tallies shared by the evaluation cells; re-create them to reset a run.
counter_TP = Counter()
counter_FN = Counter()
counter_FP = Counter()


def calculate_match(true_values, pred_values):
    """Update the global TP/FN/FP counters for one sentence.

    Gold labels equal to ``"O"`` are dropped, and the remaining gold labels are
    compared with ``pred_values`` position by position.

    * same label at a position        -> TP for that label
    * different label at a position   -> FN for the gold label, FP for the predicted one
    * gold label with no prediction   -> FN for the gold label
    * prediction with no gold label   -> FP for the predicted label

    The last case used to be ignored, which hid spurious predictions and
    inflated precision.

    NOTE(review): the comparison is purely positional, so a single missing or
    extra prediction shifts every later pair in the sentence.

    Args:
        true_values: gold labels of one sentence, including ``"O"`` entries.
        pred_values: predicted entity labels of the same sentence (no ``"O"``).

    Returns:
        None. ``counter_TP``, ``counter_FN`` and ``counter_FP`` are mutated.
    """
    gold_values = [label for label in true_values if label != "O"]

    for i, label in enumerate(gold_values):
        if i >= len(pred_values):
            # Nothing was predicted for this gold entity.
            counter_FN[label] += 1
            continue

        pred_value = pred_values[i]
        if label == pred_value:
            counter_TP[label] += 1
        else:
            counter_FN[label] += 1
            counter_FP[pred_value] += 1

    # Predictions beyond the last gold entity are spurious.
    for pred_value in pred_values[len(gold_values):]:
        counter_FP[pred_value] += 1

### прочитать валидационный корпус

In [23]:
def _read_bio(path):
    """Parse a tab-separated ``token<TAB>label`` file into sentences.

    A blank (or whitespace-only) line closes the current sentence. A sentence
    still open at end of file is kept, so a missing trailing blank line no
    longer drops the last example. Empty sentences produced by consecutive
    blank lines are skipped.

    Args:
        path: location of the ``.bio`` file; it is read as UTF-8.

    Returns:
        A list of sentences, each a list of ``(token, label)`` tuples.

    Raises:
        ValueError: if a non-blank line has no tab-separated label column.
    """
    sentences = []
    acc = []
    with open(path, "r", encoding="utf-8") as fp:
        for line_no, line in enumerate(fp, start=1):
            stripped = line.strip()
            if not stripped:
                if acc:
                    sentences.append(acc)
                    acc = []
                continue
            tmp = stripped.split("\t")
            if len(tmp) < 2:
                raise ValueError(
                    f"{path}:{line_no}: expected 'token<TAB>label', got {line!r}"
                )
            acc.append((tmp[0], tmp[1]))
    if acc:
        # The file did not end with a blank line: flush the pending sentence.
        sentences.append(acc)
    return sentences


# The kasha (porridge) test split first, then the pampers (diapers) test split.
BIO_TEST_PATHS = (
    "../data/ner/Russian-kasha/test.bio",
    "../data/ner/Russian-pampers/test.bio",
)

test_dataset = []
for valid_df_path in BIO_TEST_PATHS:
    test_dataset.extend(_read_bio(valid_df_path))
test_dataset[-2:]

[[('ПОДГУЗ-ТРУС', 'PRODUCT'),
  ('PAMPERS', 'ORG'),
  ('А.Б', 'MODEL'),
  ('МАКСИ', 'ORDINAL'),
  ('8-14', 'QUANTITY'),
  ('104ШТ', 'CARDINAL')],
 [('ПОДГУЗ', 'PRODUCT'),
  ('PAMPERS', 'ORG'),
  ('АКТ.Б', 'MODEL'),
  ('МАКСИ', 'ORDINAL'),
  ('9-14КГ', 'QUANTITY'),
  ('132ШТ', 'CARDINAL')]]

In [32]:
len(test_dataset)

2398

In [24]:
# For every sentence keep the gold label sequence and the space-joined token text.
test_labels = [[pair[1] for pair in sentence] for sentence in test_dataset]
test_texts = [" ".join(pair[0] for pair in sentence) for sentence in test_dataset]

print(test_labels[:2])
print(test_texts[:2])

[['O', 'ORG', 'O', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'O', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'O', 'O', 'CARDINAL', 'O'], ['PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT']]
['" УМНИЦА , КАША кукурузная, низкоаллерг ., с 4 мес ., ( 200г )"', 'Каша овсяная с печёным яблоком']


In [25]:
len(test_texts)

2398

### сохранить в файл транзакции

In [26]:
# Dump the test texts, one transaction per line.
with open("/tmp/kasha_pampers_test.tsv", "w+") as fo:
    fo.writelines(text + "\n" for text in test_texts)

In [28]:
!head /tmp/kasha_pampers_test.tsv

" УМНИЦА , КАША кукурузная, низкоаллерг ., с 4 мес ., ( 200г )"
Каша овсяная с печёным яблоком
" Nestle каша безмолочная мультизлаковая 5 злаков , с 6 месяцев , 200 г "
" Remedia каша манная безмолочная пауч, с 5 месяцев , 200 г "
" Русский продукт Геркулес овсяная каша ассорти: черника, малина, земляника , 6 шт по 35 г "
" Каша молочная с 5 месяцев Агуша Овсянка , 10 шт по 200 г "
Каша гороховая с копченостями
" Каша Heinz молочная пшеничная с тыквой , 5 месяцев , 7 шт по 250 г "
КАША ХАЙНЦ Я БОЛЬШ ПАУЧ ГРЕЧ С 1Г 250Г
" Овсяная каша на кокосовом молоке , 200 гр ."


### получить предсказания

In [35]:
%%time
# Run the tagger over every test text, keeping labels and entity texts in parallel lists.
pred_labels, pred_text = [], []
for text in test_texts:
    sent_labels, sent_tokens = predict(text)
    pred_labels.append(sent_labels)
    pred_text.append(sent_tokens)

# Total number of predicted entities across all texts.
sum(map(len, pred_labels))

CPU times: user 18.4 s, sys: 0 ns, total: 18.4 s
Wall time: 18.4 s


18590

In [36]:
len(test_texts)/18 # throughput: roughly 130-140 transactions per second (18 ~ wall time in seconds reported by the timed prediction cell)

133.22222222222223

In [31]:
# Start from empty tallies so re-running this cell does not accumulate counts.
counter_TP = Counter()
counter_FN = Counter()
counter_FP = Counter()

# zip() would silently truncate on a length mismatch and yield misleading metrics.
assert len(test_labels) == len(pred_labels), "gold and predicted sentence counts differ"
for y_test_l, y_pred_l in zip(test_labels, pred_labels):
    calculate_match(y_test_l, y_pred_l)

labels = ["ORDINAL", "ORG", "PRODUCT", "MODEL", "QUANTITY"]
print("Label\t\tPrecision\tRecall\tF1\tTruePositive\tFalsePositiv\tFalseNegative")
for label in labels:
    # Counter returns 0 for a missing key, so no .get() defaults are needed.
    tp = counter_TP[label]
    fp = counter_FP[label]
    fn = counter_FN[label]
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    # Harmonic mean. The previous max(precision + recall, 1) denominator
    # under-reported F1 whenever precision + recall was below 1.
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print("%s\t\t%.3f\t\t%.3f\t%.3f\t%d\t\t%d\t\t%d" % (label, precision, recall, f1, tp, fp, fn))

Label		Precision	Recall	F1	TruePositive	FalsePositiv	FalseNegative
ORDINAL		0.951		0.961	0.956	2064		106		83
ORG		0.951		0.968	0.959	2414		125		80
PRODUCT		0.980		0.977	0.979	4859		97		112
MODEL		0.946		0.918	0.932	1547		88		138
QUANTITY		0.970		0.957	0.964	3120		96		139


In [19]:
def calculate_match_debug(true_values, pred_values):
    """Verbose variant of ``calculate_match``: same counting, plus a printed trace.

    Gold labels equal to ``"O"`` are dropped (the author notes that stanza does
    not report entities for "O" tokens), and the remaining gold labels are
    compared with ``pred_values`` position by position. Every comparison is
    printed as ``index:gold == pred`` or ``index:gold <> pred``.

    Gold labels with no prediction count as FN. Predictions with no gold label
    count as FP; they used to be ignored, which inflated precision. Both cases
    are now traced as well.

    Args:
        true_values: gold labels of one sentence, including ``"O"`` entries.
        pred_values: predicted entity labels of the same sentence.

    Returns:
        None. The global ``counter_TP``, ``counter_FN`` and ``counter_FP`` are mutated.
    """
    gold_values = [label for label in true_values if label != "O"]

    for i, label in enumerate(gold_values):
        if i >= len(pred_values):
            # Nothing was predicted for this gold entity.
            print(f"{i}:{label} <> <missing>")
            counter_FN[label] += 1
            continue

        pred_value = pred_values[i]
        if label == pred_value:
            print (f"{i}:{label} == {pred_value}")
            counter_TP[label] += 1
        else:
            print(f"{i}:{label} <> {pred_value}")
            counter_FN[label] += 1
            counter_FP[pred_value] += 1

    # Predictions beyond the last gold entity are spurious.
    for i, pred_value in enumerate(pred_values[len(gold_values):], start=len(gold_values)):
        print(f"{i}:<none> <> {pred_value}")
        counter_FP[pred_value] += 1

In [20]:
# Start from empty tallies so only the inspected example is counted.
counter_TP = Counter()
counter_FN = Counter()
counter_FP = Counter()

index = 15  # other examples inspected earlier: 11, 15
print(test_dataset[index])
print(list(zip(pred_text[index], pred_labels[index])))
print("-------------------------------------------------------------------------")
calculate_match_debug(test_labels[index], pred_labels[index])
labels = ["ORDINAL", "ORG", "PRODUCT", "MODEL", "QUANTITY"]
print("-------------------------------------------------------------------------")
print("Label\t\tPrecision\tRecall\tF1\tTruePositive\tFalsePositiv\tFalseNegative")
for label in labels:
    # Counter returns 0 for a missing key, so no .get() defaults are needed.
    tp = counter_TP[label]
    fp = counter_FP[label]
    fn = counter_FN[label]
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    # Harmonic mean. The previous max(precision + recall, 1) denominator
    # under-reported F1 whenever precision + recall was below 1.
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print("%s\t\t%.3f\t\t%.3f\t%.3f\t%d\t\t%d\t\t%d" % (label, precision, recall, f1, tp, fp, fn))

[('"', 'O'), ('Льняная', 'PRODUCT'), ('каша', 'PRODUCT'), ('кэроб,кокос,белый', 'PRODUCT'), ('лен', 'PRODUCT'), ("'", 'O'), ('Здоровка', 'ORG'), ('\'"', 'O')]
[('Льняная', 'PRODUCT'), ('каша', 'PRODUCT'), ('кэроб,кокос,белый', 'PRODUCT'), ('лен', 'PRODUCT'), ('Здоровка', 'ORG')]
-------------------------------------------------------------------------
0:PRODUCT == PRODUCT
1:PRODUCT == PRODUCT
2:PRODUCT == PRODUCT
3:PRODUCT == PRODUCT
4:ORG == ORG
-------------------------------------------------------------------------
Label		Precision	Recall	F1	TruePositive	FalsePositiv	FalseNegative
ORDINAL		0.000		0.000	0.000	0		0		0
ORG		1.000		1.000	1.000	1		0		0
PRODUCT		1.000		1.000	1.000	4		0		0
MODEL		0.000		0.000	0.000	0		0		0
QUANTITY		0.000		0.000	0.000	0		0		0


In [None]:
print(test_labels[:10])

In [None]:
print(pred_labels[:10])