In [1]:
import stanza

In [2]:
import torch

In [3]:
torch.cuda.is_available()

True

In [4]:
!ls -all ../saved_models/ner

total 191648
drwxr-xr-x 2 captain captain     4096 июн 26 01:07 .
drwxr-xr-x 3 captain captain     4096 июн  5 13:12 ..
-rw-r--r-- 1 captain captain 49059548 июн 16 19:56 ru_kasha_nertagger.pt
-rw-r--r-- 1 captain captain 49061717 июн 26 01:30 ru_pampers_kasha_pure_nertagger.pt
-rw-r--r-- 1 captain captain 49058278 июн  8 13:18 ru_pampers_nertagger.pt
-rw-r--r-- 1 captain captain 49050374 июн 25 14:54 ru_pure_nertagger.pt


In [5]:
stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 6.00MB/s]                    
2020-06-26 09:52:58 INFO: Downloading default packages for language: ru (Russian)...
2020-06-26 09:52:59 INFO: File exists: /home/captain/stanza_resources/ru/default.zip.
2020-06-26 09:53:05 INFO: Finished downloading models and saved to /home/captain/stanza_resources.


In [6]:
nlp = stanza.Pipeline('ru', processors='tokenize,ner', tokenize_pretokenized=True, ner_model_path="../saved_models/ner/ru_pampers_kasha_pure_nertagger.pt")

2020-06-26 09:53:05 INFO: Loading these models for language: ru (Russian):
| Processor | Package                 |
---------------------------------------
| tokenize  | syntagrus               |
| ner       | ../saved_m...rtagger.pt |

2020-06-26 09:53:05 INFO: Use device: gpu
2020-06-26 09:53:05 INFO: Loading: tokenize
2020-06-26 09:53:05 INFO: Loading: ner
2020-06-26 09:53:07 INFO: Done loading processors!


In [7]:
print(nlp.loaded_processors[1].pipeline.processors["tokenize"].config)

{'model_path': '/home/captain/stanza_resources/ru/tokenize/syntagrus.pt', 'pretokenized': True, 'lang': 'ru', 'mode': 'predict'}


In [8]:
doc = nlp("Каша ФЛЕР	льняная	с	кэробом	и кунжутом,	400	г")
print(*[f'entity: {ent.text}\ttype: {ent.type}' for sent in doc.sentences for ent in sent.ents], sep='\n')

entity: Каша	type: PRODUCT
entity: ФЛЕР	type: ORG
entity: льняная	type: PRODUCT
entity: кэробом	type: PRODUCT
entity: кунжутом,	type: PRODUCT
entity: 400	type: CARDINAL
entity: г	type: CARDINAL


In [9]:
from stanza.models.ner.data import DataLoader

In [10]:
def predict(line):
    doc = nlp(line)
    labels = [ent.type for sent in doc.sentences for ent in sent.ents]
    text = [ent.text for sent in doc.sentences for ent in sent.ents]
    return (labels, text)
print(predict("Каша ФЛЕР	льняная	с	кэробом	и кунжутом,	400	г"))
print(predict("Фруто Няня пюре десерт из вишни рябины,яблока	и смородины с 5 месяцев,90 г"))
print(predict("Влажные салфетки Bella Sensitive, детские, 208 шт"))

(['PRODUCT', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'CARDINAL', 'CARDINAL'], ['Каша', 'ФЛЕР', 'льняная', 'кэробом', 'кунжутом,', '400', 'г'])
(['ORG', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'QUANTITY', 'QUANTITY', 'CARDINAL'], ['Фруто', 'Няня', 'пюре', 'десерт', 'вишни', 'рябины,яблока', 'смородины', '5', 'месяцев,90', 'г'])
(['PRODUCT', 'PRODUCT', 'ORG', 'CARDINAL', 'CARDINAL'], ['Влажные', 'салфетки', 'Bella', '208', 'шт'])


 ### получить данные как остальные классификаторы

In [11]:
from collections import Counter

counter_TP= Counter()
counter_FN = Counter()
counter_FP = Counter()

def calculate_match(true_values, pred_values):
    for i, label in enumerate(filter(lambda v: v !="O",true_values)):
        if len(pred_values) <= i:
            counter_FN[label] += 1
            #counter_FP[pred_value] +=1
            continue
            
        pred_value = pred_values[i]    
        if  label == pred_value:
            counter_TP[label] += 1
        else:
            counter_FN[label] += 1
            counter_FP[pred_value] +=1

### прочитать валидационный корпус

In [12]:
# весь валидационный корпус
valid_df_path="../data/ner/Russian-pampers_kasha_pure/test.bio"
test_dataset = []
with open(valid_df_path, "r") as fp:
    acc = []
    for line in fp:
        if (line != "\n"):
            tmp = line.strip().split("\t")
            if len(tmp) != 2:
                print(line)
            acc.append((tmp[0], tmp[1]))
        else:
            test_dataset.append(acc)
            acc = []

test_dataset[-2:]

[[('"', 'O'),
  ('Халеда', 'ORG'),
  ('пюре', 'PRODUCT'),
  ('из', 'O'),
  ('брокколи', 'PRODUCT'),
  (',', 'O'),
  ('с', 'O'),
  ('4', 'QUANTITY'),
  ('месяцев', 'QUANTITY'),
  (',', 'O'),
  ('90', 'CARDINAL'),
  ('г', 'CARDINAL'),
  ('"', 'O')],
 [('"', 'O'),
  ('Hipp', 'ORG'),
  ('пюре', 'PRODUCT'),
  ('яблоко', 'PRODUCT'),
  (',', 'O'),
  ('банан', 'PRODUCT'),
  (',', 'O'),
  ('малина', 'PRODUCT'),
  (',', 'O'),
  ('злаки', 'PRODUCT'),
  ("'", 'O')]]

In [13]:
len(test_dataset)

3138

In [14]:
test_labels = []
test_texts = []
for line in test_dataset:
    test_labels.append(list(map(lambda x: x[1], line)))
    test_texts.append(" ".join(map(lambda x: x[0], line)))
    
print(test_labels[:2])
print(test_texts[:2])

[['ORG', 'PRODUCT', 'MODEL', 'MODEL', 'QUANTITY', 'QUANTITY', 'O', 'ORDINAL', 'ORDINAL', 'O', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O'], ['ORG', 'PRODUCT', 'O', 'ORDINAL', 'ORDINAL', 'O', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O', 'O', 'O']]
['Huggies Подгузники Elite Soft 12-22 кг ( размер 5 ) 112 шт Уцененный товар (№12)', '"OONIES Подгузники , размер S ( 3-7 кг ), 72 шт .(JOONIES?BUEBCHEN новые соски и бутылочки"']


### сохранить в файл транзакции

In [15]:
with open("/tmp/pampers_kasha_pure_test.tsv", "w+") as fo:
    for line in test_texts:
        fo.write(line+"\n")

In [16]:
!head /tmp/pampers_kasha_pure_test.tsv

Huggies Подгузники Elite Soft 12-22 кг ( размер 5 ) 112 шт Уцененный товар (№12)
"OONIES Подгузники , размер S ( 3-7 кг ), 72 шт .(JOONIES?BUEBCHEN новые соски и бутылочки"
Бамбуковые трусики Ракета на голубом (классика)
Merries Подгузники-трусики M 6-11 кг 58 шт 3 упаковки Уцененный товар (№23)
Многоразовый подгузник Чудо-чадо Пеленка-подгузник ситцевый
iD Подгузники для взрослых Slip M 30 шт
" Seni Подгузники для взрослых ' Super Seni Plus ', размер 3 ( 100-150 см ), 30 шт "
Wet Wipes Pampers Sensitive 224 pcs
Johnson`s baby От макушки до пяточек Влажные салфетки детские 15 шт
" Детские пеленки Helen Harper Soft&Dry 40*60 , ПРОМО, 30 шт 1401601"


### получить предсказания

In [17]:
%%time
pred_labels = []
pred_text =[]
for line in test_texts:
    tmp = predict(line)
    pred_labels.append(tmp[0])
    pred_text.append(tmp[1])
    
len([item for sublist in pred_labels for item in sublist])

CPU times: user 23.9 s, sys: 83.3 ms, total: 24 s
Wall time: 24 s


23896

In [18]:
len(test_texts)/24 # скорость порядка 130-140 транзакций в секунду

130.75

In [19]:
counter_TP= Counter()
counter_FN = Counter()
counter_FP = Counter()

for y_test_l, y_pred_l in zip(test_labels, pred_labels):
    calculate_match(y_test_l, y_pred_l)
    
labels=["ORDINAL", "ORG", "MODEL" ,"PRODUCT",  "QUANTITY"]
print("Label\t\tPrecision\tRecall\tF1\tTruePositive\tFalsePositiv\tFalseNegative")
for label in labels:
    precision = counter_TP.get(label, 0) / (counter_TP.get(label, 1) + counter_FP.get(label, 0))
    recall = counter_TP.get(label, 0) / (counter_TP.get(label, 1) + counter_FN.get(label, 0))
    f1 = 2 * (precision*recall) / max((precision + recall),1)
    print("%s\t\t%.3f\t\t%.3f\t%.3f\t%d\t\t%d\t\t%d" % (label, precision, recall, f1, counter_TP.get(label, 0), counter_FP.get(label, 0), counter_FN.get(label, 0)))

Label		Precision	Recall	F1	TruePositive	FalsePositiv	FalseNegative
ORDINAL		0.959		0.960	0.960	2102		90		87
ORG		0.968		0.965	0.966	3246		109		119
MODEL		0.944		0.940	0.942	1672		100		107
PRODUCT		0.981		0.978	0.979	6680		130		150
QUANTITY		0.971		0.963	0.967	3871		115		148


In [20]:
def find_model_mismatch(true_values, pred_values):
    results = []
    for i, label in enumerate(filter(lambda v: v !="O",true_values)):
        if label != "MODEL":
            continue
        if len(pred_values) <= i:
            results.append(pred_values)
            continue
            
        pred_value = pred_values[i] 
        #print("true=%s pred_value=%s index=%d" % (label, pred_value, i))
        if  label != pred_value:
            results.append(pred_value)
    return results

mismatch_results = []
mismatch_index = []

for index, (y_test_l, y_pred_l) in enumerate(zip(test_labels, pred_labels)):
    tmp = find_model_mismatch(y_test_l, y_pred_l)
    if len(tmp) > 0:
        #print(index)
        mismatch_results.extend(tmp)
        mismatch_index.append(index)
#     if index >9:
#         break
        
print(len(mismatch_index))

77


In [29]:
mismatch_index[:12]

[36, 94, 139, 192, 210, 243, 265, 325, 332, 493, 557, 576]

In [30]:
mismatch_index[40:52]

[1320, 1350, 1390, 1458, 1477, 1478, 1520, 1636, 1670, 1685, 1766, 1769]

In [31]:
mismatch_index[-12:]

[2062, 2133, 2263, 2297, 2329, 2361, 2370, 2380, 2384, 2394, 2460, 2665]

In [24]:
def calculate_match_debug(true_values, pred_values):
    for i, label in enumerate(filter(lambda v: v !="O",true_values)): #stamza не показывает O
        if len(pred_values) <= i:
            counter_FN[label] += 1
            #counter_FP[pred_value] +=1
            continue
            
        pred_value = pred_values[i]    
        if  label == pred_value:
            print (f"{i}:{label} == {pred_value}")
            counter_TP[label] += 1
        else:
            print(f"{i}:{label} <> {pred_value}")
            counter_FN[label] += 1
            counter_FP[pred_value] +=1

In [32]:
index = 1320
print(test_texts[index])
print("-------------------------------------------------------------------------")
print(list(filter(lambda tup: tup[1] != "O", test_dataset[index])))
print()
print(list(zip(pred_text[index], pred_labels[index])))
print("-------------------------------------------------------------------------")
calculate_match_debug(test_labels[index], pred_labels[index])


" Подгузники Pampers New Baby ' Памперс Нью Бэби ' 2 Mini ( 3-6 кг ), 144 шт 1543907"
-------------------------------------------------------------------------
[('Подгузники', 'PRODUCT'), ('Pampers', 'ORG'), ('New', 'MODEL'), ('Baby', 'MODEL'), ('Памперс', 'ORG'), ('Нью', 'MODEL'), ('Бэби', 'MODEL'), ('2', 'ORDINAL'), ('Mini', 'ORDINAL'), ('3-6', 'QUANTITY'), ('кг', 'QUANTITY'), ('144', 'CARDINAL'), ('шт', 'CARDINAL')]

[('Подгузники', 'PRODUCT'), ('Pampers', 'ORG'), ('New', 'MODEL'), ('Baby', 'MODEL'), ('Нью', 'MODEL'), ('Бэби', 'MODEL'), ('2', 'ORDINAL'), ('Mini', 'ORDINAL'), ('3-6', 'QUANTITY'), ('кг', 'QUANTITY'), ('144', 'CARDINAL'), ('шт', 'CARDINAL')]
-------------------------------------------------------------------------
0:PRODUCT == PRODUCT
1:ORG == ORG
2:MODEL == MODEL
3:MODEL == MODEL
4:ORG <> MODEL
5:MODEL == MODEL
6:MODEL <> ORDINAL
7:ORDINAL == ORDINAL
8:ORDINAL <> QUANTITY
9:QUANTITY == QUANTITY
10:QUANTITY <> CARDINAL
11:CARDINAL == CARDINAL


In [26]:
index = 154
print(test_texts[index])
print("-------------------------------------------------------------------------")
print(test_dataset[index])
print()
print(list(zip(pred_text[index], pred_labels[index])))
print("-------------------------------------------------------------------------")
calculate_match_debug(test_labels[index], pred_labels[index])

Трусики Qianquhui Самолет- L
-------------------------------------------------------------------------
[('Трусики', 'PRODUCT'), ('Qianquhui', 'ORG'), ('Самолет-', 'O'), ('L', 'ORDINAL')]

[('Трусики', 'PRODUCT'), ('Qianquhui', 'ORG'), ('L', 'ORDINAL')]
-------------------------------------------------------------------------
0:PRODUCT == PRODUCT
1:ORG == ORG
2:ORDINAL == ORDINAL


In [27]:
print(test_labels[:10])

[['ORG', 'PRODUCT', 'MODEL', 'MODEL', 'QUANTITY', 'QUANTITY', 'O', 'ORDINAL', 'ORDINAL', 'O', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O'], ['ORG', 'PRODUCT', 'O', 'ORDINAL', 'ORDINAL', 'O', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O', 'O', 'O'], ['PRODUCT', 'PRODUCT', 'O', 'O', 'O', 'O'], ['ORG', 'PRODUCT', 'ORDINAL', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O'], ['PRODUCT', 'PRODUCT', 'ORG', 'PRODUCT', 'O'], ['ORG', 'PRODUCT', 'O', 'PRODUCT', 'ORG', 'ORDINAL', 'CARDINAL', 'CARDINAL'], ['O', 'ORG', 'PRODUCT', 'O', 'PRODUCT', 'O', 'MODEL', 'MODEL', 'MODEL', 'O', 'ORDINAL', 'ORDINAL', 'O', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O'], ['PRODUCT', 'PRODUCT', 'ORG', 'MODEL', 'CARDINAL', 'CARDINAL'], ['ORG', 'ORG', 'O', 'O', 'O', 'O', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'CARDINAL', 'CARDINAL'], ['O', 'PRODUCT', 'PRODUCT', 'ORG', 'ORG', 'MODEL', 'QUANTITY', 'O', 'O', 'CARDINAL', 'CARDINAL', 'O']]


In [28]:
print(pred_labels[:10])

[['ORG', 'PRODUCT', 'MODEL', 'MODEL', 'QUANTITY', 'QUANTITY', 'ORDINAL', 'ORDINAL', 'CARDINAL', 'CARDINAL'], ['ORG', 'PRODUCT', 'ORDINAL', 'ORDINAL', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'PRODUCT'], ['ORG', 'PRODUCT', 'ORDINAL', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'PRODUCT', 'ORG', 'PRODUCT'], ['ORG', 'PRODUCT', 'PRODUCT', 'ORG', 'ORDINAL', 'CARDINAL', 'CARDINAL'], ['ORG', 'PRODUCT', 'PRODUCT', 'MODEL', 'MODEL', 'MODEL', 'ORDINAL', 'ORDINAL', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'PRODUCT', 'ORG', 'MODEL', 'CARDINAL', 'CARDINAL'], ['ORG', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'PRODUCT', 'ORG', 'ORG', 'MODEL', 'QUANTITY', 'CARDINAL', 'CARDINAL']]
