In [1]:
import stanza

In [2]:
import torch

In [3]:
torch.cuda.is_available()

True

In [5]:
!ls -all ../saved_models/ner

total 47892
drwxr-xr-x 2 captain captain     4096 июн  5 13:46 .
drwxr-xr-x 3 captain captain     4096 июн  5 13:12 ..
-rw-r--r-- 1 captain captain 49030689 июн  5 14:32 ru_pampers_nertagger.pt


In [4]:
stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 115kB [00:00, 8.00MB/s]                    
2020-06-05 15:40:12 INFO: Downloading default packages for language: ru (Russian)...
2020-06-05 15:40:13 INFO: File exists: /home/captain/stanza_resources/ru/default.zip.
2020-06-05 15:40:18 INFO: Finished downloading models and saved to /home/captain/stanza_resources.


In [4]:
nlp = stanza.Pipeline('ru', processors='tokenize,ner', tokenize_pretokenized=True, ner_model_path="../saved_models/ner/ru_pampers_nertagger.pt")

2020-06-08 13:38:21 INFO: Loading these models for language: ru (Russian):
| Processor | Package                 |
---------------------------------------
| tokenize  | syntagrus               |
| ner       | ../saved_m...rtagger.pt |

2020-06-08 13:38:21 INFO: Use device: gpu
2020-06-08 13:38:21 INFO: Loading: tokenize
2020-06-08 13:38:21 INFO: Loading: ner
2020-06-08 13:38:23 INFO: Done loading processors!


In [5]:
nlp.loaded_processors[1].pipeline.processors["ner"].config

{'hidden_dim': 256,
 'char_hidden_dim': 100,
 'word_emb_dim': 100,
 'char_emb_dim': 100,
 'num_layers': 1,
 'char_num_layers': 1,
 'pretrain_max_vocab': 100000,
 'word_dropout': 0,
 'locked_dropout': 0.0,
 'dropout': 0.5,
 'rec_dropout': 0,
 'char_rec_dropout': 0,
 'char_dropout': 0,
 'char': True,
 'charlm': False,
 'charlm_shorthand': None,
 'char_lowercase': False,
 'lowercase': True,
 'emb_finetune': True,
 'input_transform': True,
 'scheme': 'bioes',
 'sample_train': 1.0,
 'optim': 'sgd',
 'lr': 0.1,
 'min_lr': 0.0001,
 'momentum': 0,
 'lr_decay': 0.5,
 'patience': 3,
 'max_steps': 200000,
 'eval_interval': 500,
 'batch_size': 32,
 'max_grad_norm': 5.0,
 'log_step': 20,
 'seed': 1234,
 'model_path': '../saved_models/ner/ru_pampers_nertagger.pt',
 'forward_charlm_path': '/home/captain/stanza_resources/ru/forward_charlm/newswiki.pt',
 'backward_charlm_path': '/home/captain/stanza_resources/ru/backward_charlm/newswiki.pt',
 'lang': 'ru',
 'mode': 'predict'}

In [6]:
print(nlp.loaded_processors[1].pipeline.processors["tokenize"].config)

{'model_path': '/home/captain/stanza_resources/ru/tokenize/syntagrus.pt', 'pretokenized': True, 'lang': 'ru', 'mode': 'predict'}


In [7]:
doc = nlp("Подгузники-трусики Bella baby Happy")
print(*[f'entity: {ent.text}\ttype: {ent.type}' for sent in doc.sentences for ent in sent.ents], sep='\n')

entity: Подгузники-трусики	type: PRODUCT
entity: Bella	type: ORG
entity: baby	type: ORG
entity: Happy	type: ORG


In [8]:
from stanza.models.ner.data import DataLoader

In [9]:
doc = nlp("Подгузники-трусики Bella baby Happy")
print(*[f'entity: {ent.text}\ttype: {ent.type}' for sent in doc.sentences for ent in sent.ents], sep='\n')

entity: Подгузники-трусики	type: PRODUCT
entity: Bella	type: ORG
entity: baby	type: ORG
entity: Happy	type: ORG


In [12]:
doc = nlp("Трусики-подгузники PAMPERS 2-3кг 28шт")
print(*[f'entity: {ent.text}\ttype: {ent.type}' for sent in doc.sentences for ent in sent.ents], sep='\n')

entity: Трусики-подгузники	type: PRODUCT
entity: PAMPERS	type: ORG
entity: 2-3кг	type: QUANTITY
entity: 28шт	type: CARDINAL


In [13]:
def predict(line):
    doc = nlp(line)
    labels = [ent.type for sent in doc.sentences for ent in sent.ents]
    text = [ent.text for sent in doc.sentences for ent in sent.ents]
    return (labels, text)
predict("Влажные салфетки Bella Sensitive, детские, 208 шт")

(['PRODUCT', 'PRODUCT', 'ORG', 'CARDINAL', 'CARDINAL'],
 ['Влажные', 'салфетки', 'Bella', '208', 'шт'])

 ### получить данные как остальные классификаторы

In [14]:
from collections import Counter

counter_TP= Counter()
counter_FN = Counter()
counter_FP = Counter()

def calculate_match(true_values, pred_values):
    for i, label in enumerate(filter(lambda v: v !="O",true_values)):
        if len(pred_values) <= i:
            counter_FN[label] += 1
            #counter_FP[pred_value] +=1
            continue
            
        pred_value = pred_values[i]    
        if  label == pred_value:
            counter_TP[label] += 1
        else:
            counter_FN[label] += 1
            counter_FP[pred_value] +=1

### прочитать валидационный корпус

In [15]:
valid_df_path="../data/ner/Russian-pampers/test.bio"
test_dataset = []
with open(valid_df_path, "r") as fp:
    acc = []
    for line in fp:
        if (line != "\n"):
            tmp = line.strip().split("\t")
            acc.append((tmp[0], tmp[1]))
        else:
            test_dataset.append(acc)
            acc = []
test_dataset[:2]

[[('Huggies', 'ORG'),
  ('Подгузники', 'PRODUCT'),
  ('Elite', 'MODEL'),
  ('Soft', 'MODEL'),
  ('12-22', 'QUANTITY'),
  ('кг', 'QUANTITY'),
  ('(', 'O'),
  ('размер', 'ORDINAL'),
  ('5', 'ORDINAL'),
  (')', 'O'),
  ('112', 'CARDINAL'),
  ('шт', 'CARDINAL'),
  ('Уцененный', 'O'),
  ('товар', 'O'),
  ('(№12)', 'O')],
 [('"OONIES', 'ORG'),
  ('Подгузники', 'PRODUCT'),
  (',', 'O'),
  ('размер', 'ORDINAL'),
  ('S', 'ORDINAL'),
  ('(', 'O'),
  ('3-7', 'QUANTITY'),
  ('кг', 'QUANTITY'),
  ('),', 'O'),
  ('72', 'CARDINAL'),
  ('шт', 'CARDINAL'),
  ('.(JOONIES?BUEBCHEN', 'O'),
  ('новые', 'O'),
  ('соски', 'O'),
  ('и', 'O'),
  ('бутылочки"', 'O')]]

In [16]:
test_labels = []
test_texts = []
for line in test_dataset:
    test_labels.append(list(map(lambda x: x[1], line)))
    test_texts.append(" ".join(map(lambda x: x[0], line)))
    
print(test_labels[:2])
print(test_texts[:2])

[['ORG', 'PRODUCT', 'MODEL', 'MODEL', 'QUANTITY', 'QUANTITY', 'O', 'ORDINAL', 'ORDINAL', 'O', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O'], ['ORG', 'PRODUCT', 'O', 'ORDINAL', 'ORDINAL', 'O', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O', 'O', 'O']]
['Huggies Подгузники Elite Soft 12-22 кг ( размер 5 ) 112 шт Уцененный товар (№12)', '"OONIES Подгузники , размер S ( 3-7 кг ), 72 шт .(JOONIES?BUEBCHEN новые соски и бутылочки"']


In [26]:
len(test_texts)

2103

### сохранить в файл транзакции

In [36]:
with open("/tmp/pampers_test.tsv", "w+") as fo:
    for line in test_texts:
        fo.write(line+"\n")

In [37]:
!head /tmp/pampers_test.tsv

Huggies Подгузники Elite Soft 12-22 кг ( размер 5 ) 112 шт Уцененный товар (№12)
"OONIES Подгузники , размер S ( 3-7 кг ), 72 шт .(JOONIES?BUEBCHEN новые соски и бутылочки"
Бамбуковые трусики Ракета на голубом (классика)
Merries Подгузники-трусики M 6-11 кг 58 шт 3 упаковки Уцененный товар (№23)
Многоразовый подгузник Чудо-чадо Пеленка-подгузник ситцевый
iD Подгузники для взрослых Slip M 30 шт
" Seni Подгузники для взрослых ' Super Seni Plus ', размер 3 ( 100-150 см ), 30 шт "
Wet Wipes Pampers Sensitive 224 pcs
Johnson`s baby От макушки до пяточек Влажные салфетки детские 15 шт
" Детские пеленки Helen Harper Soft&Dry 40*60 , ПРОМО, 30 шт 1401601"


### получить предсказания

In [17]:
%%time
pred_labels = []
pred_text =[]
for line in test_texts:
    tmp = predict(line)
    pred_labels.append(tmp[0])
    pred_text.append(tmp[1])
    
len([item for sublist in pred_labels for item in sublist])

CPU times: user 14.8 s, sys: 37.7 ms, total: 14.8 s
Wall time: 14.8 s


15891

In [22]:
len(test_texts)/15 # скорость порядка 140 транзакций в секунду

140.2

In [23]:
counter_TP= Counter()
counter_FN = Counter()
counter_FP = Counter()

for y_test_l, y_pred_l in zip(test_labels, pred_labels):
    calculate_match(y_test_l, y_pred_l)
    
labels=["ORDINAL", "ORG", "PRODUCT", "MODEL", "QUANTITY"]
print("Label\tPrecision\tRecall\tF1\tTruePositive\tFalsePositiv\tFalseNegative")
for label in labels:
    precision = counter_TP.get(label, 0) / (counter_TP.get(label, 1) + counter_FP.get(label, 0))
    recall = counter_TP.get(label, 0) / (counter_TP.get(label, 1) + counter_FN.get(label, 0))
    f1 = 2 * (precision*recall) / max((precision + recall),1)
    print("%s\t\t%.3f\t\t%.3f\t%.3f\t%d\t\t%d\t\t%d" % (label, precision, recall, f1, counter_TP.get(label, 0), counter_FP.get(label, 0), counter_FN.get(label, 0)))

Label	Precision	Recall	F1	TruePositive	FalsePositiv	FalseNegative
ORDINAL		0.957		0.956	0.956	2047		93		94
ORG		0.955		0.966	0.960	2134		101		75
PRODUCT		0.975		0.972	0.974	3458		87		100
MODEL		0.951		0.925	0.938	1499		77		122
QUANTITY		0.969		0.960	0.964	2836		91		119


In [24]:
def calculate_match_debug(true_values, pred_values):
    for i, label in enumerate(filter(lambda v: v !="O",true_values)): #stamza не показывает O
        if len(pred_values) <= i:
            counter_FN[label] += 1
            #counter_FP[pred_value] +=1
            continue
            
        pred_value = pred_values[i]    
        if  label == pred_value:
            print (f"{i}:{label} == {pred_value}")
            counter_TP[label] += 1
        else:
            print(f"{i}:{label} <> {pred_value}")
            counter_FN[label] += 1
            counter_FP[pred_value] +=1

In [32]:
counter_TP= Counter()
counter_FN = Counter()
counter_FP = Counter()

index = 2 #11, 15
print(test_dataset[index])
print(list(zip(pred_text[index], pred_labels[index])))
print("-------------------------------------------------------------------------")
calculate_match_debug(test_labels[index], pred_labels[index])
labels=["ORDINAL", "ORG", "PRODUCT", "MODEL", "QUANTITY"]
print("-------------------------------------------------------------------------")
print("Label\t\tPrecision\tRecall\tF1\tTruePositive\tFalsePositiv\tFalseNegative")
for label in labels:
    precision = counter_TP.get(label, 0) / (counter_TP.get(label, 1) + counter_FP.get(label, 0))
    recall = counter_TP.get(label, 0) / (counter_TP.get(label, 1) + counter_FN.get(label, 0))
    f1 = 2 * (precision*recall) / max((precision + recall),1)
    print("%s\t\t%.3f\t\t%.3f\t%.3f\t%d\t\t%d\t\t%d" % (label, precision, recall, f1, counter_TP.get(label, 0), counter_FP.get(label, 0), counter_FN.get(label, 0)))

[('Бамбуковые', 'PRODUCT'), ('трусики', 'PRODUCT'), ('Ракета', 'O'), ('на', 'O'), ('голубом', 'O'), ('(классика)', 'O')]
[('Бамбуковые', 'PRODUCT'), ('трусики', 'PRODUCT'), ('Ракета', 'ORG')]
-------------------------------------------------------------------------
0:PRODUCT == PRODUCT
1:PRODUCT == PRODUCT
-------------------------------------------------------------------------
Label		Precision	Recall	F1	TruePositive	FalsePositiv	FalseNegative
ORDINAL		0.000		0.000	0.000	0		0		0
ORG		0.000		0.000	0.000	0		0		0
PRODUCT		1.000		1.000	1.000	2		0		0
MODEL		0.000		0.000	0.000	0		0		0
QUANTITY		0.000		0.000	0.000	0		0		0


In [29]:
print(test_labels[:10])

[['ORG', 'PRODUCT', 'MODEL', 'MODEL', 'QUANTITY', 'QUANTITY', 'O', 'ORDINAL', 'ORDINAL', 'O', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O'], ['ORG', 'PRODUCT', 'O', 'ORDINAL', 'ORDINAL', 'O', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O', 'O', 'O'], ['PRODUCT', 'PRODUCT', 'O', 'O', 'O', 'O'], ['ORG', 'PRODUCT', 'ORDINAL', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'O', 'O', 'O'], ['PRODUCT', 'PRODUCT', 'ORG', 'PRODUCT', 'O'], ['ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'MODEL', 'ORDINAL', 'CARDINAL', 'CARDINAL'], ['O', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'O', 'MODEL', 'MODEL', 'MODEL', 'O', 'ORDINAL', 'ORDINAL', 'O', 'QUANTITY', 'QUANTITY', 'O', 'CARDINAL', 'CARDINAL', 'O'], ['PRODUCT', 'PRODUCT', 'ORG', 'MODEL', 'CARDINAL', 'CARDINAL'], ['ORG', 'ORG', 'O', 'O', 'O', 'O', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'CARDINAL', 'CARDINAL'], ['O', 'PRODUCT', 'PRODUCT', 'ORG', 'ORG', 'MODEL', 'QUANTITY', 'O', 'O', 'CARDINAL', 'CARDINAL', 'O']]


In [30]:
print(pred_labels[:10])

[['ORG', 'PRODUCT', 'MODEL', 'MODEL', 'QUANTITY', 'QUANTITY', 'ORDINAL', 'ORDINAL', 'CARDINAL', 'CARDINAL'], ['ORG', 'PRODUCT', 'ORDINAL', 'ORDINAL', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'PRODUCT', 'ORG'], ['ORG', 'PRODUCT', 'ORDINAL', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'PRODUCT', 'ORG', 'PRODUCT'], ['PRODUCT', 'PRODUCT', 'PRODUCT', 'ORG', 'ORDINAL', 'CARDINAL', 'CARDINAL'], ['ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'MODEL', 'MODEL', 'MODEL', 'ORDINAL', 'ORDINAL', 'QUANTITY', 'QUANTITY', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'PRODUCT', 'ORG', 'MODEL', 'CARDINAL', 'CARDINAL'], ['ORG', 'ORG', 'PRODUCT', 'PRODUCT', 'PRODUCT', 'CARDINAL', 'CARDINAL'], ['PRODUCT', 'PRODUCT', 'ORG', 'ORG', 'MODEL', 'QUANTITY', 'CARDINAL', 'CARDINAL']]
