## Тема разметка и извлечение именованных сущностей

Задание 1. Написать теггер на данных с русским языком
проверить UnigramTagger, BigramTagger, TrigramTagger и их комбинации


**Working on medical dataset with symptoms, medical treatment and patients' feedback**  
from https://github.com/cimm-kzn/RuDReC/blob/master/data/rudrec_annotated.json

In [1]:
#nltk.download() 
import nltk
from nltk.tokenize import word_tokenize
import matplotlib
%matplotlib inline

In [2]:
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from razdel import tokenize
import json
from tqdm import notebook
from corus import load_rudrec

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import re

In [6]:
import pandas as pd

In [7]:
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger, TrigramTagger
from nltk.tag import RegexpTagger


In [8]:
# downloading medical records (simptoms, treatment)

docs = []
for rec in load_rudrec("../data/rudrec_annotated.json"):
    words = []
    labels = []
    idx_ent = -1
    len_ents = len(rec.entities)
    rec_entities = sorted(rec.entities, key=lambda v: v.start)
    ent = None
    is_start = None
    for token in tokenize(rec.text):
        type_ent = 'OUT'
        if len_ents == 0:
            words.append(token.text)
            labels.append(type_ent)
            
            continue
        if (idx_ent == -1) or (idx_ent + 1 < len_ents and token.start > ent.end):
            idx_ent += 1
            ent = rec_entities[idx_ent]
            is_start = True
            
        if (token.start >= ent.start) and (token.stop <= ent.end):
            type_ent = 'B-' + ent.entity_type if is_start else 'I-' + ent.entity_type
        
        words.append(token.text)
        labels.append(type_ent)
           
        
    docs.append([words, labels])

In [9]:
docs[9]

[['Время', 'использования', ':', '3', 'недели'],
 ['OUT', 'OUT', 'OUT', 'OUT', 'OUT']]

In [10]:
train_ready = []
valid_tagged = []
valid_untagged = []

for irec, rec in enumerate(docs):
    if irec <= 3000:
        train_ready.append(list(zip(*rec)))     
    else:
        valid_tagged.append(list(zip(*rec)))
        valid_untagged.append(rec[0])
      

In [11]:
train_ready[5]

[('Принимала', 'OUT'),
 ('Иммунал', 'B-Drugname'),
 ('строго', 'OUT'),
 ('по', 'OUT'),
 ('инструкции', 'OUT'),
 ('4', 'OUT'),
 ('раза', 'OUT'),
 ('в', 'OUT'),
 ('день', 'OUT'),
 ('3', 'OUT'),
 ('недели', 'OUT'),
 ('.', 'OUT')]

In [12]:
valid_tagged[3]

[('Рекомендую', 'OUT'), ('всем', 'OUT'), ('.', 'OUT')]

In [13]:
true_tags_valid = [t[1] for t_list in valid_tagged for t in t_list]

In [14]:
valid_untagged[3]

['Рекомендую', 'всем', '.']

In [15]:
valid_fortag = [i for i_list in valid_untagged for i in i_list]

In [16]:
valid_fortag[:5]

['Ампулу', 'нужно', 'использовать', 'в', 'течение']

#### UnigramTagger


In [17]:
unigram_tagger = UnigramTagger(train_ready)

In [18]:
valid_tags = unigram_tagger.tag(valid_fortag)

In [19]:
display(valid_tags[:15], unigram_tagger.evaluate(valid_tagged))

[('Ампулу', None),
 ('нужно', 'OUT'),
 ('использовать', 'OUT'),
 ('в', 'OUT'),
 ('течение', 'OUT'),
 ('короткого', None),
 ('времени', 'OUT'),
 ('.', 'OUT'),
 ('"', 'OUT'),
 ('Интерферон', 'B-Drugname'),
 ('"', 'OUT'),
 ('является', 'OUT'),
 ('одним', 'OUT'),
 ('из', 'OUT'),
 ('недорогих', None)]

0.8312119319069187

#### BigramTagger

In [20]:
bigram_tagger = BigramTagger(train_ready, backoff=unigram_tagger)
display(bigram_tagger.tag(valid_fortag)[:15], bigram_tagger.evaluate(valid_tagged))

[('Ампулу', None),
 ('нужно', 'OUT'),
 ('использовать', 'OUT'),
 ('в', 'OUT'),
 ('течение', 'OUT'),
 ('короткого', None),
 ('времени', 'OUT'),
 ('.', 'OUT'),
 ('"', 'OUT'),
 ('Интерферон', 'B-Drugname'),
 ('"', 'OUT'),
 ('является', 'OUT'),
 ('одним', 'OUT'),
 ('из', 'OUT'),
 ('недорогих', None)]

0.8299625175698891

#### TrigramTagger

In [21]:
trigram_tagger = TrigramTagger(train_ready, backoff=bigram_tagger)
display(trigram_tagger.tag(valid_fortag)[:15], trigram_tagger.evaluate(valid_tagged))

[('Ампулу', None),
 ('нужно', 'OUT'),
 ('использовать', 'OUT'),
 ('в', 'OUT'),
 ('течение', 'OUT'),
 ('короткого', None),
 ('времени', 'OUT'),
 ('.', 'OUT'),
 ('"', 'OUT'),
 ('Интерферон', 'B-Drugname'),
 ('"', 'OUT'),
 ('является', 'OUT'),
 ('одним', 'OUT'),
 ('из', 'OUT'),
 ('недорогих', None)]

0.8297282523816961

**Accuracy in unigram**

In [22]:
df_unigram = pd.DataFrame({"token": valid_fortag,
                           "pred_tag": [tag[1] for tag in valid_tags],
                           "true_tag": [t[1] for t_list in valid_tagged for t in t_list]},
                          columns=["token", "pred_tag", "true_tag"])

In [23]:
df_unigram.head(5)

Unnamed: 0,token,pred_tag,true_tag
0,Ампулу,,OUT
1,нужно,OUT,OUT
2,использовать,OUT,OUT
3,в,OUT,OUT
4,течение,OUT,OUT


In [24]:
df_unigram['accuracy_tag'] = 0

In [25]:
df_unigram.loc[df_unigram['true_tag']==df_unigram['pred_tag'], ['accuracy_tag']] = 1

In [26]:
df_unigram.head(3)

Unnamed: 0,token,pred_tag,true_tag,accuracy_tag
0,Ампулу,,OUT,0
1,нужно,OUT,OUT,1
2,использовать,OUT,OUT,1


In [27]:
df_unigram['true_tag'].value_counts()

OUT            23086
B-DI             854
B-ADR            676
B-Drugname       412
B-Drugform       303
B-Finding        154
B-Drugclass      127
Name: true_tag, dtype: int64

In [28]:
df_unigram['pred_tag'].value_counts()

OUT            20722
B-DI             553
B-Drugform       264
B-ADR            247
B-Drugname       221
B-Drugclass       84
B-Finding         48
Name: pred_tag, dtype: int64

In [29]:
df_unigram.loc[(df_unigram['true_tag']=='OUT'), ['accuracy_tag']].value_counts()

accuracy_tag
1               20229
0                2857
dtype: int64

In [30]:
def get_accuracy(tag_name):
    accuracy_tags = df_unigram.loc[(df_unigram['true_tag']==tag_name), ['accuracy_tag']].value_counts()
    accuracy_percent = accuracy_tags[1] / (accuracy_tags[1]+accuracy_tags[0])
    return accuracy_percent

In [31]:
for tag_name in df_unigram['pred_tag'].value_counts().index:
    print(f'Accuracy for {tag_name} tag is {get_accuracy(tag_name)} %')

Accuracy for OUT tag is 0.876245343498224 %
Accuracy for B-DI tag is 0.405152224824356 %
Accuracy for B-Drugform tag is 0.834983498349835 %
Accuracy for B-ADR tag is 0.23076923076923078 %
Accuracy for B-Drugname tag is 0.5169902912621359 %
Accuracy for B-Drugclass tag is 0.6456692913385826 %
Accuracy for B-Finding tag is 0.06493506493506493 %


#### Комбинация тэггеров

Примущество Backoff Tagging в том, что если текущий тэггер не знает, как тэггировать слово, он передает это следующему и так далее, пока не пройдет перебор по всем тэггерам. В данному случае тэггирование производит последовательность UnigramTagger, BigramTagger, TrigramTagger. Комбинация тэггеров дала немного лучший результат, чем UnigramTagger, BigramTagger по отдельности.

In [32]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff


backoff = DefaultTagger('NN') 
tag = backoff_tagger(train_ready,  
                     [UnigramTagger, BigramTagger, TrigramTagger],  
                     backoff = backoff) 
  
tag.evaluate(valid_tagged) 

0.8297282523816961

In [33]:
# the score ia identical to Trigram Tagger. Checking without backoff_tagger() function

In [34]:
tag1 = UnigramTagger(train_ready, backoff=backoff)
tag2 = BigramTagger(train_ready, backoff=tag1)
tag3 = TrigramTagger(train_ready, backoff=tag2)

In [35]:
tag3.evaluate(valid_tagged)

0.8297282523816961

In [36]:
# the same. Backoff on previous tagging didn't improve the result.