# Demo1: Role Keywords Extraction (抽取角色关键词)

In [1]:
# Import the project's keyword extractor and build an English-language instance.
# Per the cell output below, construction reports the language and loads word vectors.
from keywords_extractor import KeywordsExtractor
KE = KeywordsExtractor(lang='en')



Language: English
Loading word vectors......


In [2]:
# Read the BBC-100 training split and pull out the text and label columns
# as plain Python lists.
import pandas as pd

data = pd.read_csv('data/bbc_100/train.csv')
contents, labels = (list(data[column]) for column in ('content', 'label'))

In [3]:
# Extract the global role keywords for the training set.
# As the cell output below shows, dicts already saved under output_dir are
# reported as existing instead of being announced as newly created.
kws_dict = KE.global_role_kws_extraction_one_line(
    contents,
    labels,
    output_dir='saved_keywords',
    name='bbc_100',
)
kws_dict.keys()

ls dict already exists at:  saved_keywords/global_ls_dict_bbc_100.pkl
lr dict already exists at:  saved_keywords/global_lr_dict_bbc_100.pkl
global roles dict already exists at:  saved_keywords/global_kws_dict_bbc_100.pkl


dict_keys(['global_ls', 'global_lr', 'global_roles'])

In [8]:
# For every category, show the first ten keywords of each word role.
role_names = ['ccw', 'scw', 'fcw', 'iw']
for category_name, role_words in kws_dict['global_roles'].items():
    print(f"keywords for \"{category_name}\":")
    for role in role_names:
        top_words = role_words[role][:10]
        print(f"{role}: {top_words}")

keywords for "entertainment":
ccw: ['concerts', 'Music', 'festival', 'producer', 'movie', 'dance', 'gigs', 'Awards', 'concert', 'singers']
scw: ['entertainment', 'media', 'news', 'online', 'social', 'culture', 'education', 'feature', 'features', 'world']
fcw: ['Award', 'commit', 'Academy', 'bassist', 'copies', 'anniversary', 'Aids', 'Led', '8am', 'Will']
iw: ['was', 'about', 'not', 'Mr', 'last', 'months', 'In', 'when', 'found', 'A']
keywords for "tech":
ccw: ['software', 'computer', 'PC', 'devices', 'gadget', 'Internet', 'broadband', 'video', 'images', 'technologies']
scw: ['manufacturing', 'Google', 'telecoms', 'modern', 'energy', 'art', 'business', 'Hollywood', 'Chinese', 'businesses']
fcw: ['distribute', 'improve', 'managing', 'listen', 'households', 'downloaded', 'sharing', 'Currently', 'broadcaster', 'severe']
iw: ['had', 'his', 'singer', 'Sunday', 'rights', 'third', 'side', 'actions', 'second', 'spokesman']
keywords for "politics":
ccw: ['Labour', 'Liberal', 'Tories', 'Secretary'

dict_keys(['lr', 'ls', 'ccw', 'scw', 'fcw', 'iw'])

# Demo2: Selective Text Augmentation (针对性文本增强)

In [39]:
# Import explicitly so this cell also works after Restart Kernel -> Run All;
# with the import commented out, TextAugmenter is undefined on a fresh kernel.
from text_augmenter import TextAugmenter
TA = TextAugmenter(lang='en')

Language: English


在`TextAugmenter`类中，对删除、替换、插入、顺序互换等增强操作(operations)做了统一的接口:
- .aug_by_deletion(text, p, mode, selected_words)
- .aug_by_replacement(text, p, mode, selected_words)
- .aug_by_insertion(text, p, mode, selected_words)
- .aug_by_swap(text, p, mode, selected_words)
- .aug_by_selection(text, selected_words)

上述5种方法中，除了`aug_by_selection()`之外，其余方法均可通过设置`mode='random'`或者`mode='selective'`来决定使用“随机”增强还是“针对性”增强。

## 当使用随机增强时 (`mode='random'`):

In [11]:
# Display one raw training document (index 10); the demo sentence used below
# is the first sentence of this document after its headline.
contents[10]

"Parmalat bank barred from suing\n\nBank of America has been banned from suing Parmalat, the food group which went bust in 2003 after an accounting scandal.\n\nThe bank - along with investors, auditors and the group's managers - wants damages for being a victim of fraud at the hands of the Italian firm. But a judge has barred Bank of America and two auditors from the case. The bank, and Italaudit - formerly the Italian arm of auditor Grant Thornton - face lawsuits and possible prosecution. A second auditor, Deloitte & Touche, has also been banned from the case. Grant Thornton - now rid of the Italian unit at the centre of the case - is still being permitted to sue, as are Consob, Italy's stock market regulator, hundreds of small investors and Parmalat's new managers. Parmalat collapsed in December 2003 after it emerged that the 4bn euros ($5.2bn; 拢2.8bn) it supposedly held in a Bank of American offshore account did not in fact exist.\n"

In [41]:
# Run each augmentation operation in random mode on the same sentence.
# `sentence` and `p` are reused by the later cells.
sentence = "Bank of America has been banned from suing Parmalat, the food group which went bust in 2003 after an accounting scandal"
p = 0.1
for augment in (TA.aug_by_deletion, TA.aug_by_replacement,
                TA.aug_by_insertion, TA.aug_by_swap):
    augmented_tokens = augment(text=sentence, p=p, mode='random')
    print(' '.join(augmented_tokens))

Bank of America has been banned from suing Parmalat , the food group which went bust in 2003 after an accounting scandal
Bank of America has been banned from suing Parmalat , the meal group which went bust in 2003 after an accounting scandal
Bank of America has been banned from suing Parmalat becom_ing , the food group which went bust in food_pantries_shelters 2003 after an accounting scandal
suing of America has been in from Bank Parmalat , the food group which went bust banned 2003 after an accounting scandal


## 当使用针对性增强时 (`mode='selective'`)
跟随机增强相比，针对性增强只需将`mode`设为`'selective'`，并额外指定对应的`selected_words`即可：

In [42]:
# Run the four operations in selective mode, restricted to the chosen words.
# A fresh selected_words list is built for every call.
for augment in (TA.aug_by_deletion, TA.aug_by_replacement,
                TA.aug_by_insertion, TA.aug_by_swap):
    augmented_tokens = augment(text=sentence, p=p, mode='selective',
                               selected_words=['food', 'banned'])
    print(' '.join(augmented_tokens))

# Selection takes no p/mode arguments here, only the words to select.
selected_tokens = TA.aug_by_selection(text=sentence, selected_words=['Bank', 'accounting'])
print(' '.join(selected_tokens))

Bank of America has been from suing Parmalat , the group which went bust in 2003 after an accounting scandal
Bank of America has been prohibiting from suing Parmalat , the nourishing_meals group which went bust in 2003 after an accounting scandal
Bank of America has been banned from suing prohibits Parmalat , the food group which nutritious_foods went bust in 2003 after an accounting scandal
Bank of America has been group from suing Parmalat , the in banned which went bust food 2003 after an accounting scandal
Bank accounting


在文本分类任务中，不同的词可能会有不同的角色(roles)。在我们的论文中，我们提出如下规则：
- 对于 deletion/replacement 操作，应避开 gold words（对应下文代码中的 `ccw`）
- 对于 insertion 操作，应避开 venture words（对应下文代码中的 `fcw`）
- 对于 selection 操作，直接选取 gold words 和标点

In [43]:
# Load the keyword dict that the extraction step saved to disk.
# NOTE: pickle.load executes arbitrary code from the file; only load files
# produced locally by this project.
import pickle

name = 'bbc_100'
global_kws_dict_path = f'saved_keywords/global_kws_dict_{name}.pkl'
with open(global_kws_dict_path, mode='rb') as kws_file:
    global_kws_dict = pickle.load(kws_file)

In [51]:
# Role-aware augmentation for one category, following the rules stated above.
category = 'business'
kws = global_kws_dict[category]

# Deletion and replacement may touch every role except ccw (the gold words).
print(' '.join(TA.aug_by_deletion(sentence, p, 'selective', print_info=True,
                   selected_words=kws['scw']+kws['fcw']+kws['iw'])))  # except ccw
print(' '.join(TA.aug_by_replacement(sentence, p, 'selective', print_info=True,
                   selected_words=kws['scw']+kws['fcw']+kws['iw'])))  # except ccw
# Insertion may use every role except fcw (the venture words).
print(' '.join(TA.aug_by_insertion(sentence, p, 'selective', print_info=True,
                   selected_words=kws['ccw']+kws['scw']+kws['iw'])))  # except fcw

# Selection keeps the ccw words plus punctuation (ASCII and full-width marks).
punc_list = list(',.，。!?！？;；、')
print(' '.join(TA.aug_by_selection(sentence, print_info=True,
                    selected_words=kws['ccw']+punc_list)))

deletion info: ['banned', 'went']
Bank of America has been from suing Parmalat , the food group which bust in 2003 after an accounting scandal
replacement info: [('went', 'drove'), ('banned', 'ban')]
Bank of America has been ban from suing Parmalat , the food group which drove bust in 2003 after an accounting scandal
insertion info: [('accounting', 'Irina_Parkhomenko_spokeswoman'), ('went', 'gone')]
Bank of America Irina_Parkhomenko_spokeswoman has been banned from suing Parmalat , the food group which went gone bust in 2003 after an accounting scandal
selection info: Parmalat
selection info: ,
selection info: bust
selection info: accounting
Parmalat , bust accounting
