# NER tutorial: EnviBert model

> This notebook contains some examples of how to use an EnviBert-based model for the NER (named-entity recognition) task

- skip_showdoc: true
- skip_exec: true

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *

## Load lib

In [None]:
# Later cells call TextDataMain, sampling_with_condition and remove_vnmese_accent without
# importing them anywhere else; presumably these modules provide them (verify against the
# installed that_nlp_library version). Without these imports a fresh-kernel run fails
# with NameError.
from that_nlp_library.text_transformation import *
from that_nlp_library.text_augmentation import *
from that_nlp_library.text_main import *

In [None]:
# These names are used by later cells (text_normalize in the text transforms, partial for
# the augmentation/metric functions, SourceFileLoader for the tokenizer, and
# DataCollatorWithPadding for the collator), so they must be imported for a fresh-kernel run.
from underthesea import text_normalize
from functools import partial
# from pathlib import Path  # already imported in the main import cell below
from importlib.machinery import SourceFileLoader
from transformers import DataCollatorWithPadding


In [None]:
import numpy as np
import pandas as pd

import torch

import os
import re

from collections import defaultdict
from datasets import DatasetDict,Dataset

import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.configuration_roberta import RobertaConfig

from transformers.models.roberta.modeling_roberta import RobertaModel # body only
from transformers import TrainingArguments
#inherit this to load pretrained weight
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel
from datasets import Features,ClassLabel

from transformers import AutoConfig

from pathlib import Path

In [None]:
DATA_PATH = Path('./data')
TRAIN_PATH = DATA_PATH/'train'
TEST_PATH = DATA_PATH/'test'

In [None]:
def read_text_file(file_path, encoding='utf-8'):
    """Return the whole content of a text file as a single string.

    Inputs:
        - file_path: path (str or path-like) of the file to read
        - encoding: text encoding used to decode the file. Defaults to 'utf-8' so that
          decoding does not depend on the platform's locale default. If a file starts
          with a byte-order mark and it should be dropped, pass 'utf-8-sig'.

    Outputs:
        - the decoded file content
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()


def read_text_file_line(file_path, encoding='utf-8'):
    """Return the content of a text file as a list of lines.

    Inputs:
        - file_path: path (str or path-like) of the file to read
        - encoding: text encoding used to decode the file. Defaults to 'utf-8' so that
          decoding does not depend on the platform's locale default.

    Outputs:
        - a list of lines; each line keeps its trailing newline (as `readlines` returns it)
    """
    with open(file_path, 'r', encoding=encoding) as f:
        return f.readlines()


def write_text_file_line(file_path,contents, encoding='utf-8'):
    """Write an iterable of strings to a text file, overwriting any existing file.

    Inputs:
        - file_path: destination path (str or path-like)
        - contents: iterable of strings; they are written as-is, so each item must
          carry its own newline if one is wanted
        - encoding: text encoding used to encode the file. Defaults to 'utf-8' so that
          the output does not depend on the platform's locale default.
    """
    with open(file_path,'w', encoding=encoding) as f:
        f.writelines(contents)
    # f-string formatting (instead of `str + file_path`) also works for pathlib.Path arguments
    print(f'Write sucessfully to {file_path}')

def process_each_file(file_str,f_name,is_train=True,max_len=360):
    """
    Parse the content of one corpus file into chunks of concatenated sentences.

    A sentence is the text enclosed in `<s>...</s>`. Every non-blank line inside a
    sentence is one token row whose columns are separated by tabs. Consecutive sentences
    are accumulated into the current chunk; when adding the next sentence would bring the
    chunk to `max_len` tokens or more, the current chunk is closed and the sentence starts
    a new one. A single sentence that is itself `max_len` tokens or longer is kept whole.
    Sentences without any token row are skipped.

    Inputs:
        - file_str: all texts from f_name
        - f_name: name of the file; repeated once per token in the returned fname lists
        - is_train: if True, every sentence must have 5 columns (token, POS, chunk, NER
          and a fifth column that is ignored); otherwise 3 columns (token, POS, chunk)
        - max_len: maximum len of a token list (to be suitable for BERT max_sequence_len of 514)

    Outputs:
        - a list of token list,
        - a list of POS tag,
        - a list of chunking tag,
        - a list of Named-Entity tag (one empty list per chunk when is_train is False)
        - a list of fname

    Raises:
        - ValueError: if a sentence does not have the expected number of columns
    """
    pat = r'<s>([\S\n\t\v ]*?)</s>'
    sentences = re.findall(pat,file_str) # list of raw strings inside <s> tag
    n_cols = 5 if is_train else 3

    # finished chunks
    token_list,pos_list,chunk_list,ner_list,name_list=[],[],[],[],[]
    # chunk currently being accumulated
    _token_list,_pos_list,_chunk_list,_ner_list,_name_list=[],[],[],[],[]

    for sent in sentences:
        rows = [line.strip().split('\t') for line in sent.strip().split('\n') if line.strip()]
        if not rows:
            # empty <s></s> pair: nothing to add (previously crashed on tuple unpacking)
            continue

        # transpose rows -> columns; zip truncates to the shortest row
        columns = list(zip(*rows))
        if len(columns) != n_cols:
            raise ValueError(
                f'{f_name}: expected {n_cols} tab-separated columns per token row, '
                f'found {len(columns)}'
            )
        tokens,pos,chunk = columns[:3]
        ner = columns[3] if is_train else ()

        # accumulating sentences until the chunk would reach max_len
        if len(_token_list)+len(tokens) >= max_len and len(_token_list):
            token_list.append(_token_list)
            pos_list.append(_pos_list)
            chunk_list.append(_chunk_list)
            name_list.append(_name_list)
            ner_list.append(_ner_list)
            _token_list,_pos_list,_chunk_list,_ner_list,_name_list=[],[],[],[],[]

        _token_list+=tokens
        _pos_list+=pos
        _chunk_list+=chunk
        _name_list+=[f_name]*len(tokens)
        _ner_list+=ner

    # flush the last, partially filled chunk
    if len(_token_list):
        token_list.append(_token_list)
        pos_list.append(_pos_list)
        chunk_list.append(_chunk_list)
        name_list.append(_name_list)
        ner_list.append(_ner_list)

    return token_list,pos_list,chunk_list,ner_list,name_list
            
            

Create Huggingface dataset for training set


## Load NER Vietnamese data

In [None]:

# Parse every .txt file under TRAIN_PATH (in sorted file-name order) and pool the chunks
# returned by process_each_file into five parallel lists.
final_token_list,final_pos_list,final_chunk_list,final_ner_list,final_name_list=[],[],[],[],[]
accumulators = (final_token_list,final_pos_list,final_chunk_list,final_ner_list,final_name_list)

for file in np.sort(os.listdir(TRAIN_PATH)):
    if not file.endswith(".txt"):
        continue  # ignore anything that is not a text file
    file_str = read_text_file(f"{TRAIN_PATH}/{file}")
    # process_each_file returns its lists in the same order as `accumulators`
    for accumulated, chunks in zip(accumulators, process_each_file(file_str,file)):
        accumulated.extend(chunks)

train_dataset = Dataset.from_dict({
    'tokens': final_token_list,
    'pos_str': final_pos_list,
    'chunk_str': final_chunk_list,
    'ner_tags_str': final_ner_list,
    'fname': final_name_list,
})

# Sequential (unshuffled) split: first 80% of the chunks for training, the rest for validation
n_rows = train_dataset.num_rows
split_at = int(n_rows*0.8)
ner_ddict = DatasetDict()
ner_ddict['train'] = train_dataset.select(range(split_at))
ner_ddict['validation'] = train_dataset.select(range(split_at,n_rows))

ner_ddict

DatasetDict({
    train: Dataset({
        features: ['tokens', 'pos_str', 'chunk_str', 'ner_tags_str', 'fname'],
        num_rows: 958
    })
    validation: Dataset({
        features: ['tokens', 'pos_str', 'chunk_str', 'ner_tags_str', 'fname'],
        num_rows: 240
    })
})

In [None]:
print(file_str)

﻿<title>Người xoá "xóc_chéo" mùa lũ.</title>
<editor>Vietlex team, 8-2016</editor>
-DOCSTART-
<s>				
Nghe	V	B-VP	O	O
nhiều	A	B-AP	O	O
về	E	B-PP	O	O
ông	Ns	B-NP	O	O
,	CH	O	O	O
nhưng	C	O	O	O
đến	E	B-PP	O	O
hôm_nay	N	B-NP	O	O
tôi	P	B-NP	O	O
mới	R	O	O	O
có	V	B-VP	O	O
dịp	N	B-NP	O	O
về	V	B-VP	O	O
ấp	N	B-NP	B-LOC	O
Long_Châu	NNP	I-NP	I-LOC	O
1	M	I-NP	I-LOC	O
,	CH	O	O	O
xã	N	B-NP	B-LOC	O
Thạnh_Mỹ_Tây	NNP	I-NP	I-LOC	O
(	CH	O	O	O
Châu_Phú	NNP	B-NP	B-LOC	O
,	CH	O	O	O
An_Giang	NNP	B-NP	B-LOC	O
)	CH	O	O	O
để	E	B-PP	O	O
gặp	V	B-VP	O	O
ông	Ns	B-NP	O	O
.	CH	O	O	O
</s>				
<s>				
Người_ta	N	B-NP	O	O
thường	R	O	O	O
gọi	V	B-VP	O	O
ông	Ns	B-NP	O	O
là	V	B-VP	O	O
ông	Ns	B-NP	O	O
Ba	NNP	B-NP	B-PER	O
Phước	NNP	I-NP	I-PER	O
(	CH	O	O	O
Trần	NNP	B-NP	B-PER	O
Văn	NNP	I-NP	I-PER	O
Minh	NNP	I-NP	I-PER	O
)	CH	O	O	O
-	CH	O	O	O
người	N	B-NP	O	O
đã	R	O	O	O
bỏ	V	B-VP	O	O
nhiều	A	B-AP	O	O
công	N	B-NP	O	O
của	N	B-NP	O	O
và	Cc	O	O	O
ngày_tháng	N	B-NP	O	O
để	E	B-PP	O	O
lo	V	B-VP	O	O
chỗ	N	B-NP	O	O
an_nghỉ	V	B-VP	O	O
cuối

In [None]:
ner_ddict['validation']['tokens'][0],len(ner_ddict['validation']['tokens'][0])

(['Bởi',
  'theo',
  'qui_định',
  ',',
  'họ',
  'vẫn',
  'còn',
  'một',
  'cơ_hội',
  'sống',
  'khi',
  'đặt',
  'bút',
  'viết',
  'đơn',
  'xin',
  'ân_giảm',
  'án',
  'chết',
  '...',
  'Các',
  'tử_tù',
  'như',
  'Tân',
  ',',
  'Ngọc',
  '...',
  'đã',
  'nhảy_cẫng',
  'lên',
  'và',
  'thét',
  'to',
  'đến',
  'mức',
  'như',
  'vỡ',
  'tung',
  'cả',
  'buồng',
  'giam',
  'khi',
  'bất_ngờ',
  'được',
  'thông_báo',
  'đơn',
  'xin',
  'ân_giảm',
  'đã',
  'được',
  'Chủ_tịch',
  'nước',
  'chấp_thuận',
  '.',
  'Họ',
  'cúi',
  'lạy',
  'trời_đất',
  ',',
  'cúi',
  'lạy',
  'Chủ_tịch',
  'nước',
  ',',
  'và',
  'hôn',
  'cả',
  'quản_giáo',
  '.',
  'Họ',
  'mừng',
  'như',
  'được',
  'sống',
  'lại',
  'một',
  'cuộc_đời',
  'mới',
  '.',
  'Và',
  'tự_nhiên',
  'lúc',
  'ấy',
  ',',
  'Thắng',
  'cũng',
  'thấy',
  'lòng',
  'nhẹ_nhàng',
  'đến',
  'lạ',
  '.',
  'Những',
  'bước',
  'tường',
  'bêtông',
  'hằn',
  'sâu',
  'bao',
  'hình_bóng',
  'tử_tù',
  'như',

In [None]:
ner_ddict['validation']['ner_tags_str'][0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'O',
 'B-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O'

## Load Hugging Face NER data

In [None]:
from datasets import load_dataset
from datasets import get_dataset_config_names

In [None]:
# from collections import defaultdict
# from datasets import DatasetDict

# langs = ["de", "fr", "it", "en"]
# fracs = [0.629, 0.229, 0.084, 0.059]
# # Return a DatasetDict if a key doesn't exist
# panx_ch = defaultdict(DatasetDict)

# for lang, frac in zip(langs, fracs):
#     # Load monolingual corpus
#     ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
#     # Shuffle and downsample each split according to spoken proportion
#     for split in ds:
#         panx_ch[lang][split] = (
#             ds[split]
#             .shuffle(seed=0)
#             .select(range(int(frac * ds[split].num_rows))))

In [None]:
# Load the Vietnamese PAN-X configuration of XTREME, then keep a shuffled (seeded, hence
# reproducible) fraction of every split to make the examples below faster.
sample_frac = 0.5
panx_vi = load_dataset("xtreme", name="PAN-X.vi")
v_ddict_sample = DatasetDict()
for split, split_ds in panx_vi.items():
    n_keep = int(sample_frac*split_ds.num_rows)
    v_ddict_sample[split] = split_ds.shuffle(seed=42).select(range(n_keep))

Found cached dataset xtreme (/home/quan/.cache/huggingface/datasets/xtreme/PAN-X.vi/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached shuffled indices for dataset at /home/quan/.cache/huggingface/datasets/xtreme/PAN-X.vi/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-90e8d12eb87708dc.arrow
Loading cached shuffled indices for dataset at /home/quan/.cache/huggingface/datasets/xtreme/PAN-X.vi/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-83187dc62d826c69.arrow
Loading cached shuffled indices for dataset at /home/quan/.cache/huggingface/datasets/xtreme/PAN-X.vi/1.0.0/29f5d57a48779f37ccb75cb8708d1095448aad0713b425bdc1ff9a4a128a56e4/cache-3b3d13f2fe95ebe9.arrow


In [None]:
v_ddict_sample

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
})

In [None]:
v_ddict_sample['validation']['tokens'][10]

['Cơ', 'quan', 'Mật', 'vụ', 'Hoa', 'Kỳ', '(', 'USSS', ')']

In [None]:
v_ddict_sample['validation']['ner_tags'][10]

[3, 4, 4, 4, 4, 4, 0, 0, 0]

In [None]:
tags  = v_ddict_sample['train'].features['ner_tags'].feature
tags

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)

In [None]:
def create_tag_names(batch):
    """Add a `ner_tags_str` field holding the string name of every id in `ner_tags`.

    Relies on the notebook-level `tags` object (defined in the cell above) and its
    `int2str` method to translate each integer id into its tag name.
    """
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}

In [None]:
v_ddict_sample

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 5000
    })
})

In [None]:
v_ddict_sample = v_ddict_sample.map(create_tag_names)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
v_ddict_sample['validation']['tokens'][10]

['Cơ', 'quan', 'Mật', 'vụ', 'Hoa', 'Kỳ', '(', 'USSS', ')']

In [None]:
v_ddict_sample['validation']['ner_tags_str'][10]

## Load actual data

In [None]:
txt_tfms=[text_normalize]

In [None]:
def _named_partial(func, display_name, **kwargs):
    """Bind `kwargs` to `func` with functools.partial and set `__name__` on the result."""
    tfm = partial(func, **kwargs)
    tfm.__name__ = display_name
    return tfm


# Row-sampling transforms, each restricted to one `Source` value through a query string
over_nonown_tfm = _named_partial(sampling_with_condition, 'Oversampling Non Owned',
                                 query='Source=="non owned"', frac=0.5, seed=42, apply_to_all=False)
over_own_tfm = _named_partial(sampling_with_condition, 'Oversampling Owned',
                              query='Source=="owned"', frac=2, seed=42, apply_to_all=False)
over_hc_tfm = _named_partial(sampling_with_condition, 'Oversampling HC search',
                             query='Source=="hc search"', frac=2.5, seed=42, apply_to_all=False)

# Accent-removal transform
remove_accent_tfm = _named_partial(remove_vnmese_accent, 'Add No-Accent Text',
                                   frac=1, seed=42, apply_to_all=True)

aug_tfms = [over_nonown_tfm,over_own_tfm,over_hc_tfm,remove_accent_tfm]

Create a TextDataMain object

In [None]:
DATA_PATH = Path('secret_data')

In [None]:
tdm = TextDataMain.from_csv(DATA_PATH/'buyer_listening_with_all_raw_data_w151617.csv',
                            return_df=False,
                            main_content='Content',
                            metadatas='Source',
                            label_names='L1',
                            val_ratio=0.24,
                            split_cols='L1',
                            content_tfms = txt_tfms,
                            aug_tfms = aug_tfms,
                            process_metadatas=True,
                            seed=42,
                            shuffle_trn=True)

----- Input Validation Precheck -----
DataFrame contains missing values!
-----> List of columns and the number of missing values for each
is_valid    65804
dtype: int64
DataFrame contains duplicated values!
-----> Number of duplications: 7 rows


Define our tokenizer for EnviBert

In [None]:
import importlib.util
import sys

# Load the tokenizer implementation shipped as a plain source file next to the vocab.
# `SourceFileLoader.load_module()` is deprecated; this follows the importlib
# "importing a source file directly" recipe instead.
cache_dir=Path('./envibert_tokenizer')
_tokenizer_spec = importlib.util.spec_from_file_location("envibert.tokenizer",
                                                         str(cache_dir/'envibert_tokenizer.py'))
_tokenizer_module = importlib.util.module_from_spec(_tokenizer_spec)
# register the module under its name, as load_module() used to do
sys.modules[_tokenizer_spec.name] = _tokenizer_module
_tokenizer_spec.loader.exec_module(_tokenizer_module)
tokenizer = _tokenizer_module.RobertaTokenizer(cache_dir)

EnviBert needs a data collator to work. We will save this as an attribute in TDM

In [None]:
data_collator = DataCollatorWithPadding(tokenizer,padding=True,max_length=512)
tdm.set_data_collator(data_collator)

Create our DatasetDict from TextDataMain (as our `ModelController` class can also work with DatasetDict)

In [None]:
main_ddict= tdm.to_datasetdict(tokenizer,
                               max_length=512,
                               trn_ratio=0.1)

-------------------- Start Main Text Processing --------------------
----- Metadata Simple Processing & Concatenating to Main Content -----
----- Label Encoding -----
-------------------- Text Transformation --------------------
----- text_normalize -----


100%|████████████████████████████████████████████████████████████████████████████████| 112453/112453 [00:28<00:00, 3940.23it/s]


-------------------- Train Test Split --------------------
Previous Validation Percentage: 24.0%
- Before leak check
Size: 26989
- After leak check
Size: 23930
- Number of rows leaked: 3059, or 11.33% of the original validation (or test) data
Current Validation Percentage: 21.28%
-------------------- Text Augmentation --------------------
Train data size before augmentation: 88523
----- Oversampling Non Owned -----
Train data size after THIS augmentation: 98345
----- Oversampling Owned -----
Train data size after THIS augmentation: 109231
----- Oversampling HC search -----
Train data size after THIS augmentation: 116233
----- Add No-Accent Text -----


100%|███████████████████████████████████████████████████████████████████████████████| 116233/116233 [00:06<00:00, 19148.47it/s]


Train data size after THIS augmentation: 232466
Train data size after ALL augmentation: 232466
-------------------- Map Tokenize Function --------------------


Map:   0%|          | 0/23246 [00:00<?, ? examples/s]

Map:   0%|          | 0/23930 [00:00<?, ? examples/s]

In [None]:
main_ddict

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'Source', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23246
    })
    validation: Dataset({
        features: ['text', 'label', 'Source', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23930
    })
})

In [None]:
main_ddict['validation']['label'][:5]

[0, 7, 3, 5, 9]

# Model Experiment: EnviBert Multi-Head Classification (with Hidden Layer Concatenation)

In [None]:
from that_nlp_library.models.classifiers import *
from that_nlp_library.model_main import *

comet_ml is installed but `COMET_API_KEY` is not set.


In [None]:
from sklearn.metrics import f1_score, accuracy_score
import os

This specifies the GPU (or a list of GPUs) to use for training

In [None]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

## Train EnviBert (with hidden layer concatenation), using TDM

Let's create our model controller

In [None]:
# Checkpoint to start from; defined once and reused below instead of repeating the literal
model_name='nguyenvulebinh/envibert'
# one output class per entry of the first label list held by the TextDataMain object
num_classes = len(tdm.label_lists[0])

# extra keyword arguments forwarded to the model class
_model_kwargs={
    'concathead_class': RobertaConcatHeadSimple,
    'classifier_dropout':0.1,
    'last_hidden_size':768,  
    'is_multilabel':tdm.is_multilabel, 
    'is_multihead':tdm.is_multihead,
    'head_class_sizes': num_classes,
}

model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                  cpoint_path = model_name, 
                                  output_hidden_states=True, # since we are using 'hidden layer concatenation'
                                  seed=42,
                                  model_kwargs = _model_kwargs)
metric_funcs = [partial(f1_score,average='macro'),accuracy_score] # both f1_macro and accuracy are reported
controller = ModelController(model,tdm,metric_funcs)

Some weights of the model checkpoint at nguyenvulebinh/envibert were not used when initializing RobertaHiddenStateConcatForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaHiddenStateConcatForSequenceClassification were not initialized from the model checkpoint at nguyenvulebinh/envibert and are newly initialized: ['classification_head.out_proj.bia

And we can start training our model

In [None]:
lr = 8.2e-5
bs=8
wd=0.01
epochs= 2

In [None]:
controller.fit(epochs,lr,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
#                o_dir='sample_weights',
               compute_metrics=compute_metrics_classification,
              )

# Epoch	Training Loss	Validation Loss	F1 Score L1	Accuracy Score L1
# 1	No log	0.770289	0.633258	0.749269
# 2	0.857600	0.710960	0.689167	0.770079
# 3	0.857600	0.746624	0.698602	0.775512
# 4	0.354300	0.811047	0.700496	0.776139

# Equal weights
# Epoch	Training Loss	Validation Loss	F1 Score L1	Accuracy Score L1	F1 Score L2	Accuracy Score L2
# 1	No log	2.209617	0.622977	0.741307	0.196366	0.627549
# 2	2.476700	1.915091	0.692587	0.765965	0.281379	0.669843
# 3	2.476700	1.854167	0.696627	0.776204	0.328412	0.689694
# 4	1.101100	1.894282	0.699866	0.777666	0.330808	0.692578

# L1 1 L2 2
# Epoch	Training Loss	Validation Loss	F1 Score L1	Accuracy Score L1	F1 Score L2	Accuracy Score L2
# 1	No log	3.735447	0.614677	0.723587	0.196626	0.615388
# 2	4.016400	3.096701	0.683411	0.762496	0.304085	0.669425
# 3	4.016400	2.957187	0.698510	0.777583	0.341109	0.694040
# 4	1.739400	3.008255	0.700440	0.775242	0.349905	0.695127

In [None]:
controller.trainer.model.save_pretrained('./sample_weights/my_model')

## Predict using trained model, using TDM

### Load trained model

In [None]:
model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                  cpoint_path = 'sample_weights/my_model', 
                                  output_hidden_states=True,
                                  seed=42,
                                  model_kwargs = _model_kwargs)
metric_funcs = [partial(f1_score,average='macro'),accuracy_score]
controller = ModelController(model,tdm,metric_funcs)

### Predict Train/Validation set

Make predictions on the whole validation set

In [None]:
df_val = controller.predict_ddict(ds_type='validation')

-------------------- Start making predictions --------------------


Map:   0%|          | 0/23930 [00:00<?, ? examples/s]



In [None]:
df_val.head()

Unnamed: 0,text,label,Source,pred_L1,pred_prob_L1
0,owned - [ Cảnh báo ] bán fa.ke giả mạo Shop Ma...,0,owned,Buyer complained seller,0.78914
1,google play - Chính sách trả hàng hoàn tiền kh...,7,google play,Return/Refund,0.976054
2,google play - Hi vọng shopee kiểm duyệt phản h...,3,google play,Feature,0.582839
3,google play - Shoppe bị lỗi r ....,5,google play,Feature,0.760949
4,google play - Hàng không đặt được gì hết một sao,9,google play,Others,0.735695


To convert the label index to string, we can use the ```label_lists``` attribute of tdm

In [None]:
df_val['label']= df_val['label'].apply(lambda x: tdm.label_lists[0][x]).values

In [None]:
f1_score(df_val.label,df_val.pred_L1,average='macro')

0.6916331451379791

### Predict Test set

We will go through details on how to make a prediction on a completely new and raw dataset using our trained model. For now, let's reuse the sample csv and pretend it's our test set

In [None]:
df_test = TextDataMain.from_csv(Path('sample_data')/'sample.csv',return_df=True)

----- Input Validation Precheck -----


We will remove all the labels and unnecessary columns

In [None]:
df_test = df_test.drop(['L1','L2'],axis=1)

In [None]:
df_test.head()

Unnamed: 0,Group,Source,Content
0,Google Play,Google Play,Mình khuyên các bạn nên mua bên Lazada hoặc Ti...
1,Google Play,Google Play,Con cc quoảng cáu ít thôi
2,iOS,iOS,Mình có một vài món hàng shipper ấn giao r mà ...
3,Google Play,Google Play,Mình đã sử dụng shoppe cũng 1 thời gian dài rồ...
4,Google Play,Google Play,Chăm sóc khách hàng quá tệ. Nhân viên hỗ trợ c...


We will create a DatasetDict for this test dataframe

In [None]:
test_ddict = tdm.get_test_datasetdict_from_df(df_test)

-------------------- Getting Test Set --------------------
----- Input Validation Precheck -----
-------------------- Start Test Set Transformation --------------------
----- Metadata Simple Processing & Concatenating to Main Content -----
-------------------- Text Transformation --------------------
----- text_normalize -----


100%|████████████████████████████████████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 5981.73it/s]

-------------------- Test Leak Checking --------------------
- Before leak check
Size: 70





- After leak check
Size: 0
- Number of rows leaked: 70, or 100.00% of the original validation (or test) data
-------------------- Construct DatasetDict --------------------


Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Remember the ***Leak Check*** we did in TextDataMain? Our ```df_test``` only has 70 rows, and the check reports that all 70 rows (100%) are leaked, which is correct because this test dataset is actually a small sample of the training data.

In [None]:
test_ddict

DatasetDict({
    test: Dataset({
        features: ['text', 'Source', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 70
    })
})

Our test data has been processed + transformed (but not augmented) the same way as the validation set. Now we can start making the prediction

In [None]:
controller = ModelController(model,tdm)
df_result = controller.predict_ddict(ddict=test_ddict,ds_type='test')

-------------------- Start making predictions --------------------


Map:   0%|          | 0/70 [00:00<?, ? examples/s]



In [None]:
df_result.head()

Unnamed: 0,text,Source,pred_L1,pred_prob_L1
0,google play - Mình khuyên các bạn nên mua bên ...,google play,Services,0.749757
1,google play - Con cc quoảng cáu ít thôi,google play,Others,0.845028
2,ios - Mình có một vài món hàng shipper ấn giao...,ios,Delivery,0.963248
3,google play - Mình đã sử dụng shoppe cũng 1 th...,google play,Services,0.702858
4,google play - Chăm sóc khách hàng quá tệ . Nhâ...,google play,Services,0.943015


We can even predict top k results

In [None]:
df_result = controller.predict_ddict(ddict=test_ddict,ds_type='test',topk=3)
df_result.head()

-------------------- Start making predictions --------------------


Map:   0%|          | 0/70 [00:00<?, ? examples/s]



Unnamed: 0,text,Source,pred_L1,pred_prob_L1,pred_L1_top1,pred_L1_top2,pred_L1_top3,pred_prob_L1_top1,pred_prob_L1_top2,pred_prob_L1_top3
0,google play - Mình khuyên các bạn nên mua bên ...,google play,"[8, 7, 5]","[0.7497572, 0.11502659, 0.06754405]",Services,Return/Refund,Others,0.749757,0.115027,0.067544
1,google play - Con cc quoảng cáu ít thôi,google play,"[5, 1, 3]","[0.8450278, 0.11246138, 0.027051244]",Others,Commercial,Feature,0.845028,0.112461,0.027051
2,ios - Mình có một vài món hàng shipper ấn giao...,ios,"[2, 3, 5]","[0.9632478, 0.018235153, 0.007653378]",Delivery,Feature,Others,0.963248,0.018235,0.007653
3,google play - Mình đã sử dụng shoppe cũng 1 th...,google play,"[8, 5, 7]","[0.7028584, 0.10196633, 0.10041263]",Services,Others,Return/Refund,0.702858,0.101966,0.100413
4,google play - Chăm sóc khách hàng quá tệ . Nhâ...,google play,"[8, 5, 7]","[0.9430152, 0.038205713, 0.007924775]",Services,Others,Return/Refund,0.943015,0.038206,0.007925


If we just want to make a prediction on a small amount of data (single sentence, or a few sentences), we can use `ModelController.predict_raw_text`

In [None]:
# Since we have some metadatas, we need to define a dictionary (to imitate a DatasetDict)
raw_content={
    'Source': 'Google play',
    'Content':'Tôi không thích Shopee.Tại vì dùng app rất chậm,lag banh nhà lầu, thậm chí log in còn không đc'
}

If we don't use metadata, we can use something like this: 

```raw_content='Tôi không thích Shopee.Tại vì dùng app rất chậm,lag banh nhà lầu, thậm chí log in còn không đc'```

In [None]:
df_result = controller.predict_raw_text(raw_content,topk=1)
df_result

100%|██████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 4639.72it/s]


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]



Unnamed: 0,text,Source,pred_L1,pred_prob_L1
0,google play - Tôi không thích Shopee . Tại vì ...,google play,Feature,0.993081


In [None]:
raw_content={
    'Source': ['Google play','Owned'],
    'Content':['Tôi không thích Shopee.Tại vì dùng app rất chậm,lag banh nhà lầu, thậm chí log in còn không đc','App này xài được']
            }
df_result = controller.predict_raw_text(raw_content,topk=2)
df_result

100%|██████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 7796.10it/s]


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]



Unnamed: 0,text,Source,pred_L1,pred_prob_L1,pred_L1_top1,pred_L1_top2,pred_prob_L1_top1,pred_prob_L1_top2
0,google play - Tôi không thích Shopee . Tại vì ...,google play,"[3, 5]","[0.99308056, 0.002194975]",Feature,Others,0.993081,0.002195
1,owned - App này xài được,owned,"[5, 1]","[0.8444226, 0.09454699]",Others,Commercial,0.844423,0.094547


## Train EnviBert (with hidden layer concatenation), using tokenized DatasetDict

In [None]:
tokenizer


RobertaTokenizer(name_or_path='', vocab_size=59993, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True)

In [None]:
data_collator

DataCollatorWithPadding(tokenizer=RobertaTokenizer(name_or_path='', vocab_size=59993, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True), padding=True, max_length=512, pad_to_multiple_of=None, return_tensors='pt')

Note that your DatasetDict must contain tokens besides raw text (which typically includes 'input_ids', 'token_type_ids', 'attention_mask')

In [None]:
main_ddict

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'Source', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23246
    })
    validation: Dataset({
        features: ['text', 'label', 'Source', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23930
    })
})

In [None]:
label_names = 'L1'

In [None]:
# Checkpoint to start from; defined once and reused below instead of repeating the literal
model_name='nguyenvulebinh/envibert'
# no TextDataMain object on this path, so the number of classes is given explicitly
num_classes = 10

# extra keyword arguments forwarded to the model class
_model_kwargs={
    'concathead_class': RobertaConcatHeadSimple,
    'classifier_dropout':0.1,
    'last_hidden_size':768,  
    'is_multilabel':False,
    'is_multihead': False,
    'head_class_sizes':num_classes
}

model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                  cpoint_path = model_name, 
                                  output_hidden_states=True, # since we are using 'hidden layer concatenation'
                                  seed=42,
                                  model_kwargs = _model_kwargs)

metric_funcs = [partial(f1_score,average='macro'),accuracy_score] # we will use both f1_macro and accuracy score as metrics
controller = ModelController(model,
                             metric_funcs=metric_funcs)

Some weights of the model checkpoint at nguyenvulebinh/envibert were not used when initializing RobertaHiddenStateConcatForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaHiddenStateConcatForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaHiddenStateConcatForSequenceClassification were not initialized from the model checkpoint at nguyenvulebinh/envibert and are newly initialized: ['classification_head.out_proj.bia

In [None]:
lr = 8.2e-5
bs=8
wd=0.01
epochs= 2

In [None]:
controller.fit(epochs,lr,
               ddict=main_ddict,
               batch_size=bs,
               weight_decay=wd,
               save_checkpoint=False,
#                o_dir='sample_weights',
               compute_metrics=compute_metrics_classification,
               tokenizer=tokenizer,
               data_collator=data_collator,
               label_names=label_names
              )




Epoch,Training Loss,Validation Loss,F1 Score L1,Accuracy Score L1
1,No log,0.736859,0.660113,0.755662
2,0.795700,0.70203,0.691708,0.775512




In [None]:
controller.trainer.model.save_pretrained('./sample_weights/my_model')

## Predict using trained model, using tokenized DatasetDict

### Load trained model

In [None]:
model_name='nguyenvulebinh/envibert'
num_classes = 10

_model_kwargs={
    'concathead_class': RobertaConcatHeadSimple,
    'classifier_dropout':0.1,
    'last_hidden_size':768,  
    'is_multilabel':False,
    'is_multihead': False,
    'head_class_sizes':num_classes
}

model = model_init_classification(model_class = RobertaHiddenStateConcatForSequenceClassification,
                                  cpoint_path = './sample_weights/my_model', 
                                  output_hidden_states=True, # since we are using 'hidden layer contatenation'
                                  seed=42,
                                  model_kwargs = _model_kwargs)

metric_funcs = [partial(f1_score,average='macro'),accuracy_score] # we will use both f1_macro and accuracy score as metrics
controller = ModelController(model,
                             metric_funcs=metric_funcs)

### Predict

In [None]:
main_ddict

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'Source', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23246
    })
    validation: Dataset({
        features: ['text', 'label', 'Source', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 23930
    })
})

In [None]:
my_label_name = 'L1'
my_class_predefined = ['Buyer complained seller',
 'Commercial',
 'Delivery',
 'Feature',
 'Order/Item',
 'Others',
 'Payment',
 'Return/Refund',
 'Services',
 'Shopee account']

In [None]:
df_val = controller.predict_ddict(main_ddict,
                                  ds_type='validation',
                                  is_multilabel=False,
                                  tokenizer=tokenizer,
                                  data_collator=data_collator,
                                  label_names = my_label_name,
                                  class_names_predefined=my_class_predefined
                                  )
df_val.head()

-------------------- Start making predictions --------------------


Map:   0%|          | 0/23930 [00:00<?, ? examples/s]



Unnamed: 0,text,label,Source,pred_L1,pred_prob_L1
0,owned - [ Cảnh báo ] bán fa.ke giả mạo Shop Ma...,0,owned,Buyer complained seller,0.78914
1,google play - Chính sách trả hàng hoàn tiền kh...,7,google play,Return/Refund,0.976054
2,google play - Hi vọng shopee kiểm duyệt phản h...,3,google play,Feature,0.582839
3,google play - Shoppe bị lỗi r ....,5,google play,Feature,0.760949
4,google play - Hàng không đặt được gì hết một sao,9,google play,Others,0.735695
