# Domain Adaption

In [12]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

## 1. Data Load & Preprocessing

In [13]:
train_dir = '../dataset/train/train.csv'
test_dir = '../dataset/test/test_data.csv'

In [14]:
pd_dataset = pd.read_csv(train_dir)
pd_dataset['sentence'].to_csv('data_sentence.txt', index=False, header=False, encoding='utf-8')

## 2. Load Model & Tokenizer

In [15]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
MODEL_NAME = 'klue/roberta-large'

# Domain-pre-training corpora
dpt_corpus_train = 'data_sentence.txt'
dpt_corpus_train_data_selected = 'data_sentence_selected.txt'
dpt_corpus_val = 'data_sentence_val.txt'

# Fine-tuning corpora
# If there are multiple downstream NLP tasks/corpora, you can concatenate those files together
ft_corpus_train = 'vocab.txt'

# Load Model & Tokenizer
model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

## 3. Data Selection

In [16]:
from pathlib import Path
from transformers_domain_adaptation import DataSelector

selector = DataSelector(
    keep=0.5,  # TODO Replace with `keep`
    tokenizer=tokenizer,
    similarity_metrics=['euclidean'],
    diversity_metrics=[
        "type_token_ratio",
        "entropy",
    ],
)

In [21]:
# Load text data into memory
fine_tuning_texts = Path(ft_corpus_train).read_text(encoding='utf-8').splitlines()
training_texts = Path(dpt_corpus_train).read_text(encoding='utf-8').splitlines()

# Fit on fine-tuning corpus
selector.fit(fine_tuning_texts)

# Select relevant documents from in-domain training corpus
selected_corpus = selector.transform(training_texts)

# Save selected corpus to disk under `dpt_corpus_train_data_selected`
Path(dpt_corpus_train_data_selected).write_text('\n'.join(selected_corpus), encoding='utf-8');

computing similarity: 100%|██████████| 1/1 [00:26<00:00, 26.12s/metric]
computing diversity: 100%|██████████| 2/2 [00:00<00:00,  2.03metric/s]


In [22]:
selected_corpus[0]

'"기원전 1400년에 발생한 다사라즈나 전투가 이들 인도아리아 부족들 사이에 발생한 전투들 중 제일 대표적인 전투로, 신흥 인도아리아인 부족인 바라타족이 수다스 왕의 통솔하에 펀자브 지역의 라비 강의 지배세력으로 크게 성장하자 푸루족의 왕인 푸루쿠트샤가 펀자브 지역의 다른 인도아리아인 9부족들의 왕들과 연합하여 바라타족을 공격하면서 다사라즈나 전투가 발발하였는데, 이 전투에서 수다스 왕의 통솔로 바라타족이 푸루족을 비롯한 인도아리아 10부족을 격파하며 전쟁에서 승리하면서 이들 10부족들은 바라타족에게 흡수되었으며, 이들 10개 부족을 흡수한 바라타족은 쿠루크셰트라 지역으로 이주하였고 이후 쿠루족이라는 인도아리아인 부족으로 발전하였다."'

## 4. Vocabulary Augmentation

In [None]:
from transformers_domain_adaptation import VocabAugmentor

target_vocab_size = 32500  # len(tokenizer) == 30_522

augmentor = VocabAugmentor(
    tokenizer=tokenizer, 
    cased=False,
    target_vocab_size=target_vocab_size
)

# Obtain new domain-specific terminology based on the fine-tuning corpus
#new_tokens = augmentor.get_new_tokens(ft_corpus_train)
new_tokens = augmentor.get_new_tokens(open(dpt_corpus_train, 'rt', encoding='UTF8'))

In [None]:
new_tokens

In [None]:
# Update model and tokenizer with new vocab terminologies
tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))

## 5. Domain Pre-Training

In [25]:
import itertools as it
from pathlib import Path
from typing import Sequence, Union, Generator

from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [31]:
datasets = load_dataset(
    'text', 
    data_files={
        "train": dpt_corpus_train, 
        "val": dpt_corpus_train_data_selected
    }
)

tokenized_datasets = datasets.map(
    lambda examples: tokenizer(examples['text'], truncation=True, max_length=model.config.max_position_embeddings), 
    batched=True
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

Using custom data configuration default


Downloading and preparing dataset text/default-e7847afc364aadc7 (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\N\.cache\huggingface\datasets\text\default-e7847afc364aadc7\0.0.0\daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab...




Dataset text downloaded and prepared to C:\Users\N\.cache\huggingface\datasets\text\default-e7847afc364aadc7\0.0.0\daf90a707a433ac193b369c8cc1772139bb6cca21a9c7fe83bdd16aad9b9b6ab. Subsequent calls will reuse this data.


100%|██████████| 33/33 [00:01<00:00, 31.60ba/s]
100%|██████████| 17/17 [00:00<00:00, 28.95ba/s]


In [32]:
training_args = TrainingArguments(
    output_dir="./results/domain_pre_training",
    overwrite_output_dir=True,
    max_steps=100,
    per_device_train_batch_size=40,
    per_device_eval_batch_size=40,
    evaluation_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    logging_steps=50,
    seed=42,
    fp16=True,
    dataloader_num_workers=2,
    disable_tqdm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    data_collator=data_collator,
    tokenizer=tokenizer,  # This tokenizer has new tokens
)

In [None]:
trainer.train()