In [1]:
# !pip install transformers
# !pip install tokenizers

In [2]:
# !pip3 install tensorboard

In [3]:
# from google.colab import drive
# drive.mount('./gdrive')

In [4]:
# cd /content/gdrive/My\ Drive/DeepLearning

In [5]:
# !pip3 install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

In [6]:
from transformers import (
    AdamW,
    get_linear_schedule_with_warmup,
    get_cosine_with_hard_restarts_schedule_with_warmup,
)


In [7]:
import sys
sys.path.append('../')
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast, RobertaTokenizer
from transformers import RobertaForMaskedLM
from modeling_yubert import YubertForMaskedLM, YubertLMHead

# --- Paths -----------------------------------------------------------------
# Directory the tokenizer files are loaded from.
pretrained_dir = './pretrained_local'
# tokenizer_dir = './tokenizer'
# Output directory handed to the Trainer.
log_dir = 'logs'

# --- Tokenizer -------------------------------------------------------------
# Fast tokenizer loaded from the local directory.
# Slow alternative: RobertaTokenizer.from_pretrained(pretrained_dir, max_len=512)
tokenizer = RobertaTokenizerFast.from_pretrained(pretrained_dir, max_len=512)

# --- Model configuration ---------------------------------------------------
# Small 6-layer model. Larger alternative kept for reference:
#   hidden_size=768, num_attention_heads=12, intermediate_size=3072
# `isjupyter`, `seq_len` and `med_seq_len` are extra keyword arguments;
# presumably they are read by the Yubert model code -- TODO confirm.
# NOTE(review): no pad/bos/eos ids are passed here, so the config keeps its
# defaults; check that they agree with the ids the tokenizer reports.
config = RobertaConfig(
    vocab_size=32000,
    max_position_embeddings=516,
    num_hidden_layers=6,
    type_vocab_size=1,
    hidden_size=384,
    num_attention_heads=6,
    intermediate_size=1536,
    isjupyter=True,
    seq_len=256,
    med_seq_len=64,
)

# --- Model -----------------------------------------------------------------
# Built from the config only, i.e. no pretrained weights are loaded here.
model = YubertForMaskedLM(config=config)

In [8]:
# Show the id the tokenizer uses for its padding token.
tokenizer.pad_token_id

3

In [9]:
# tokenizer.add_special_tokens({"s1_token":"<s1>"})

In [10]:
# tokenizer.add_special_tokens('<s1')
# tokenizer.add_special_tokens({cls_token2 :'<s2'})

In [11]:
# List the ids of the tokenizer's registered special tokens.
tokenizer.all_special_ids

[0, 4, 5, 3, 6]

In [12]:
# List the special tokens as strings.
tokenizer.all_special_tokens

['<s>', '</s>', '<unk>', '<pad>', '<mask>']

In [13]:
# Look up the vocabulary ids of the '<s1>' and '<s2>' tokens.
# convert_tokens_to_ids takes a token string or a list of token strings, so
# pass lists rather than dicts (a dict is only iterated over its keys and its
# values are ignored).
# Only the last expression of a cell is displayed, so the '<s1>' result is
# computed but not shown.
tokenizer.convert_tokens_to_ids(['<s1>'])
tokenizer.convert_tokens_to_ids(['<s2>'])

[2]

In [14]:
# Encode a short Korean sentence that ends with a trailing space and show its
# token ids (the same sentence without the trailing space is encoded in a
# later cell for comparison).
tokenizer("너는 누구야 ")['input_ids']

[0, 18994, 306, 5651, 603, 1576, 227, 4]

In [15]:
# Encode the literal text '<s1>' and show the resulting token ids.
# NOTE(review): the recorded output contains several ids between the start and
# end ids, so the text appears to be split into pieces rather than mapped to a
# single id -- confirm this is the intended behaviour.
tokenizer('<s1>')['input_ids']

[0, 34, 89, 23, 36, 4]

In [16]:
# Look up the token string stored at vocabulary id 1.
tokenizer.convert_ids_to_tokens(1)

'<s1>'

In [17]:
# List the special tokens again, this time keeping AddedToken objects (with
# their strip/normalization flags) instead of plain strings where present.
tokenizer.all_special_tokens_extended

['<s>',
 '</s>',
 '<unk>',
 '<pad>',
 AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)]

In [18]:
# Encode a sentence mixing Korean and English text and show its token ids.
tokenizer("안녕 반가워, hi my pet")['input_ids']

[0, 2500, 9570, 1259, 1149, 2835, 18, 22284, 6948, 15695, 4]

In [19]:
# Encode the same Korean sentence as in the earlier cell, this time without the
# trailing space, to compare the resulting token ids.
tokenizer("너는 누구야")['input_ids']

[0, 18994, 306, 5651, 603, 1576, 4]

In [20]:
import numpy as np

# Count the trainable parameters: keep only tensors that require gradients,
# take the element count of each one, and add them up.
model_parameters = (p for p in model.parameters() if p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)
print(params)

35750528


In [21]:
# model.num_parameters()

In [22]:
%%time
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model training with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Line-by-line text dataset built from the training corpus, block size 256.
# Alternative corpus file: "./files/pet_smallist.txt"
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./files/pet_0814.txt",
    block_size=256,
)

[[0, 1, 2, 21279, 18, 23607, 7575, 4027, 7187, 4114, 12515, 20, 1301, 8031, 3793, 5269, 2035, 6059, 3793, 8802, 4763, 1549, 3793, 1702, 13420, 6476, 4745, 1319, 14539, 15258, 7257, 1071, 5817, 26767, 3281, 4865, 31178, 3415, 3557, 3484, 285, 414, 15920, 4813, 2907, 6435, 12496, 4027, 19593, 11417, 3055, 3172, 1335, 436, 3104, 4912, 1335, 2018, 3746, 447, 4386, 5817, 11417, 4126, 2496, 4], [0, 1, 2, 558, 5106, 12511, 1216, 1503, 605, 542, 1223, 1458, 4406, 2153, 5154, 552, 1223, 4678, 964, 423, 285, 756, 6418, 19281, 123, 557, 1494, 1838, 3885, 28673, 6242, 8084, 10069, 1921, 2029, 3729, 3885, 14487, 5670, 5679, 1649, 602, 8808, 11946, 4366, 2507, 726, 1906, 919, 1119, 15282, 3359, 602, 4726, 8367, 2597, 4], [0, 1, 2, 7513, 930, 7656, 6978, 4634, 3312, 3850, 762, 1912, 7278, 11946, 4076, 20, 22226, 3842, 649, 945, 1804, 20, 996, 2362, 423, 14938, 10033, 996, 2362, 635, 20, 2369, 407, 490, 875, 4965, 22226, 2676, 1640, 17436, 2616, 1123, 20, 2086, 11661, 1220, 6126, 7108, 342, 649, 1804,

In [23]:
# --- Training hyper-parameters ---------------------------------------------
NUM_EPOCHS = 1
BATCH_SIZE = 16
LEARN_RATE = 0.00006
MAX_WARMUP_STEPS = 30000
SAVE_STEPS = 10000
MAX_STEPS = 30000  # also passed to TrainingArguments as max_steps

# AdamW over every model parameter.
optimizer = AdamW(
    model.parameters(), lr=LEARN_RATE, betas=(0.9, 0.999), weight_decay=0.1
)

# Linear warm-up followed by linear decay.
# The scheduler advances once per optimizer step, so its horizon has to be a
# number of steps. MAX_STEPS is the step budget given to the Trainer;
# len(dataset) would count training examples, not steps.
# NOTE(review): MAX_WARMUP_STEPS equals MAX_STEPS, so the whole run is warm-up
# and LEARN_RATE is only reached at the very end (the recorded log shows a
# learning rate of 1e-06 at step 500). Confirm this is intended.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=MAX_WARMUP_STEPS,
    num_training_steps=MAX_STEPS,
)

In [24]:
from transformers import Trainer, TrainingArguments
import numpy as np

# Fixed seed for this run.
# Random alternative: SEED = np.random.randint(0, 100000, size=None)
SEED = 1

# Trainer settings. Both an epoch count and a max_steps budget are supplied.
# `per_device_train_batch_size` replaces the deprecated
# `per_gpu_train_batch_size` argument (the library warns about the old name).
training_args = TrainingArguments(
    output_dir=log_dir,
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    max_steps=MAX_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=5,
    seed=SEED
)

# Trainer wired to the model, the MLM collator, the dataset, and the
# externally built optimizer/scheduler pair.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
    optimizers=(optimizer, scheduler),
)

You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it.


In [None]:
%%time
# Start training with the Trainer configured in the previous cell; progress
# bars and periodic loss / learning-rate entries are emitted while it runs.
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=83234.0, style=ProgressStyle(description_…

{'loss': 10.418813261032104, 'learning_rate': 1e-06, 'epoch': 0.006007160535358147, 'step': 500}
{'loss': 10.115291399002075, 'learning_rate': 2e-06, 'epoch': 0.012014321070716294, 'step': 1000}
{'loss': 9.6983158493042, 'learning_rate': 3e-06, 'epoch': 0.01802148160607444, 'step': 1500}
{'loss': 9.353532850265504, 'learning_rate': 4e-06, 'epoch': 0.024028642141432587, 'step': 2000}
{'loss': 9.003611511230469, 'learning_rate': 4.9999999999999996e-06, 'epoch': 0.030035802676790735, 'step': 2500}


In [None]:
# Save the model through the Trainer into pretrained_dir.
# NOTE(review): pretrained_dir is also the directory the tokenizer was loaded
# from -- confirm that writing the model there is intended.
trainer.save_model(pretrained_dir)

In [None]:
# Repeated training rounds on the same model object.
# Every round builds a fresh optimizer, scheduler and Trainer, then trains for
# up to MAX_STEPS steps. The model is saved once, after the last round.
for num in range(1, 100):
    print('************************************************************', num)

    # Learning rate shrinks from LEARN_RATE / 1.1 (num=1) to LEARN_RATE / 10.9 (num=99).
    LRATE = LEARN_RATE / ((num / 10) + 1)
    optimizer = AdamW(
        model.parameters(), lr=LRATE, betas=(0.9, 0.999), weight_decay=0.1
    )
    # The scheduler advances once per optimizer step, so its horizon is the
    # per-round step budget (MAX_STEPS); len(dataset) would count examples.
    # NOTE(review): MAX_WARMUP_STEPS equals MAX_STEPS, so each round is spent
    # entirely in warm-up, restarting from a learning rate of 0 -- confirm.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=MAX_WARMUP_STEPS,
        num_training_steps=MAX_STEPS,
    )

    # New seed for every round.
    SEED = np.random.randint(0, 100000, size=None)

    # `per_device_train_batch_size` replaces the deprecated
    # `per_gpu_train_batch_size` argument.
    # NOTE(review): every round reuses log_dir with overwrite_output_dir=True;
    # check whether checkpoints from earlier rounds need to be kept.
    training_args = TrainingArguments(
        output_dir=log_dir,
        overwrite_output_dir=True,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        max_steps=MAX_STEPS,
        save_steps=SAVE_STEPS,
        save_total_limit=5,
        seed=SEED
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
        optimizers=(optimizer, scheduler),
    )

    trainer.train()

# Persist the model produced by the final round.
trainer.save_model(pretrained_dir)

In [None]:
# Save the model into pretrained_dir once more; the preceding cell already
# ends with the same call, so this only matters if that cell was interrupted
# or skipped.
trainer.save_model(pretrained_dir)