In [8]:
from pathlib import Path
import transformers, torch
from transformers import GPT2Config, AutoConfig
from transformers import BertTokenizerFast, BertForMaskedLM, pipeline
from tokenizers import BertWordPieceTokenizer
import os

In [9]:
os.environ["WANDB_WATCH"] = "all"
os.environ["WANDB_PROJECT"] = "BERT - Indonesian"
# os.environ["WANDB_DISABLED"] = "false"

In [10]:
os.environ['CUDA_VISIBLE_DEVICES']='5'
torch.cuda.device(5)

<torch.cuda.device at 0x7f64c20fb850>

In [24]:
model_type="bert"
lang_type="id-100"
data_dir = f'/dataset/wiki/{lang_type}'
paths = [str(x) for x in Path(data_dir).glob(f'std_*_{model_type}.txt')]
output_model=f'/output/{model_type}-{lang_type}/base-finetune'
print(paths, output_model)

['/dataset/wiki/id-100/std_test_bert.txt', '/dataset/wiki/id-100/std_train_bert.txt', '/dataset/wiki/id-100/std_valid_bert.txt'] /output/bert-id-100/base-finetune


In [7]:
%%time
# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()  # Bert
# Customize training

# Bert
tokenizer.train(files=paths, vocab_size=32_000, min_frequency=2, special_tokens=[
    "[UNK]",
    "[SEP]",
    "[PAD]",
    "[CLS]",
    "[MASK]",
])

CPU times: user 16min 30s, sys: 1min 54s, total: 18min 25s
Wall time: 4min 2s


In [14]:
output_model

'/output/bert-id-100/base'

In [15]:
tokenizer.save(output_model)

['/output/bert-id-100/base/vocab.txt']

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer, BertWordPieceTokenizer
from tokenizers.processors import BertProcessing

tokenizer = BertWordPieceTokenizer(
    f'{output_model}/vocab.txt',
)

In [6]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ("[CLS]", tokenizer.token_to_id("[CLS]")),
)
tokenizer.enable_truncation(max_length=512)

In [7]:
encoding = tokenizer.encode("Kucing ku makan ikan peda.")

In [8]:
encoding

Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [9]:
encoding.tokens

['[CLS]', 'kucing', 'ku', 'makan', 'ikan', 'ped', '##a', '.', '[SEP]']

In [10]:
encoding.attention_mask

[1, 1, 1, 1, 1, 1, 1, 1, 1]

In [22]:
import torch
torch.cuda.is_available()

True

In [23]:
# Bert-Base
config = BertConfig(
    vocab_size=32_000,
    max_position_embeddings=512,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=2,
)

In [20]:
config = AutoConfig.from_pretrained(output_model)
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 32000
}

In [21]:
tokenizer = BertTokenizerFast.from_pretrained(output_model, max_len=512)



In [22]:
model = BertForMaskedLM(config=config)

In [23]:
model.num_parameters()

111241472

In [10]:
%%time
from transformers import LineByLineTextDataset, TextDataset

dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=f'{data_dir}/std_train_bert.txt',
    block_size=128,
)

CPU times: user 7.42 s, sys: 1.68 s, total: 9.11 s
Wall time: 9.1 s


In [11]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=outputs_model,
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_gpu_train_batch_size=96,
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)


[34m[1mwandb[0m: Wandb version 0.9.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [13]:
%%time
trainer.train()

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

[34m[1mwandb[0m: Wandb version 0.9.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


{"loss": 7.411663683891296, "learning_rate": 4.970518867924528e-05, "epoch": 0.0589622641509434, "step": 500}
{"loss": 6.871545332908631, "learning_rate": 4.941037735849057e-05, "epoch": 0.1179245283018868, "step": 1000}
{"loss": 6.7150249700546265, "learning_rate": 4.911556603773585e-05, "epoch": 0.17688679245283018, "step": 1500}
{"loss": 6.605466014862061, "learning_rate": 4.8820754716981134e-05, "epoch": 0.2358490566037736, "step": 2000}
{"loss": 6.522736531257629, "learning_rate": 4.852594339622642e-05, "epoch": 0.294811320754717, "step": 2500}
{"loss": 6.460593127250672, "learning_rate": 4.82311320754717e-05, "epoch": 0.35377358490566035, "step": 3000}
{"loss": 6.399852411270142, "learning_rate": 4.7936320754716986e-05, "epoch": 0.41273584905660377, "step": 3500}
{"loss": 6.364503573417664, "learning_rate": 4.7641509433962266e-05, "epoch": 0.4716981132075472, "step": 4000}
{"loss": 6.314677367210388, "learning_rate": 4.734669811320755e-05, "epoch": 0.5306603773584906, "step": 450

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

{"loss": 6.121740288734436, "learning_rate": 4.4988207547169816e-05, "epoch": 1.0023584905660377, "step": 8500}
{"loss": 6.097916290283203, "learning_rate": 4.4693396226415095e-05, "epoch": 1.0613207547169812, "step": 9000}
{"loss": 6.074948431968689, "learning_rate": 4.439858490566038e-05, "epoch": 1.1202830188679245, "step": 9500}
{"loss": 5.999774028778076, "learning_rate": 4.410377358490566e-05, "epoch": 1.179245283018868, "step": 10000}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1048576.0
{"loss": 5.874733050346374, "learning_rate": 4.380896226415094e-05, "epoch": 1.2382075471698113, "step": 10500}
{"loss": 5.723001289367676, "learning_rate": 4.351415094339623e-05, "epoch": 1.2971698113207548, "step": 11000}
{"loss": 5.6048536214828495, "learning_rate": 4.3219339622641514e-05, "epoch": 1.3561320754716981, "step": 11500}
{"loss": 5.4907419862747195, "learning_rate": 4.292452830188679e-05, "epoch": 1.4150943396226414, "step": 12000}
Gradient overflow.  S

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

{"loss": 4.1530061550140385, "learning_rate": 3.997641509433962e-05, "epoch": 2.0047169811320753, "step": 17000}
{"loss": 4.071065832614899, "learning_rate": 3.968160377358491e-05, "epoch": 2.063679245283019, "step": 17500}
{"loss": 3.991127878189087, "learning_rate": 3.938679245283019e-05, "epoch": 2.1226415094339623, "step": 18000}
{"loss": 3.9261935968399047, "learning_rate": 3.9091981132075475e-05, "epoch": 2.1816037735849054, "step": 18500}
{"loss": 3.8674808773994447, "learning_rate": 3.8797169811320754e-05, "epoch": 2.240566037735849, "step": 19000}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1048576.0
{"loss": 3.8098981595039367, "learning_rate": 3.8502358490566034e-05, "epoch": 2.2995283018867925, "step": 19500}
{"loss": 3.766021531581879, "learning_rate": 3.820754716981133e-05, "epoch": 2.358490566037736, "step": 20000}
{"loss": 3.7256044545173643, "learning_rate": 3.791273584905661e-05, "epoch": 2.417452830188679, "step": 20500}
{"loss": 3.6775840

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

{"loss": 3.4006250624656675, "learning_rate": 3.4964622641509436e-05, "epoch": 3.0070754716981134, "step": 25500}
{"loss": 3.3741621017456054, "learning_rate": 3.466981132075472e-05, "epoch": 3.0660377358490565, "step": 26000}
{"loss": 3.3530526728630066, "learning_rate": 3.4375e-05, "epoch": 3.125, "step": 26500}
{"loss": 3.3272424349784853, "learning_rate": 3.408018867924528e-05, "epoch": 3.1839622641509435, "step": 27000}
{"loss": 3.3243484535217287, "learning_rate": 3.378537735849057e-05, "epoch": 3.2429245283018866, "step": 27500}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1048576.0
{"loss": 3.285313486099243, "learning_rate": 3.349056603773585e-05, "epoch": 3.30188679245283, "step": 28000}
{"loss": 3.2684149346351625, "learning_rate": 3.3195754716981134e-05, "epoch": 3.3608490566037736, "step": 28500}
{"loss": 3.2586914496421815, "learning_rate": 3.290094339622642e-05, "epoch": 3.419811320754717, "step": 29000}
Gradient overflow.  Skipping step, loss 

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

{"loss": 3.1265247435569763, "learning_rate": 2.995283018867925e-05, "epoch": 4.009433962264151, "step": 34000}
{"loss": 3.110561480522156, "learning_rate": 2.965801886792453e-05, "epoch": 4.068396226415095, "step": 34500}
{"loss": 3.093845167160034, "learning_rate": 2.9363207547169812e-05, "epoch": 4.127358490566038, "step": 35000}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1048576.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0
{"loss": 3.0941307692527773, "learning_rate": 2.9068396226415095e-05, "epoch": 4.186320754716981, "step": 35500}
{"loss": 3.071337870121002, "learning_rate": 2.8773584905660378e-05, "epoch": 4.245283018867925, "step": 36000}
{"loss": 3.0709372782707214, "learning_rate": 2.847877358490566e-05, "epoch": 4.304245283018868, "step": 36500}
{"loss": 3.051677589416504, "learning_rate": 2.8183962264150944e-05, "epoch": 4.363207547169811, "step": 37000}
{"loss": 3.0497585949897767, "learning_rate": 2.78891

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

{"loss": 2.9803603062629698, "learning_rate": 2.4941037735849056e-05, "epoch": 5.011792452830188, "step": 42500}
{"loss": 2.9559700865745544, "learning_rate": 2.464622641509434e-05, "epoch": 5.070754716981132, "step": 43000}
{"loss": 2.940078497886658, "learning_rate": 2.4351415094339626e-05, "epoch": 5.129716981132075, "step": 43500}
{"loss": 2.937126225948334, "learning_rate": 2.405660377358491e-05, "epoch": 5.188679245283019, "step": 44000}
{"loss": 2.9336873950958253, "learning_rate": 2.3761792452830188e-05, "epoch": 5.247641509433962, "step": 44500}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0
{"loss": 2.93406600522995, "learning_rate": 2.346698113207547e-05, "epoch": 5.306603773584905, "step": 45000}
{"loss": 2.9258333911895753, "learning_rate": 2.3172169811320758e-05, "epoch": 5.365566037735849, "step": 45500}
{"loss": 2.9103901352882384, "learning_rate": 2.287735849056604e-05, "epoch": 5.4245283018867925, "step": 46000}
{"loss": 2.91541996240

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0
{"loss": 2.8626720790863036, "learning_rate": 1.992924528301887e-05, "epoch": 6.014150943396227, "step": 51000}
{"loss": 2.85968260717392, "learning_rate": 1.963443396226415e-05, "epoch": 6.07311320754717, "step": 51500}
{"loss": 2.8648132448196413, "learning_rate": 1.9339622641509436e-05, "epoch": 6.132075471698113, "step": 52000}
{"loss": 2.8394255776405335, "learning_rate": 1.904481132075472e-05, "epoch": 6.191037735849057, "step": 52500}
{"loss": 2.83781196641922, "learning_rate": 1.8750000000000002e-05, "epoch": 6.25, "step": 53000}
{"loss": 2.8368638310432432, "learning_rate": 1.8455188679245285e-05, "epoch": 6.308962264150943, "step": 53500}
{"loss": 2.828991418838501, "learning_rate": 1.8160377358490564e-05, "epoch": 6.367924528301887, "step": 54000}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0
{"loss": 2.8346480121612547, "learning_rate": 1.786556603773585e-05, 

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

{"loss": 2.790600723743439, "learning_rate": 1.491745283018868e-05, "epoch": 7.0165094339622645, "step": 59500}
{"loss": 2.780982835292816, "learning_rate": 1.4622641509433963e-05, "epoch": 7.0754716981132075, "step": 60000}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0
{"loss": 2.785551554679871, "learning_rate": 1.4327830188679244e-05, "epoch": 7.134433962264151, "step": 60500}
{"loss": 2.778546877861023, "learning_rate": 1.403301886792453e-05, "epoch": 7.193396226415095, "step": 61000}
{"loss": 2.7700710458755493, "learning_rate": 1.3738207547169812e-05, "epoch": 7.252358490566038, "step": 61500}
{"loss": 2.7746652050018312, "learning_rate": 1.3443396226415095e-05, "epoch": 7.311320754716981, "step": 62000}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0
{"loss": 2.7679100184440615, "learning_rate": 1.3148584905660378e-05, "epoch": 7.370283018867925, "step": 62500}
{"loss": 2.7679089097976686, "learning_rate": 1.285

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0
{"loss": 2.7381920380592346, "learning_rate": 9.905660377358492e-06, "epoch": 8.018867924528301, "step": 68000}
{"loss": 2.739515981197357, "learning_rate": 9.610849056603773e-06, "epoch": 8.077830188679245, "step": 68500}
{"loss": 2.743640904903412, "learning_rate": 9.316037735849056e-06, "epoch": 8.13679245283019, "step": 69000}
{"loss": 2.7368316206932066, "learning_rate": 9.02122641509434e-06, "epoch": 8.195754716981131, "step": 69500}
{"loss": 2.7291755394935606, "learning_rate": 8.726415094339622e-06, "epoch": 8.254716981132075, "step": 70000}
{"loss": 2.734577302932739, "learning_rate": 8.431603773584907e-06, "epoch": 8.31367924528302, "step": 70500}
{"loss": 2.7323984031677244, "learning_rate": 8.136792452830188e-06, "epoch": 8.372641509433961, "step": 71000}
{"loss": 2.725057454586029, "learning_rate": 7.841981132075473e-06, "epoch": 8.431603773584905, "step": 71500}
Gradient overflow.  Skipping s

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=8480.0, style=ProgressStyle(description_w…

{"loss": 2.7112299284934998, "learning_rate": 4.893867924528302e-06, "epoch": 9.02122641509434, "step": 76500}
{"loss": 2.7091905012130737, "learning_rate": 4.599056603773585e-06, "epoch": 9.080188679245284, "step": 77000}
{"loss": 2.7012497153282164, "learning_rate": 4.304245283018868e-06, "epoch": 9.139150943396226, "step": 77500}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 524288.0
{"loss": 2.706629996776581, "learning_rate": 4.009433962264151e-06, "epoch": 9.19811320754717, "step": 78000}
{"loss": 2.702580201625824, "learning_rate": 3.714622641509434e-06, "epoch": 9.257075471698114, "step": 78500}
{"loss": 2.6997499742507935, "learning_rate": 3.419811320754717e-06, "epoch": 9.316037735849056, "step": 79000}
{"loss": 2.689007068157196, "learning_rate": 3.125e-06, "epoch": 9.375, "step": 79500}
{"loss": 2.707279338359833, "learning_rate": 2.830188679245283e-06, "epoch": 9.433962264150944, "step": 80000}
{"loss": 2.6919138793945314, "learning_rate": 2.53537

TrainOutput(global_step=84800, training_loss=3.54714225102708)

In [14]:
trainer.save_model(outputs_model)

In [17]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=output_model,
    tokenizer=output_model
)

In [18]:
fill_mask("ibu ku sedang bekerja [MASK] supermarket")

[{'sequence': '[CLS] ibu ku sedang bekerja di supermarket [SEP]',
  'score': 0.7983310222625732,
  'token': 1495},
 {'sequence': '[CLS] ibu ku sedang bekerja. supermarket [SEP]',
  'score': 0.090003103017807,
  'token': 17},
 {'sequence': '[CLS] ibu ku sedang bekerja sebagai supermarket [SEP]',
  'score': 0.025469014421105385,
  'token': 1600},
 {'sequence': '[CLS] ibu ku sedang bekerja dengan supermarket [SEP]',
  'score': 0.017966199666261673,
  'token': 1555},
 {'sequence': '[CLS] ibu ku sedang bekerja untuk supermarket [SEP]',
  'score': 0.016971781849861145,
  'token': 1572}]

In [19]:
fill_mask("ibu ku sedang [MASK] di supermarket")

[{'sequence': '[CLS] ibu ku sedang berada di supermarket [SEP]',
  'score': 0.24948164820671082,
  'token': 2186},
 {'sequence': '[CLS] ibu ku sedang bekerja di supermarket [SEP]',
  'score': 0.08426331728696823,
  'token': 2730},
 {'sequence': '[CLS] ibu ku sedang ada di supermarket [SEP]',
  'score': 0.04405415058135986,
  'token': 1821},
 {'sequence': '[CLS] ibu ku sedang bermain di supermarket [SEP]',
  'score': 0.036261286586523056,
  'token': 2715},
 {'sequence': '[CLS] ibu ku sedang tinggal di supermarket [SEP]',
  'score': 0.03429163992404938,
  'token': 2757}]