# Google Drive Mount

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab VM's filesystem at /content/drive.
drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


# Install


In [2]:
!pip install datasets==2.20.0

Collecting datasets==2.20.0
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets==2.20.0)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets==2.20.0)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets==2.20.0)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets==2.20.0)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux

In [3]:
!pip install transformers==4.41.2



# 4.2 모델 초기화 후 학습

In [1]:
from datasets import load_dataset
from transformers import BertTokenizerFast

# Directory holding the previously saved tokenizer. The path is relative, so it
# resolves against the current working directory.
model_name = "drive/MyDrive/Books/outputs/MyTokenizer"

# "ynat" configuration of the "klue" dataset.
dataset = load_dataset(path="klue", name="ynat")

# Despite its name, `model_name` points at tokenizer files only.
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [2]:
from transformers import BertConfig

# `cfg` is bound to the BertConfig class itself (no instance is created here),
# so the print below shows the class, not a set of configuration values.
cfg = BertConfig
print(cfg)

<class 'transformers.models.bert.configuration_bert.BertConfig'>


In [3]:
# Default BERT configuration, with the embedding table sized to the tokenizer.
# len(tokenizer) counts the base vocabulary plus any added tokens, whereas
# tokenizer.vocab_size covers the base vocabulary only; using the full length
# guarantees every id the tokenizer can emit has an embedding row.
mycfg = BertConfig(vocab_size=len(tokenizer))

In [4]:
from transformers import BertForMaskedLM

# Build the masked-LM model directly from the config object (from_pretrained is
# not used, so no saved weights are loaded), then show the resolved config.
model = BertForMaskedLM(config=mycfg)
print(model.config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 24000
}



In [5]:
!pip install accelerate -U # 런타임 > 세션 다시 시작



In [6]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Tokenize the "title" column of every split in batches of 1000 and drop the
# original columns so only the tokenizer outputs remain.
# Truncation caps each sequence at the number of position embeddings the model
# has; without it, an over-long input would fail inside the model.
# NOTE(review): the name `datasets` reads like the `datasets` library; it is
# kept unchanged so any other cell that refers to it keeps working.
datasets = dataset.map(
    lambda x: tokenizer(
        x["title"],
        truncation=True,
        max_length=model.config.max_position_embeddings,
    ),
    batched=True,
    batch_size=1000,
    remove_columns=dataset.column_names["train"],
)

# Training is bounded by max_steps. Loss is logged and the validation split is
# evaluated every 100 steps; eval_steps is spelled out so the cadence does not
# silently depend on logging_steps.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy`; the original keyword is kept for the pinned 4.41.2.
args = TrainingArguments(
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    max_steps=1000,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_strategy="steps",
    logging_steps=100,
    logging_dir="drive/MyDrive/Books/outputs/logs",
    output_dir="drive/MyDrive/Books/outputs/ckpt",
)

# Collator configured for masked-language-modeling batches (mlm=True).
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
)

# Run training, then write the final model to Drive.
trainer.train()
trainer.save_model("drive/MyDrive/Books/outputs/MyBertModel")

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
100,9.7035,9.560806
200,9.3695,9.507977
300,9.3292,9.488928
400,9.2679,9.455236
500,9.2184,9.40627
600,9.1764,9.435023
700,9.1788,9.38963
800,9.2378,9.386599
900,9.2016,9.398674
1000,9.1617,9.348604
