"""

create : 2021-12-03, 13:44

modify : 2021-12-06, 12:46

author : KIM DONG EON

"""

In [15]:
import transformers
import torch
import pandas as pd
import numpy as np 

### check version

In [16]:
# Modules whose installed versions are reported below.
_version = [torch, transformers, pd, np]

# Print "<module name> version :  <version>" for each module.
# The name comes from the module's own ``__name__`` attribute instead of
# slicing its repr string, which relied on the "<module 'x' from ...>" format.
for ver in _version:
    print(ver.__name__, 'version : ', ver.__version__)

torch version :  1.9.1+cpu
transformers version :  4.10.2
pandas version :  1.2.3
numpy version :  1.19.5


In [21]:
from transformers import BertModel, BertConfig

### BertConfig
- vocab_size : int, defaults = 30522, Vocabulary size of the BERT model, i.e. the number of distinct tokens that can be represented by the input_ids passed when calling BertModel or TFBertModel.
- hidden_size : int, defaults = 768, Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers : int, defaults = 12, Number of hidden layers in the Transformer encoder.
- num_attention_heads : int, defaults = 12, Number of attention heads for each attention layer in the Transformer encoder (12 for bert-base-uncased).
- intermediate_size : int, defaults = 3072, Dimensionality of the 'intermediate' layer in the Transformer encoder.
- hidden_act : str or Callable, defaults = 'gelu', the non-linear activation function in the encoder and pooler. [gelu, relu, silu, gelu_new] are supported.
- hidden_dropout_prob : float, defaults = 0.1, The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob : float, defaults = 0.1, The dropout ratio for the attention probabilities.
- max_position_embeddings : int, defaults = 512, The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512, 1024, 2048).
- type_vocab_size : int, defaults = 2, The vocabulary size of the token_type_ids passed when calling BertModel or TFBertModel.
- initializer_range : float, defaults = 0.02, The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps : float, defaults = 1e-12, The epsilon used by the layer normalization layers.
- position_embedding_type : str, defaults = 'absolute', Type of position embedding. Choose 'absolute', 'relative_key', 'relative_key_query'
- use_cache : bool, defaults = True, Whether or not the model should return the last key/value attentions (not used by all models). Only relevant if config.is_decoder=True.
- classifier_dropout : float, defaults = None, The dropout ratio for the classification head.

In [40]:
# Spell out every BertConfig argument explicitly, grouped by what it controls.
# Left as a bare expression so the notebook displays the resulting config.
BertConfig(
    # --- vocabulary and embeddings ---
    vocab_size=30522,
    type_vocab_size=2,
    max_position_embeddings=512,  # e.g. 512, 1024, 2048
    position_embedding_type='absolute',  # or 'relative_key', 'relative_key_query'
    # --- encoder architecture ---
    hidden_size=768,
    num_hidden_layers=12,  # bert-base-uncased: 12, BERT-large: 24
    num_attention_heads=12,  # BERT-large: 16
    intermediate_size=3072,
    hidden_act='gelu',  # one of: relu, gelu, silu, gelu_new
    layer_norm_eps=1e-12,  # epsilon used by the layer normalization layers
    # --- dropout ---
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    # --- initialization and caching ---
    initializer_range=0.02,  # std-dev for weight init; alternatives: 0.01, 0.001
    use_cache=True,
    # classifier_dropout (float) is intentionally left unset.
    )

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [None]:
# Build a configuration with every BertConfig argument left at its default
# (the bert-base-uncased style values).
configuration = BertConfig()

# Instantiate a BertModel whose architecture is defined by that configuration.
# NOTE(review): no checkpoint is loaded here, so the weights are presumably
# freshly initialized rather than pretrained; confirm against the transformers docs.
model = BertModel(configuration)