"""

create : 2021-12-03, 13:44

modify : 2021-12-06, 12:46

author : KIM DONG EON

"""

In [1]:
import transformers
import torch
import pandas as pd
import numpy as np 

### check version

In [2]:
# Print the installed version of each core dependency so the captured
# outputs below can be matched to a specific environment.
_version = [torch, transformers, pd, np]

for ver in _version:
    # Read the module's own __name__ instead of slicing it out of
    # str(module); the repr format is not something to parse.
    print(ver.__name__, 'version : ', ver.__version__)

torch version :  1.9.1+cpu
transformers version :  4.10.2
pandas version :  1.2.3
numpy version :  1.19.5


In [3]:
from transformers import BertModel, BertConfig

### BertConfig
- vocab_size : int, defaults = 30522, Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the inputs_ids passed when calling BertModel or TFBertModel.
- hidden_size : int, defaults = 768, Dimensionality of the encoder layers and the pooler layer.
- num_hidden_layers : int, defaults = 12, Number of hidden layers in the Transformer encoder.
- num_attention_heads : int, defaults = 12, Number of attention heads for each attention layer in the Transformer encoder. [BERT_based_uncased]
- intermediate_size : int, defaults = 3072, Dimensionality of the 'intermediate' layer in the Transformer encoder.
- hidden_act : str or Callable, defaults = 'gelu', the non-linear activation function in the encoder and pooler. [gelu, relu, silu, gelu_new] are supported.
- hidden_dropout_prob : float, defaults = 0.1, The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
- attention_probs_dropout_prob : float, defaults = 0.1, The dropout ratio for the attention probabilities.
- max_position_embeddings : int, defaults = 512, The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512, 1024, or 2048).
- type_vocab_size : int, defaults = 2, The vocabulary size of the token_type_ids passed when calling BertModel or TFBertModel.
- initializer_range : float, defaults = 0.02, The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
- layer_norm_eps : float, defaults = 1e-12, The epsilon used by the layer normalization layers.
- position_embedding_type : str, defaults = 'absolute', Type of position embedding. Choose 'absolute', 'relative_key', 'relative_key_query'
- use_cache : bool, defaults = True, Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if config.is_decoder=True.
- classifier_dropout : float, The dropout ratio for the classification head.

In [4]:
# Spell out every BertConfig argument with the default value listed in the
# notes above; as the last expression of the cell, the resulting
# configuration is displayed.
BertConfig(
    # --- model size ---
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=12,  # bert-base-uncased: 12, BERT-large: 24
    num_attention_heads=12,  # BERT-large: 16
    intermediate_size=3072,
    # --- activation and dropout ---
    hidden_act='gelu',  # supported: relu, gelu, silu, gelu_new
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    # --- embeddings ---
    max_position_embeddings=512,  # e.g. 512, 1024, 2048
    type_vocab_size=2,
    # --- initialisation and normalisation ---
    initializer_range=0.02,  # std-dev of the weight initializer (e.g. 0.01, 0.001)
    layer_norm_eps=1e-12,  # epsilon used by the layer normalization layers
    # --- misc ---
    position_embedding_type='absolute',  # absolute, relative_key, relative_key_query
    use_cache=True,  # bool
    # classifier_dropout=None,  # float; left unset here
)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [5]:
# Default configuration, i.e. the bert-base-uncased style settings.
configuration = BertConfig()

# Instantiate the architecture described by that configuration.
# No pretrained checkpoint is loaded in this cell.
model = BertModel(config=configuration)

## BertTokenizer
- Construct a BERT tokenizer. Based on WordPiece.
- do_lower_case : Whether or not to lowercase the input when tokenizing.
- do_basic_tokenize : Whether or not to do basic tokenization before WordPiece.
- never_split : Collection of tokens which will never be split during tokenization. Only has an effect when do_basic_tokenize = True
- unk_token : The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead.
- sep_token : The separator token. 
- pad_token : The token used for padding.
- cls_token : The classifier token which is used when doing sequence classification.
- mask_token : The token used for masking values.
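- tokenize_chinese_chars : Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese.
- strip_accents : Whether or not to strip all accents. If this option is not specified, it is determined by the value of do_lower_case (as in the original BERT).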

In [6]:
from transformers import BertTokenizer, BertTokenizerFast

In [None]:
# Construct a BertTokenizer directly from a vocabulary file, with every
# option written out explicitly.
# NOTE(review): 'file.path' looks like a placeholder -- point it at a real
# vocabulary file before running this cell.
bert_tokenizer = BertTokenizer(
    vocab_file='file.path',  # file containing the vocabulary
    # --- text normalisation ---
    do_lower_case=True,  # uncased
    do_basic_tokenize=True,
    never_split=None,
    # --- special tokens ---
    unk_token='[UNK]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    cls_token='[CLS]',
    mask_token='[MASK]',
    # --- language-specific handling ---
    # NOTE(review): the upstream docs attach the "should likely be
    # deactivated for Japanese" caveat to tokenize_chinese_chars, not to
    # strip_accents -- confirm before relying on it.
    tokenize_chinese_chars=True,
    strip_accents=False,
)

## BertTokenizerFast
- Construct a 'fast' BERT tokenizer (backed by HuggingFace's tokenizers library). Based on WordPiece.

In [None]:
# BertTokenizerFast() called with no vocabulary or tokenizer file has nothing
# to build from, so load the files published for 'bert-base-uncased'
# instead (the same checkpoint used for the slow tokenizer in this notebook).
tokenizer_fast = BertTokenizerFast.from_pretrained('bert-base-uncased')

## BertModel

In [19]:
from transformers import BertTokenizer, BertModel
import torch 

In [21]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint
# (its files are downloaded on first use, as the progress bars below show).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading: 100%|██████████| 232k/232k [00:00<00:00, 281kB/s]
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 14.0kB/s]
Downloading: 100%|██████████| 466k/466k [00:01<00:00, 457kB/s]


In [20]:
# Default (bert-base-uncased style) configuration, kept available for reference.
configuration = BertConfig()

# Load the pretrained checkpoint directly. Constructing
# BertModel(configuration) first would only build a throwaway model that
# this assignment immediately replaces.
model = BertModel.from_pretrained('bert-base-uncased')

Downloading: 100%|██████████| 570/570 [00:00<00:00, 71.6kB/s]
Downloading: 100%|██████████| 440M/440M [00:37<00:00, 11.7MB/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Encode one sentence and ask for PyTorch tensors back.
inputs = tokenizer(
    'hi, my name is DONGEON KIM',
    return_tensors='pt',
)

# Display the encoding: input_ids, token_type_ids, attention_mask.
inputs

{'input_ids': tensor([[  101,  7632,  1010,  2026,  2171,  2003, 11947, 10242,  5035,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [29]:
# List the fields the tokenizer returned for the encoded sentence.
print(inputs.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [37]:
# Re-encode the same sentence as PyTorch tensors ('pt'); this rebinds
# `inputs` to a fresh encoding before the forward passes below.
inputs = tokenizer('hi, my name is DONGEON KIM', return_tensors= 'pt')

In [55]:
# Forward pass: unpack the encoding (input_ids, token_type_ids,
# attention_mask) into keyword arguments of the model call.
outputs = model(**inputs)

In [59]:
# Keep the `last_hidden_state` field of the model output.
# NOTE(review): presumably one vector per input token, i.e. shape (1, 10, 768)
# for this sentence -- confirm with `last_hidden_states.shape`.
last_hidden_states = outputs.last_hidden_state

## BertForPreTraining

In [62]:
from transformers import BertTokenizer, BertForPreTraining

In [63]:
# Load the same checkpoint into BertForPreTraining.
# Note: this rebinds `model`, replacing the BertModel loaded earlier in the notebook.
model = BertForPreTraining.from_pretrained('bert-base-uncased')

Some weights of BertForPreTraining were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
# Forward pass through the pre-training model with the same encoded sentence;
# this rebinds `outputs`.
outputs = model(**inputs)

In [69]:
# Pull the two logit fields out of the pre-training output.
# NOTE(review): presumably masked-LM scores over the vocabulary -- confirm against the docs.
prediction_logits = outputs.prediction_logits
# NOTE(review): presumably next-sentence-prediction scores -- confirm against the docs.
seq_relationship_logits = outputs.seq_relationship_logits

In [73]:
# Index of the highest-scoring entry along dimension 1 for each row;
# as the last expression of the cell, the result is displayed.
seq_relationship_logits.argmax(dim=1)


tensor([0])