In [1]:
# Setting up
!pip install transformers
!pip install transformers[sentencepiece]

Collecting transformers
  Downloading transformers-4.10.3-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 13.3 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 18.6 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 35.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 57.7 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installat

tokenization, conversion to input IDs, padding, truncation, attention mask -> Huggingface Transformers API는 tokenizer를 바로 호출함으로써 이런 복잡한 작업들을 모두 처리할 수 있다. (데이터가 모델에 들어가도록 준비 완료!)

In [2]:
from transformers import AutoTokenizer

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

model_inputs은 inputs_ids와 attention mask를 갖고 있음. (distilbert의 인풋으로 들어가기 위해 필요한 모든 것을 model_input이 갖추고 있음)

In [3]:
model_inputs

{'input_ids': [101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

multiple sequence도 가능!

In [5]:
sequences = [
  "I've been waiting for a HuggingFace course my whole life.",
  "So have I!"
]

model_inputs = tokenizer(sequences)
model_inputs

{'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 2061, 2031, 1045, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

### Padding
옵션으로 padding도 할 수 있음

In [14]:
# 가장 긴 sequence 길이에 맞춰 padding
model_inputs = tokenizer(sequences, padding='longest')
print(len(model_inputs['input_ids'][0]))
print(len(model_inputs['input_ids'][1]))

16
16


In [15]:
print(model_inputs['input_ids'][0])
print(model_inputs['input_ids'][1])

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[101, 2061, 2031, 1045, 999, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [11]:
# model의 최대 길이에 맞춰 padding (BERT, DistilBERT는 512)
model_inputs = tokenizer(sequences, padding='max_length')
print(len(model_inputs['input_ids'][0]))
print(len(model_inputs['input_ids'][1]))

512
512


In [16]:
# max length를 지정하여 그 길이에 맞춰 padding
model_inputs = tokenizer(sequences, padding='max_length', max_length=8)
print(len(model_inputs['input_ids'][0]))
print(len(model_inputs['input_ids'][1]))

print(model_inputs['input_ids'][0]) # max_length 길이를 초과하는 문장이 잘리지는 않음!
print(model_inputs['input_ids'][1]) # max_length 길이보다 짧은 문장을 max_length까지 padding

16
8
[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[101, 2061, 2031, 1045, 999, 102, 0, 0]


### truncate

In [17]:
sequences = [
  "I've been waiting for a HuggingFace course my whole life.",
  "So have I!"
]

# model max length 보다 긴 문장일 경우 잘라냄(truncate). BERT와 DistilBERT는 max length 512
model_inputs = tokenizer(sequences, truncation=True)
print(len(model_inputs['input_ids'][0]))
print(len(model_inputs['input_ids'][1])) # padding 옵션 안줘서 padding 안됐음

16
6


In [20]:
# max length를 지정하여 그 길이 이상이면 잘라냄.
model_inputs = tokenizer(sequences, max_length=8, truncation=True)
print(len(model_inputs['input_ids'][0])) # 8 이상인 16이니까 잘라냄
print(len(model_inputs['input_ids'][1])) # 8 이하여서 변화 X

print(model_inputs['input_ids'][0])

8
6
[101, 1045, 1005, 2310, 2042, 3403, 2005, 102]


### convert to specific framework tensors
- pt -> pytorch
- tf -> tensorflow
- np -> numpy

In [23]:
sequences = [
  "I've been waiting for a HuggingFace course my whole life.",
  "So have I!"
]

# PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
print(type(model_inputs['input_ids'][0]))

# TensorFlow tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")
print(type(model_inputs['input_ids'][0]))

# Numpy tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
print(type(model_inputs['input_ids'][0]))

<class 'torch.Tensor'>
<class 'tensorflow.python.framework.ops.EagerTensor'>
<class 'torch.Tensor'>


### Special tokens
tokenizer()를 통과하면 맨 앞과 맨 끝에 token ID가 하나씩 추가되는 것을 확인할 수 있음.

In [24]:
sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]


확인해보면 맨 앞에는 [CLS], 맨 뒤에는 [SEP] 토큰이 붙는다. pretrain할 때 그렇게 사용했기 때문에, inference 때 같은 결과를 얻으려면 똑같이 두 개의 토큰을 붙여줘야 한다. (그러니까 어떤 모델은 special token을 안 붙이기도 하고, 다른 걸 붙이기도 한다는 것에 유의하자!)

In [25]:
print(tokenizer.decode(model_inputs["input_ids"]))
print(tokenizer.decode(ids))

[CLS] i've been waiting for a huggingface course my whole life. [SEP]
i've been waiting for a huggingface course my whole life.


## Wrapping up: From tokenizer to model
최종본 !

In [26]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
  "I've been waiting for a HuggingFace course my whole life.",
  "So have I!"
]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt')
output = model(**tokens)


Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [27]:
output

SequenceClassifierOutput([('logits', tensor([[-1.5607,  1.6123],
                                   [-3.6183,  3.9137]], grad_fn=<AddmmBackward>))])