<a href="https://colab.research.google.com/github/bakky14/Writing/blob/main/src/3.2%20Transformers%20%EB%9D%BC%EC%9D%B4%EB%B8%8C%EB%9F%AC%EB%A6%AC%20%EA%B0%9C%EC%9A%94.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 3.2.1 Transformers 설치


In [None]:
!pip install \
     datasets==2.20.0 \
     transformers==4.41.2

In [None]:
# List the installed packages whose line contains "transformers" to verify the install.
!pip list | grep transformers

# 3.2.2 Tokenizer

### Tokenizer 다운로드

In [None]:
from transformers import BertTokenizer

# Identifier of the checkpoint whose tokenizer is loaded below.
model_name = "klue/bert-base"
# Load the tokenizer for that checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Print the built-in help text for the tokenizer object.
help(tokenizer)

In [None]:
# Size of the tokenizer's vocabulary.
print(tokenizer.vocab_size)
# The full vocabulary (note: this prints every entry, so the output is long).
print(tokenizer.get_vocab())
# The special tokens registered on the tokenizer.
print(tokenizer.special_tokens_map)

### 토큰화 작업

In [None]:
sentence = "안녕하세요. 이건 테스트입니다."

# Tokenization: split the sentence into tokens
tokens1 = tokenizer.tokenize(sentence)
print(tokens1)

# Convert the tokens to input IDs
ids1 = tokenizer.convert_tokens_to_ids(tokens1)
print(ids1)

# Call the tokenizer directly on the raw sentence; the result is indexed by
# "input_ids" in the decoding cell that follows.
ids2 = tokenizer(sentence)
print(ids2)

In [None]:
# Decoding
# Decode the IDs obtained from convert_tokens_to_ids.
decoded_string1 = tokenizer.decode(ids1)
print(decoded_string1)

# Decode the "input_ids" obtained by calling the tokenizer directly.
decoded_string2 = tokenizer.decode(ids2["input_ids"])
print(decoded_string2)

# Decode the same IDs again, this time with skip_special_tokens=True.
decoded_string3 = tokenizer.decode(ids2["input_ids"], skip_special_tokens=True)
print(decoded_string3)

### 데이터셋 전처리

In [None]:
from datasets import load_dataset

# Load the "ynat" configuration of the "klue" dataset.
dataset = load_dataset("klue", "ynat")
# Keep a handle on the "train" split for the cells below.
raw_train_dataset = dataset["train"]

In [None]:
from transformers import BertTokenizer

model_name = "klue/bert-base"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize the whole "title" column of the train split in a single call,
# with padding="max_length" and truncation enabled.
tokenized_examples = tokenizer(
    raw_train_dataset["title"],
    padding="max_length",
    truncation=True,
)

In [None]:
def tokenize_function(sample):
    """Tokenize the "title" field of the given sample(s) with the notebook's tokenizer."""
    titles = sample["title"]
    return tokenizer(titles)

# Run the tokenizer over the dataset in batches of 1000 and remove the
# listed columns from the result.
tokenized_datasets = dataset.map(
    tokenize_function,
    remove_columns=["guid", "title", "url", "date"],
    batched=True,
    batch_size=1000,
)

In [None]:
# Display the tokenized dataset object as the cell output.
tokenized_datasets

# 3.2.3 DataCollator

### DataCollator 사용

In [None]:
# Print the input_ids of the first tokenized training example, then their Python type.
print(tokenized_datasets["train"][0]["input_ids"])
print(type(tokenized_datasets["train"][0]["input_ids"]))

In [None]:
from pprint import pprint
from transformers import DataCollatorWithPadding

# Take the first 8 tokenized training examples as a sample batch.
batch = [tokenized_datasets["train"][i] for i in range(8)]
# Print the input_ids length of each sample in the batch.
print([len(sample["input_ids"]) for sample in batch])

In [None]:
# Collator that pads the samples of a batch using the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Store the collated result under a new name instead of overwriting `batch`,
# so that re-running this cell always collates the original list of samples.
padded_batch = data_collator(batch)
# Show the size of every tensor in the collated batch.
pprint({k: v.size() for k, v in padded_batch.items()})

# 3.2.4 Model
https://huggingface.co/docs/transformers/index

### Model 다운로드

In [None]:
# List the contents of the local Hugging Face hub cache directory.
!ls ~/.cache/huggingface/hub

In [None]:
from transformers import BertTokenizer, BertModel

# Keep the checkpoint identifier in `model_name` (as the other cells do) so that
# `model` only ever refers to the loaded model object, never to a string.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [None]:
# Directory into which the tokenizer and the model are saved.
model_path = "/content/MyBertModel/"
tokenizer.save_pretrained(model_path)
model.save_pretrained(model_path)

# List the files now present in that directory.
!ls -l {model_path}

In [None]:
# Load the tokenizer and the model back from the local directory `model_path`
# instead of a hub identifier.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertModel.from_pretrained(model_path)

### Model 추론 실습

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Keep the checkpoint identifier in `model_name` (as the other cells do) so that
# `model` only ever refers to the loaded model object, never to a string.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

In [None]:
# Tokenize a sentence containing the [MASK] placeholder, requesting PyTorch tensors.
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

# Run the model with gradient tracking disabled and keep only the logits.
with torch.no_grad():
   logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

In [None]:
# Pick the highest-scoring token id at the [MASK] position.
predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)

# Decode the predicted id back to text (shown as the cell output).
tokenizer.decode(predicted_token_id)

# 3.2.5 AutoClass

### AutoClass로 Tokenizer, Model 다운로드

In [None]:
# Imported here as well so this cell does not depend on an earlier cell having run.
import torch
# The Auto* classes are what this section demonstrates: they replace the
# BERT-specific classes used in the previous section.
from transformers import AutoTokenizer, AutoModelForMaskedLM

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

# Tokenize a sentence containing the [MASK] placeholder, requesting PyTorch tensors.
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

# Run the model with gradient tracking disabled and keep only the logits.
with torch.no_grad():
    logits = model(**inputs).logits

# retrieve index of [MASK]
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

# Pick the highest-scoring token id at the [MASK] position and decode it
# (the decoded text is shown as the cell output).
predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

# 3.2.7 Pipelines

### 허깅페이스 허브에 있는 모델 가져오기

In [None]:
from transformers import pipeline

# Build a text-classification pipeline directly from a hub model identifier.
pipe = pipeline(task="text-classification",
                model="google-bert/bert-base-uncased")

# Run the pipeline on a sample sentence and print the result.
print(pipe("유튜브 내달 2일까지 크리에이터 지원 공간 운영"))

### 미세조정 모델 경로로 가져오기

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline
)

In [None]:
# Load the tokenizer and a sequence-classification model from a local directory.
# NOTE(review): an earlier cell writes this same directory from a plain BertModel,
# so the classification head is presumably not part of the saved weights — confirm.
model_name = "/content/MyBertModel/"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Build the pipeline from the already-loaded tokenizer and model objects
# rather than from a model identifier.
pipe = pipeline(task="text-classification",
                tokenizer=tokenizer,
                model=model)

# Run the pipeline on a sample sentence and print the result.
print(pipe("유튜브 내달 2일까지 크리에이터 지원 공간 운영"))

In [None]:
# Load the sequence-classification model and its tokenizer from a hub identifier.
model_name = "google-bert/bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Bind the pipeline object to `pipe`, not `pipeline`: assigning to `pipeline`
# would replace the imported factory function, so re-running this cell (or any
# later call to `pipeline(...)`) would fail.
pipe = pipeline(task="text-classification",
                model=model,
                tokenizer=tokenizer)

# Run the pipeline on a sample sentence and print the result.
print(pipe("유튜브 내달 2일까지 크리에이터 지원 공간 운영"))

### 직접 구현

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


model_name = "google-bert/bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# Tokenize the sample sentence and move the tensors to the model's device.
inputs = tokenizer(
    "유튜브 내달 2일까지 크리에이터 지원 공간 운영",
    return_tensors="pt"
).to(device)

# Forward pass with gradient tracking disabled; softmax over the last axis
# turns the logits into per-label scores.
with torch.no_grad():
    output = model(**inputs)
    probs = torch.softmax(output.logits.cpu(), -1)

# For every input row, report the highest-scoring label index and its score,
# in the same {"label": ..., "score": ...} shape the pipeline cells print.
result = [
    {"label": f"LABEL_{label_id}", "score": probs[row, label_id].item()}
    for row, label_id in enumerate(probs.argmax(-1).tolist())
]

print(result)