# 토큰화한 뒤 sequence length 알아보기

In [1]:
import pickle as pickle
import os
import pandas as pd
import torch

In [2]:
#  library
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    RobertaConfig,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    BertTokenizer,
    ElectraModel,
    ElectraTokenizer,
)

2022-03-29 04:18:04.531798: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [3]:
def preprocessing_dataset(dataset):
    """ 처음 불러온 csv 파일을 원하는 형태의 DataFrame으로 변경 시켜줍니다."""
    subject_entity = []
    object_entity = []
    for i,j in zip(dataset['subject_entity'], dataset['object_entity']):
        i = i[1:-1].split(',')[0].split(':')[1]
        j = j[1:-1].split(',')[0].split(':')[1]
 
        subject_entity.append(i)
        object_entity.append(j)
    out_dataset = pd.DataFrame({'id':dataset['id'], 'sentence':dataset['sentence'],'subject_entity':subject_entity,'object_entity':object_entity,'label':dataset['label'],})
    return out_dataset

def load_data(dataset_dir):
    """ csv 파일을 경로에 맡게 불러 옵니다. """
    pd_dataset = pd.read_csv(dataset_dir)
    dataset = preprocessing_dataset(pd_dataset)
  
    return dataset

In [4]:
out_dataset = load_data('../dataset/train/train.csv')

In [None]:
out_dataset

In [5]:
def tokenized_dataset(dataset, tokenizer):
    """ tokenizer에 따라 sentence를 tokenizing 합니다."""
    concat_entity = []
    for e01, e02 in zip(dataset['subject_entity'], dataset['object_entity']):
        temp = ''
        temp = e01 + '[SEP]' + e02
        concat_entity.append(temp)
    tokenized_sentences = tokenizer(
        concat_entity,
        list(dataset['sentence']),
        return_tensors="pt",
        padding=True, 
        truncation=False, # 나중에 True로 변경해야 함
        #max_length=256, 
        add_special_tokens=True,
    )
    return tokenized_sentences

In [6]:
tokenizer = AutoTokenizer.from_pretrained('klue/bert-base')

In [7]:
tokenized_train = tokenized_dataset(out_dataset, tokenizer)

In [8]:
tokenized_train

{'input_ids': tensor([[    2,    11, 29830,  ...,     0,     0,     0],
        [    2,    11,  3772,  ...,     0,     0,     0],
        [    2,    11,  4104,  ...,     0,     0,     0],
        ...,
        [    2,    11, 18272,  ...,     0,     0,     0],
        [    2,    11, 15710,  ...,     0,     0,     0],
        [    2,    11, 15437,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [9]:
tokenized_train.input_ids.shape

torch.Size([32470, 241])

In [10]:
tokenized_train.input_ids

tensor([[    2,    11, 29830,  ...,     0,     0,     0],
        [    2,    11,  3772,  ...,     0,     0,     0],
        [    2,    11,  4104,  ...,     0,     0,     0],
        ...,
        [    2,    11, 18272,  ...,     0,     0,     0],
        [    2,    11, 15710,  ...,     0,     0,     0],
        [    2,    11, 15437,  ...,     0,     0,     0]])

In [None]:
tokenizer.tokenize(list(out_dataset['sentence'])[0])

### 다른 토크나이저

In [13]:
roberta_tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

In [18]:
tokenized_train_roberta = tokenized_dataset(out_dataset, roberta_tokenizer)

In [19]:
tokenized_train_roberta.input_ids.shape

torch.Size([32470, 241])

In [20]:
tokenized_train_roberta.input_ids

tensor([[    0,    11, 29830,  ...,     1,     1,     1],
        [    0,    11,  3772,  ...,     1,     1,     1],
        [    0,    11,  4104,  ...,     1,     1,     1],
        ...,
        [    0,    11, 18272,  ...,     1,     1,     1],
        [    0,    11, 15710,  ...,     1,     1,     1],
        [    0,    11, 15437,  ...,     1,     1,     1]])

In [None]:
roberta_tokenizer.tokenize(list(out_dataset['sentence'])[0])