# 환경 설정

In [3]:
!pip install transformers



In [4]:
import os
# Dataset directory, given relative to the current working directory.
# The trailing slash matters: file names are appended to it with plain `+`.
data_path = "drive/MyDrive/2025/KW/Data/감정 분류를 위한 대화 음성 데이터셋/"
# List the directory to confirm the path resolves and to see which files are available.
os.listdir(data_path)

['5차년도.csv', '5차년도_2차.csv', '4차년도.csv']

# 데이터 로드

In [45]:
import pandas as pd
# Load the utterance table; the file is decoded as cp949.
df = pd.read_csv(data_path + "5차년도_2차.csv", encoding="cp949")

# Label Encoding: emotion name -> integer class id.
label_map = {
    "fear"     : 0,
    "surprise" : 1,
    "angry"    : 2,
    "sadness"  : 3,
    "neutral"  : 4,
    "happiness": 5,
    "disgust"  : 6
}

# Series.map() silently yields NaN for any value missing from label_map
# (typos, new categories, empty cells). Fail fast here instead of letting NaN
# labels leak into the stratified split and the label tensors further down.
encoded_labels = df["상황"].map(label_map)
unmapped = df.loc[encoded_labels.isna(), "상황"]
if not unmapped.empty:
    raise ValueError(
        "Values in column '상황' missing from label_map: "
        f"{sorted(unmapped.astype(str).unique())}"
    )
df["y"] = encoded_labels

# Keep only the model input (utterance text) and the encoded target.
x_col = '발화문'
y_col = 'y'
input_data = df[[x_col, y_col]]
input_data

Unnamed: 0,발화문,y
0,헐! 나 이벤트에 당첨 됐어.,5
1,내가 좋아하는 인플루언서가 이벤트를 하더라고. 그래서 그냥 신청 한번 해봤지.,5
2,"한 명 뽑는 거였는데, 그게 바로 내가 된 거야.",5
3,"당연히 마음에 드는 선물이니깐, 이벤트에 내가 신청 한번 해본 거지. 비싼 거야. ...",5
4,에피타이저 정말 좋아해. 그 것도 괜찮은 생각인 것 같애.,4
...,...,...
19369,나 엘리베이터에 갇혔어.,0
19370,하지만 기분이 나쁜 걸 어떡해?,2
19371,자취방 엘리베이턴데 정전인가봐.,0
19372,나 드디어 프로젝트 끝났어!,5


In [47]:
from sklearn.model_selection import train_test_split
# Two-stage stratified split: hold out the test set first, then carve the
# validation set out of the remaining train+validation pool. Both stages use
# the same hold-out fraction and seed.
split_options = {"test_size": 0.05, "random_state": 42}

trval_X, test_X, trval_y, test_y = train_test_split(
    input_data[x_col].tolist(),
    input_data[y_col].tolist(),
    stratify=input_data[y_col],
    **split_options,
)

train_X, valid_X, train_y, valid_y = train_test_split(
    trval_X,
    trval_y,
    stratify=trval_y,
    **split_options,
)

# Report how many texts (x) and labels (y) ended up in each split.
print("            x      y")
for split_name, split_x, split_y in (
    ("train size", train_X, train_y),
    ("valid size", valid_X, valid_y),
    ("test size ", test_X, test_y),
):
    print(f"{split_name}: {len(split_x):<5}  {len(split_y):<5}")

            x      y
train size: 17484  17484
valid size: 921    921  
test size : 969    969  


# Tokenizer

In [17]:
# Directory passed as cache_dir when the tokenizer is loaded (relative path).
model_path = "drive/MyDrive/2025/KW/Model/"
# Model identifier handed to AutoTokenizer.from_pretrained.
model_id = "monologg/kobert"

In [18]:
from transformers import AutoTokenizer
# Load the tokenizer for model_id, using model_path as the cache directory.
# NOTE(review): trust_remote_code=True permits running custom code shipped with
# the model repository - confirm that the repository is trusted.
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=model_path, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## 인코드/디코드

In [20]:
# Sample sentence reused by the tokenizer examples below.
text = "나는 학생입니다."
# encode() turns the text into a plain list of token ids.
encoded = tokenizer.encode(text)
encoded

[2, 1375, 4952, 7139, 54, 3]

In [21]:
# Turn the id list back into text; special tokens are kept by default.
tokenizer.decode(encoded)

'[CLS] 나는 학생입니다.[SEP]'

## BERT 입력 생성

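`tokenizer(text)`처럼 토크나이저 객체를 직접 호출하면 내부적으로 `__call__()` 메서드가 실행되며, BERT 입력에 필요한 `input_ids`, `token_type_ids`, `attention_mask`를 한 번에 반환한다.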

### Base

In [24]:
# Calling the tokenizer directly returns input_ids, token_type_ids and
# attention_mask together (as Python lists when no return_tensors is given).
encoded = tokenizer(text)
encoded

{'input_ids': [2, 1375, 4952, 7139, 54, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [25]:
# Attribute access; same field as encoded['input_ids'].
encoded.input_ids

[2, 1375, 4952, 7139, 54, 3]

In [26]:
# Attribute access; same field as encoded['token_type_ids'].
encoded.token_type_ids

[0, 0, 0, 0, 0, 0]

In [27]:
# Attribute access; same field as encoded['attention_mask'].
encoded.attention_mask

[1, 1, 1, 1, 1, 1]

### 모델 입력

In [32]:
# Build a fixed-length, tensor-valued encoding of the sample sentence.
encoded = tokenizer(
    text,
    # Return PyTorch tensors instead of Python lists.
    return_tensors='pt',
    # Pad every sequence up to max_length.
    padding='max_length',
    # Cut sequences that are longer than max_length.
    truncation=True,
    max_length=20
)

In [33]:
# One row of 20 ids; positions after [SEP] are filled with the [PAD] id.
encoded.input_ids

tensor([[   2, 1375, 4952, 7139,   54,    3,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1]])

In [34]:
# All zeros for this single-sentence input, padding positions included.
encoded.token_type_ids

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [35]:
# 1 marks real tokens, 0 marks padding positions.
encoded.attention_mask

tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

### 예제 디코딩

In [85]:
# Decode the single row of the padded encoding; special tokens stay visible.
decoded_string = tokenizer.decode(encoded['input_ids'][0])
decoded_string

'[CLS] 나는 학생입니다.[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

In [87]:
# Same decode, this time dropping the special tokens from the result.
decoded_string = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)
decoded_string

'나는 학생입니다.'

In [43]:
# Map each id of the padded row back to its token string, one entry per position.
token_list = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
token_list

['[CLS]',
 '▁나는',
 '▁학생',
 '입니다',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

# 데이터셋

In [48]:
from torch.utils.data import Dataset, DataLoader
import torch

class KoBERTDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable sequence of input strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, return_tensors="pt", max_length=max_len)`` whose
            result exposes ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` by key.
        max_len: Length every example is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        # A length mismatch would otherwise surface later as an IndexError in
        # the middle of an epoch, or silently ignore the surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label.

        Returns:
            dict with ``input_ids``, ``token_type_ids`` and ``attention_mask``
            (each with the leading batch axis removed) and ``label`` (a scalar
            ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_len
        )
        # With return_tensors="pt" a single text comes back with a leading
        # batch axis of size 1. Remove exactly that axis: a bare squeeze()
        # would also collapse the sequence axis when max_len == 1 and hand the
        # DataLoader 0-d tensors instead of length-1 sequences.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'token_type_ids': inputs['token_type_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [51]:
from torch.utils.data import Dataset, DataLoader

# Wrap each split in a KoBERTDataset; all three share the same tokenizer.
train_dataset, valid_dataset, test_dataset = (
    KoBERTDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_X, train_y),
        (valid_X, valid_y),
        (test_X, test_y),
    )
)

batch_size = 3
loader_options = {"batch_size": batch_size}
# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
valid_loader = DataLoader(valid_dataset, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

In [52]:
# Pull the first batch from the validation loader to inspect the collated tensors.
sample_batch = next(iter(valid_loader))
sample_batch

{'input_ids': tensor([[   2, 1601, 5760, 1754, 6900, 1458, 5213, 5439, 4297, 7444, 5761,  943,
          6564, 4745, 7794, 2913, 1282, 7096, 3255, 7431, 4208, 6855,   54,    3,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1],
         [   2, 3585, 7828, 3574, 7656, 6323, 5940, 3466, 1730, 2222, 1786, 5482,
          6116, 4930, 7318, 2049, 5405, 6855,    5,    3,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1],
         [   2, 3220,    5, 4368, 6079,   54, 1435, 4300, 1801, 6889, 4471, 5913,
          2829,

In [80]:
# Reverse lookup of label_map: integer class id -> emotion name.
label_inverted_map = {v: k for k, v in label_map.items()}

def print_sample_valid_data(sample_idx, max_length=30):
    """Print one validation example next to its tokenizer encoding.

    Reads the module-level ``valid_X``, ``valid_y``, ``tokenizer`` and
    ``label_inverted_map``.

    Args:
        sample_idx: Position of the example in ``valid_X`` / ``valid_y``.
        max_length: Padding/truncation length passed to the tokenizer.
    """
    sample_text = valid_X[sample_idx]
    sample_label = valid_y[sample_idx]
    batch = tokenizer(
        sample_text,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

    print(f"text          : {sample_text}")
    print(f"label         : {sample_label} ({label_inverted_map[sample_label]})\n")

    # (row title, key in the encoding, trailing text) for each tensor dumped
    # as a NumPy row; the last one adds a blank line after the group.
    tensor_rows = (
        ("input ids     ", 'input_ids', ""),
        ("token type ids", 'token_type_ids', ""),
        ("attention mask", 'attention_mask', "\n"),
    )
    for title, key, tail in tensor_rows:
        print(f"{title}: {batch[key].cpu().numpy()[0]}{tail}")

    # Round-trip the ids: plain text without special tokens, then the raw tokens.
    row_ids = batch['input_ids'][0]
    round_trip_text = tokenizer.decode(row_ids, skip_special_tokens=True)
    row_tokens = tokenizer.convert_ids_to_tokens(row_ids)
    print(f"decoded_string: {round_trip_text}")
    print(f"token_list    : {row_tokens}")

In [81]:
idx = 0
# Show every field of the idx-th example in the sampled validation batch.
tuple(
    sample_batch[field][idx]
    for field in ('input_ids', 'token_type_ids', 'attention_mask', 'label')
)

(tensor([   2, 1601, 5760, 1754, 6900, 1458, 5213, 5439, 4297, 7444, 5761,  943,
         6564, 4745, 7794, 2913, 1282, 7096, 3255, 7431, 4208, 6855,   54,    3,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(5))

In [82]:
# Print the validation example at the same index, re-encoded by the helper
# with its default max_length of 30, for comparison with the batch tensors.
print_sample_valid_data(idx)

text          : 달리는 동안에는 너무 힘들고 지쳤는데 결승선을 통과하는 순간 기분이 엄청 좋았어.
label         : 5 (happiness)

input ids     : [   2 1601 5760 1754 6900 1458 5213 5439 4297 7444 5761  943 6564 4745
 7794 2913 1282 7096 3255 7431 4208 6855   54    3    1    1    1    1
    1    1]
token type ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
attention mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0]

decoded_string: 달리는 동안에는 너무 힘들고 지쳤는데 결승선을 통과하는 순간 기분이 엄청 좋았어.
token_list    : ['[CLS]', '▁달리', '는', '▁동안', '에는', '▁너무', '▁힘들', '고', '▁지', '쳤', '는데', '▁결승', '선을', '▁통과', '하는', '▁순간', '▁기분', '이', '▁엄', '청', '▁좋았', '어', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


In [83]:
idx = 1
# Show every field of the idx-th example in the sampled validation batch.
tuple(
    sample_batch[field][idx]
    for field in ('input_ids', 'token_type_ids', 'attention_mask', 'label')
)

(tensor([   2, 3585, 7828, 3574, 7656, 6323, 5940, 3466, 1730, 2222, 1786, 5482,
         6116, 4930, 7318, 2049, 5405, 6855,    5,    3,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 tensor(2))

In [84]:
# Print the validation example at the same index, re-encoded by the helper
# with its default max_length of 30, for comparison with the batch tensors.
print_sample_valid_data(idx)

text          : 유명한 유튜버들이 왜 돈 받고 뒷광고를 하는지 모르겠어!
label         : 2 (angry)

input ids     : [   2 3585 7828 3574 7656 6323 5940 3466 1730 2222 1786 5482 6116 4930
 7318 2049 5405 6855    5    3    1    1    1    1    1    1    1    1
    1    1]
token type ids: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
attention mask: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]

decoded_string: 유명한 유튜버들이 왜 돈 받고 뒷광고를 하는지 모르겠어!
token_list    : ['[CLS]', '▁유명', '한', '▁유', '튜', '버', '들이', '▁왜', '▁돈', '▁받고', '▁뒷', '광고', '를', '▁하는', '지', '▁모르', '겠', '어', '!', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
