In [2]:
import pandas as pd

EMOTION_LABELS = ["happiness", "angry", "disgust", "fear", "neutral", "sadness", "surprise"]
# EMOTION_LABLES = ["happiness", "angry", "disgust", "fear", "sadness", "surprise"] # neutral 제외
SCORE_LABLES = [emo+"_score" for emo in EMOTION_LABELS]

In [3]:
features_df = pd.read_csv("extracted_features_v0.2.csv", index_col="wav_id")

labels_df_01 = pd.read_csv("4th.csv", encoding='cp949').set_index("wav_id")
labels_df_02 = pd.read_csv("5th_1st.csv", encoding='cp949').set_index("wav_id")
labels_df_03 = pd.read_csv("5th_2nd.csv", encoding='cp949').set_index("wav_id")

# 4차 + 5차_1차 + 5차_2차
labels_df = pd.concat([labels_df_01, labels_df_02, labels_df_03])

In [4]:
labels_df.head()

Unnamed: 0_level_0,발화문,상황,1번 감정,1번 감정세기,2번 감정,2번 감정세기,3번 감정,3번 감정세기,4번 감정,4번 감정세기,5번 감정,5번 감정세기,나이,성별
wav_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5e258fd1305bcf3ad153a6a4,"어, 청소 니가 대신 해 줘!",anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1,27,male
5e258fe2305bcf3ad153a6a5,둘 다 청소 하기 싫어. 귀찮아.,anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1,27,male
5e258ff5305bcf3ad153a6a6,둘 다 하기 싫어서 화내.,anger,Angry,1,Angry,1,Neutral,0,Angry,1,Angry,1,27,male
5e25902f305bcf3ad153a6a9,그럼 방세는 어떡해.,anger,Sadness,1,Sadness,1,Sadness,1,Sadness,1,Sadness,1,27,male
5e27f90b5807b852d9e0157b,권태긴줄 알았는데 다른 사람이 생겼나보더라고.,sad,Sadness,1,Sadness,1,Sadness,1,Sadness,2,Sadness,1,32,male


In [5]:
META_COLUMS = [
    "1번 감정", "1번 감정세기", "2번 감정", "2번 감정세기",
    "3번 감정", "3번 감정세기", "4번 감정", "4번 감정세기", "5번 감정", "5번 감정세기",
    "나이", "성별"
]

labels_df = labels_df.drop(columns=META_COLUMS)

In [6]:
labels_df.loc[labels_df['상황'] == 'anger', '상황'] = 'angry'
labels_df.loc[labels_df['상황'] == 'sad', '상황'] = 'sadness'

In [7]:
labels_df.head()

Unnamed: 0_level_0,발화문,상황
wav_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5e258fd1305bcf3ad153a6a4,"어, 청소 니가 대신 해 줘!",angry
5e258fe2305bcf3ad153a6a5,둘 다 청소 하기 싫어. 귀찮아.,angry
5e258ff5305bcf3ad153a6a6,둘 다 하기 싫어서 화내.,angry
5e25902f305bcf3ad153a6a9,그럼 방세는 어떡해.,angry
5e27f90b5807b852d9e0157b,권태긴줄 알았는데 다른 사람이 생겼나보더라고.,sadness


In [8]:
def under_sampling(df, max_count_per_class):
    df_balanced = pd.concat([
        df[df['상황'] == emo].sample(n=max_count_per_class, random_state=42, replace=False)
        if len(df[df['상황'] == emo]) > max_count_per_class else df[df['상황'] == emo]
        for emo in EMOTION_LABELS
    ])
    return df_balanced

df = under_sampling(labels_df, 1000)
df.head()

Unnamed: 0_level_0,발화문,상황
wav_id,Unnamed: 1_level_1,Unnamed: 2_level_1
5f640d6d24c86141a01b11ac,맞아. 결승선을 통과하는 순간 힘든게 싹 사라지더라고.,happiness
5f8acb659e04b149046cd240,당첨자는 한 명 뽑는 거였는데 내가 당첨됐어. 운이 좋지?,happiness
5f4149779dd513131eacee7e,그렇잖아도 내일 친구들이랑 만나서 여행 계획을 세울 거야. 좋은 생각이지?,happiness
5fbb3d244c55eb78bd7ce46b,음악 추천해줄 수 있는 거 있어?,happiness
5f3e97d59dd513131eace3d4,당첨 선물이 내가 예전부터 갖고 싶었던 향수라서 너무 좋아. 너무 마음에 들어.,happiness


In [38]:
df['상황'].value_counts()

KeyError: '상황'

In [37]:
label2id = {e:i for i, e in enumerate(pd.unique(df['상황']).tolist())}
id2label = {i:e for i, e in enumerate(pd.unique(df['상황']).tolist())}
label2id
id2label

KeyError: '상황'

In [11]:
df["label"] = df["상황"].map(label2id)
df = df.rename(columns={'상황': 'emotion', '발화문': 'text'})
df.head()

Unnamed: 0_level_0,text,emotion,label
wav_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5f640d6d24c86141a01b11ac,맞아. 결승선을 통과하는 순간 힘든게 싹 사라지더라고.,happiness,0
5f8acb659e04b149046cd240,당첨자는 한 명 뽑는 거였는데 내가 당첨됐어. 운이 좋지?,happiness,0
5f4149779dd513131eacee7e,그렇잖아도 내일 친구들이랑 만나서 여행 계획을 세울 거야. 좋은 생각이지?,happiness,0
5fbb3d244c55eb78bd7ce46b,음악 추천해줄 수 있는 거 있어?,happiness,0
5f3e97d59dd513131eace3d4,당첨 선물이 내가 예전부터 갖고 싶었던 향수라서 너무 좋아. 너무 마음에 들어.,happiness,0


In [36]:
df.to_csv('preprocessed.csv', encoding='cp949')

In [19]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)


In [26]:
from datasets import Dataset


train_dataset = Dataset.from_pandas(train_df[["text", "label"]])
test_dataset = Dataset.from_pandas(valid_df[["text", "label"]])

In [27]:
train_dataset

Dataset({
    features: ['text', 'label', 'wav_id'],
    num_rows: 5600
})

In [28]:
test_dataset

Dataset({
    features: ['text', 'label', 'wav_id'],
    num_rows: 1400
})

In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "klue/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map: 100%|██████████| 5600/5600 [00:00<00:00, 5814.51 examples/s]
Map: 100%|██████████| 1400/1400 [00:00<00:00, 6520.37 examples/s]


In [18]:
tokenized_dataset

Dataset({
    features: ['text', 'label', 'wav_id', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7000
})

In [34]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average='macro'),
    }

training_args = TrainingArguments(
    output_dir="./bert_result",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)



  trainer = Trainer(


In [35]:
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 