## Prepare the fine-tuning dataset

In [2]:
import pandas as pd

In [3]:
# Lower-cased emotion names as they appear in the annotation columns.
EMOTION_LABELS = [
    "happiness",
    "angry",
    "disgust",
    "fear",
    "neutral",
    "sadness",
    "surprise",
]
# One "<emotion>_score" column name per emotion, in the same order.
SCORE_LABLES = [f"{label}_score" for label in EMOTION_LABELS]

In [4]:
LABAL_DIR = './dataset/'

# Every annotation CSV is cp949-encoded and keyed by its "wav_id" column.
labels_df_01, labels_df_02, labels_df_03 = (
    pd.read_csv(LABAL_DIR + csv_name, encoding='cp949').set_index("wav_id")
    for csv_name in ("4th.csv", "5th_1st.csv", "5th_2nd.csv")
)

# 4th + 5th (part 1) + 5th (part 2)
labels_df = pd.concat([labels_df_01, labels_df_02, labels_df_03])

In [5]:
labels_df.head()

Unnamed: 0_level_0,발화문,상황,1번 감정,1번 감정세기,2번 감정,2번 감정세기,3번 감정,3번 감정세기,4번 감정,4번 감정세기,5번 감정,5번 감정세기,나이,성별
wav_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5e258fd1305bcf3ad153a6a4,"어, 청소 니가 대신 해 줘!",anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1,27,male
5e258fe2305bcf3ad153a6a5,둘 다 청소 하기 싫어. 귀찮아.,anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1,27,male
5e258ff5305bcf3ad153a6a6,둘 다 하기 싫어서 화내.,anger,Angry,1,Angry,1,Neutral,0,Angry,1,Angry,1,27,male
5e25902f305bcf3ad153a6a9,그럼 방세는 어떡해.,anger,Sadness,1,Sadness,1,Sadness,1,Sadness,1,Sadness,1,27,male
5e27f90b5807b852d9e0157b,권태긴줄 알았는데 다른 사람이 생겼나보더라고.,sad,Sadness,1,Sadness,1,Sadness,1,Sadness,2,Sadness,1,32,male


## SenseVoice Finetune Manual

### Data Example

```json
{"key": "YOU0000008470_S0000238_punc_itn", "text_language": "<|en|>", "emo_target": "<|NEUTRAL|>", "event_target": "<|Speech|>", "with_or_wo_itn": "<|withitn|>", "target": "Including legal due diligence, subscription agreement, negotiation.", "source": "/cpfs01/shared/Group-speech/beinian.lzr/data/industrial_data/english_all/audio/YOU0000008470_S0000238.wav", "target_len": 7, "source_len": 140}
{"key": "AUD0000001556_S0007580", "text_language": "<|en|>", "emo_target": "<|NEUTRAL|>", "event_target": "<|Speech|>", "with_or_wo_itn": "<|woitn|>", "target": "there is a tendency to identify the self or take interest in what one has got used to", "source": "/cpfs01/shared/Group-speech/beinian.lzr/data/industrial_data/english_all/audio/AUD0000001556_S0007580.wav", "target_len": 18, "source_len": 360}
```

```text
Description:

key: audio file unique ID
source: path to the audio file
source_len: number of fbank frames of the audio file
target: transcription
target_len: length of target
text_language: language id of the audio file
emo_target: emotion label of the audio file
event_target: event label of the audio file
with_or_wo_itn: whether the transcription includes punctuation and inverse text normalization
```

In [6]:
from funasr.utils.postprocess_utils import emo_dict
emo_dict

{'<|HAPPY|>': '😊',
 '<|SAD|>': '😔',
 '<|ANGRY|>': '😡',
 '<|NEUTRAL|>': '',
 '<|FEARFUL|>': '😰',
 '<|DISGUSTED|>': '🤢',
 '<|SURPRISED|>': '😮'}

In [7]:
# Emotion tags known to SenseVoice, in the iteration order of emo_dict.
SENSEVOICE_EMO_COLS = list(emo_dict)
SENSEVOICE_EMO_COLS

['<|HAPPY|>',
 '<|SAD|>',
 '<|ANGRY|>',
 '<|NEUTRAL|>',
 '<|FEARFUL|>',
 '<|DISGUSTED|>',
 '<|SURPRISED|>']

In [8]:
# Maps the dataset's lower-cased emotion names to SenseVoice emotion tags.
legacy2sense = dict(
    happiness='<|HAPPY|>',
    sadness='<|SAD|>',
    angry='<|ANGRY|>',
    neutral='<|NEUTRAL|>',
    fear='<|FEARFUL|>',
    disgust='<|DISGUSTED|>',
    surprise='<|SURPRISED|>',
)

In [9]:
def tag_final_emotion(row):
    """Collapse the five annotator votes of one utterance into a single label.

    For ``i`` in 1..5 the row must provide ``'{i}번 감정'`` (emotion name,
    matched case-insensitively against ``EMOTION_LABELS``) and
    ``'{i}번 감정세기'`` (numeric intensity).

    Decision rules, in order:

    1. Four or more ``neutral`` votes -> neutral with intensity 0.
    2. No emotion accumulated a positive intensity -> neutral with
       intensity 0. (Previously this case returned the first entry of
       ``EMOTION_LABELS`` even when nobody voted for it.)
    3. Otherwise the emotion with the largest summed intensity wins; ties
       go to the emotion listed first in ``EMOTION_LABELS``.

    Returns:
        pd.Series with ``'emo_target'`` (SenseVoice tag taken from
        ``legacy2sense``) and ``'intensity'`` (summed intensity / 10).

    Raises:
        KeyError: if a vote is not one of ``EMOTION_LABELS``.
    """
    emo_count = {emo: 0 for emo in EMOTION_LABELS}
    emo_val_sum = {emo: 0 for emo in EMOTION_LABELS}

    for i in range(1, 6):
        data_emo = row.get(f'{i}번 감정').lower()
        data_emo_val = row.get(f'{i}번 감정세기')
        emo_count[data_emo] += 1
        emo_val_sum[data_emo] += data_emo_val

    neutral_result = pd.Series({
        'emo_target': legacy2sense['neutral'],
        'intensity': 0
    })

    # Return neutral when four or more annotators chose neutral.
    if emo_count['neutral'] > 3:
        return neutral_result

    # max() over a dict yields the first key holding the maximum, so ties
    # are resolved by EMOTION_LABELS order.
    best_emo = max(emo_val_sum, key=emo_val_sum.get)

    # Without any positive intensity there is no evidence for a non-neutral
    # emotion; fall back to neutral instead of an arbitrary first label.
    if emo_val_sum[best_emo] <= 0:
        return neutral_result

    return pd.Series({
        'emo_target': legacy2sense[best_emo],
        # NOTE(review): the divisor 10 presumably is 5 annotators x a maximum
        # intensity of 2 - confirm against the dataset specification.
        'intensity': emo_val_sum[best_emo] / 10
    })
    

# Row-wise aggregation of the annotator votes into a label and an intensity.
labels_df[['emo_target', 'intensity']] = labels_df.apply(tag_final_emotion, axis='columns')

In [10]:
labels_df.head()

Unnamed: 0_level_0,발화문,상황,1번 감정,1번 감정세기,2번 감정,2번 감정세기,3번 감정,3번 감정세기,4번 감정,4번 감정세기,5번 감정,5번 감정세기,나이,성별,emo_target,intensity
wav_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5e258fd1305bcf3ad153a6a4,"어, 청소 니가 대신 해 줘!",anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1,27,male,<|ANGRY|>,0.2
5e258fe2305bcf3ad153a6a5,둘 다 청소 하기 싫어. 귀찮아.,anger,Neutral,0,Angry,1,Neutral,0,Neutral,0,Angry,1,27,male,<|ANGRY|>,0.2
5e258ff5305bcf3ad153a6a6,둘 다 하기 싫어서 화내.,anger,Angry,1,Angry,1,Neutral,0,Angry,1,Angry,1,27,male,<|ANGRY|>,0.4
5e25902f305bcf3ad153a6a9,그럼 방세는 어떡해.,anger,Sadness,1,Sadness,1,Sadness,1,Sadness,1,Sadness,1,27,male,<|SAD|>,0.5
5e27f90b5807b852d9e0157b,권태긴줄 알았는데 다른 사람이 생겼나보더라고.,sad,Sadness,1,Sadness,1,Sadness,1,Sadness,2,Sadness,1,32,male,<|SAD|>,0.6


In [11]:
labels_df['emo_target'].value_counts()

emo_target
<|SAD|>          18766
<|ANGRY|>         9349
<|HAPPY|>         5849
<|FEARFUL|>       3576
<|DISGUSTED|>     2959
<|NEUTRAL|>       2174
<|SURPRISED|>     1318
Name: count, dtype: int64

In [12]:
def under_sampling(df, max_count_per_class, random_state=42):
    """Cap every emotion class at ``max_count_per_class`` rows.

    Args:
        df: frame with an ``'emo_target'`` column.
        max_count_per_class: maximum number of rows kept per class.
        random_state: seed forwarded to ``DataFrame.sample`` so the
            selection is reproducible (default 42, as before).

    Returns:
        A new DataFrame made of the per-class subsets concatenated in
        ``SENSEVOICE_EMO_COLS`` order. Classes larger than the cap are
        sampled without replacement; smaller ones are kept whole. Rows whose
        ``'emo_target'`` is not in ``SENSEVOICE_EMO_COLS`` are not included.
    """
    balanced_parts = []
    for emo in SENSEVOICE_EMO_COLS:
        # Filter once per class instead of re-evaluating the mask.
        class_rows = df[df['emo_target'] == emo]
        if len(class_rows) > max_count_per_class:
            class_rows = class_rows.sample(
                n=max_count_per_class, random_state=random_state, replace=False
            )
        balanced_parts.append(class_rows)
    return pd.concat(balanced_parts)

# Cap every class at the size of the rarest one.
smallest_class_size = labels_df['emo_target'].value_counts().min()
labels_df = under_sampling(labels_df, smallest_class_size)

In [13]:
labels_df['emo_target'].value_counts()

emo_target
<|HAPPY|>        1318
<|SAD|>          1318
<|ANGRY|>        1318
<|NEUTRAL|>      1318
<|FEARFUL|>      1318
<|DISGUSTED|>    1318
<|SURPRISED|>    1318
Name: count, dtype: int64

In [14]:
# Situation, per-annotator emotion/intensity pairs, age and gender columns;
# removed here, after emo_target and intensity have been derived from them.
META_COLUMS = [
    "상황",
    "1번 감정", "1번 감정세기",
    "2번 감정", "2번 감정세기",
    "3번 감정", "3번 감정세기",
    "4번 감정", "4번 감정세기",
    "5번 감정", "5번 감정세기",
    "나이",
    "성별",
]

labels_df = labels_df.drop(META_COLUMS, axis="columns")
labels_df.head()

Unnamed: 0_level_0,발화문,emo_target,intensity
wav_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5f62edf1b878131628349e6e,싼 걸로 샀어요. 한 50만원 대였던거 같애요.,<|HAPPY|>,0.1
5f5b55a1b8781316283497ea,벌써 내일 다 만나기로 약속했지.,<|HAPPY|>,0.2
5f645493639e10419832ebd2,드디어 구매한 노트북이 도착했어! 뜯어봐야지!,<|HAPPY|>,0.5
5e32e5aa5807b852d9e04264,알겠어. 걱정해줘서 고마워.,<|HAPPY|>,0.1
5f97e8e4111dfd48d40ff57b,나 드디어 프로젝트가 끝났어.,<|HAPPY|>,0.7


In [15]:
# Use the SenseVoice field names: the utterance text becomes "target" and
# the index (formerly "wav_id") becomes "key".
labels_df = (
    labels_df
    .rename(columns={'발화문': 'target'})
    .rename_axis(index='key')
)
labels_df.head()

Unnamed: 0_level_0,target,emo_target,intensity
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5f62edf1b878131628349e6e,싼 걸로 샀어요. 한 50만원 대였던거 같애요.,<|HAPPY|>,0.1
5f5b55a1b8781316283497ea,벌써 내일 다 만나기로 약속했지.,<|HAPPY|>,0.2
5f645493639e10419832ebd2,드디어 구매한 노트북이 도착했어! 뜯어봐야지!,<|HAPPY|>,0.5
5e32e5aa5807b852d9e04264,알겠어. 걱정해줘서 고마워.,<|HAPPY|>,0.1
5f97e8e4111dfd48d40ff57b,나 드디어 프로젝트가 끝났어.,<|HAPPY|>,0.7


### text

In [16]:
# target_len is the number of whitespace-separated tokens of the transcript;
# the remaining three tags are the same constant for every record.
labels_df = labels_df.assign(
    target_len=labels_df['target'].map(lambda text: len(text.split())),
    text_language='<|ko|>',
    with_or_wo_itn='<|withitn|>',
    event_target='<|Speech|>',
)

In [17]:
labels_df.head()

Unnamed: 0_level_0,target,emo_target,intensity,target_len,text_language,with_or_wo_itn,event_target
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
5f62edf1b878131628349e6e,싼 걸로 샀어요. 한 50만원 대였던거 같애요.,<|HAPPY|>,0.1,7,<|ko|>,<|withitn|>,<|Speech|>
5f5b55a1b8781316283497ea,벌써 내일 다 만나기로 약속했지.,<|HAPPY|>,0.2,5,<|ko|>,<|withitn|>,<|Speech|>
5f645493639e10419832ebd2,드디어 구매한 노트북이 도착했어! 뜯어봐야지!,<|HAPPY|>,0.5,5,<|ko|>,<|withitn|>,<|Speech|>
5e32e5aa5807b852d9e04264,알겠어. 걱정해줘서 고마워.,<|HAPPY|>,0.1,3,<|ko|>,<|withitn|>,<|Speech|>
5f97e8e4111dfd48d40ff57b,나 드디어 프로젝트가 끝났어.,<|HAPPY|>,0.7,4,<|ko|>,<|withitn|>,<|Speech|>


### audio

In [18]:
# Directory prefix used to build each sample's '<AUDIO_DIR>/<key>.wav' path.
AUDIO_DIR = './dataset/samples'

In [19]:
# Each clip's path is '<AUDIO_DIR>/<key>.wav', derived from the index.
labels_df['source'] = list(
    map(lambda key: f'{AUDIO_DIR}/{key}.wav', labels_df.index)
)
labels_df.head()

Unnamed: 0_level_0,target,emo_target,intensity,target_len,text_language,with_or_wo_itn,event_target,source
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5f62edf1b878131628349e6e,싼 걸로 샀어요. 한 50만원 대였던거 같애요.,<|HAPPY|>,0.1,7,<|ko|>,<|withitn|>,<|Speech|>,./dataset/samples/5f62edf1b878131628349e6e.wav
5f5b55a1b8781316283497ea,벌써 내일 다 만나기로 약속했지.,<|HAPPY|>,0.2,5,<|ko|>,<|withitn|>,<|Speech|>,./dataset/samples/5f5b55a1b8781316283497ea.wav
5f645493639e10419832ebd2,드디어 구매한 노트북이 도착했어! 뜯어봐야지!,<|HAPPY|>,0.5,5,<|ko|>,<|withitn|>,<|Speech|>,./dataset/samples/5f645493639e10419832ebd2.wav
5e32e5aa5807b852d9e04264,알겠어. 걱정해줘서 고마워.,<|HAPPY|>,0.1,3,<|ko|>,<|withitn|>,<|Speech|>,./dataset/samples/5e32e5aa5807b852d9e04264.wav
5f97e8e4111dfd48d40ff57b,나 드디어 프로젝트가 끝났어.,<|HAPPY|>,0.7,4,<|ko|>,<|withitn|>,<|Speech|>,./dataset/samples/5f97e8e4111dfd48d40ff57b.wav


In [28]:
import torchaudio
import torchaudio.compliance.kaldi as Kaldi
import torchaudio.lib
# from funasr.utils.load_utils import extract_fbank

def get_source_len(source):
    """Count the Kaldi fbank frames of an audio file.

    The file is loaded with ``torchaudio.load``; when the first dimension
    of the loaded tensor is larger than 1 it is averaged down to a single
    row before computing 80-bin fbank features with a 25 ms frame length
    and a 10 ms frame shift at the file's own sample rate.

    Args:
        source: path of the audio file.

    Returns:
        Number of fbank frames as ``int``, or ``-1`` if loading the file
        raised any exception.
    """
    try:
        audio, sample_rate = torchaudio.load(source)
    except Exception:
        # Sentinel for unreadable files; callers filter these rows out.
        return -1

    has_multiple_rows = audio.size(0) > 1
    mono = audio.mean(dim=0, keepdim=True) if has_multiple_rows else audio

    features = Kaldi.fbank(
        waveform=mono,
        num_mel_bins=80,
        frame_length=25.0,
        frame_shift=10.0,
        sample_frequency=sample_rate,
    )
    return int(features.size(0))
    

In [29]:
# Frame count per clip; -1 marks files that could not be loaded.
labels_df['source_len'] = labels_df['source'].map(get_source_len)

In [35]:
(labels_df['source_len']==-1).sum()

5

In [38]:
# Keep only the rows whose audio could be loaded (source_len is not -1).
labels_df = labels_df.loc[labels_df['source_len'].ne(-1)]

In [42]:
# Persist the prepared frame; the export cell reads this file back with the
# same cp949 encoding, so the two encodings must stay in sync.
labels_df.to_csv('preprocess_sensevoice.csv', encoding='cp949')

## Export to JSONL

In [47]:
import json

# 1. Load the CSV, using the "key" column as the index.
df = pd.read_csv("preprocess_sensevoice.csv", index_col="key", encoding='cp949')


def _to_record(key, row):
    """Build the JSON-serialisable record of one sample, in output key order."""
    return dict(
        key=key,
        source=row["source"],
        source_len=int(row["source_len"]),
        target=row["target"],
        target_len=int(row["target_len"]),
        text_language=row["text_language"],
        emo_target=row["emo_target"],
        event_target=row["event_target"],
        with_or_wo_itn=row["with_or_wo_itn"],
        # If the intensity column has a different name, change it here.
        emo_intensity=float(row["intensity"]),
    )


# 2. Write one JSON object per line (JSONL), keeping non-ASCII text readable.
with open("train.jsonl", "w", encoding="utf-8") as fout:
    fout.writelines(
        json.dumps(_to_record(key, row), ensure_ascii=False) + "\n"
        for key, row in df.iterrows()
    )

print(f"✅ Saved {len(df)} records to train.jsonl")

✅ Saved 9221 records to train.jsonl
