In [1]:
!pip install -q transformers datasets soundfile torchaudio jiwer accelerate evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m116.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q torch torchaudio torchcodec datasets==3.0.0 transformers==4.44.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m141.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.6/177.6 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m100.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.6.1 which is incompatible.[0m[

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import glob
import json
import subprocess
from typing import List

import pandas as pd
import numpy as np
import torch

from datasets import Dataset, Audio
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer,
)
import evaluate

# ---------------------------------------------------
#  경로 설정
# ---------------------------------------------------
ROOT_DIR   = "/content/drive/MyDrive/childrenvoice"

TRAIN_BASE = os.path.join(ROOT_DIR, "trainingdata", "Sourcedata")
VAL_BASE   = os.path.join(ROOT_DIR, "valdata")
TEST_BASE  = os.path.join(ROOT_DIR, "testdata")
LABEL_BASE = os.path.join(ROOT_DIR, "trainingdata", "labellingdata")

TARGET_AGES = [4, 5, 6, 7]

print("ROOT_DIR  :", ROOT_DIR)
print("TRAIN_BASE:", TRAIN_BASE)
print("VAL_BASE  :", VAL_BASE)
print("TEST_BASE :", TEST_BASE)
print("LABEL_BASE:", LABEL_BASE)

# ---------------------------------------------------
#  JSON → 텍스트 추출 함수
#  (네가 쓰던 extract_text_from_json 그대로)
# ---------------------------------------------------
def extract_text_from_json(data):
    """
    AI-Hub 아동 음성 라벨 구조에 맞춰서
    Transcription -> LabelText 값을 우선적으로 꺼낸다.
    """
    try:
        text = data["Transcription"]["LabelText"]
        if isinstance(text, str):
            return text.strip()
    except KeyError:
        pass

    # 혹시 구조가 다를 경우 전체에서 문자열 긁어오기
    def collect_all_strings(obj, bucket):
        if isinstance(obj, dict):
            for v in obj.values():
                collect_all_strings(v, bucket)
        elif isinstance(obj, list):
            for item in obj:
                collect_all_strings(item, bucket)
        elif isinstance(obj, str):
            s = obj.strip()
            if s:
                bucket.append(s)

    strings = []
    collect_all_strings(data, strings)

    if not strings:
        return None

    strings.sort(key=len, reverse=True)
    return strings[0]

# ---------------------------------------------------
#  split별 (train/val/test) 기본 rows 만들기
#  (wav 경로만, text는 빈 문자열)
# ---------------------------------------------------
def build_base_df(split_name: str, base_dir: str, target_ages: List[int]) -> pd.DataFrame:
    rows = []

    for age in target_ages:
        age_dir = os.path.join(base_dir, f"age{age}")
        if not os.path.isdir(age_dir):
            print(f"⚠️ {split_name}: age{age} 폴더 없음: {age_dir}")
            continue

        for speaker_id in sorted(os.listdir(age_dir)):
            spk_dir = os.path.join(age_dir, speaker_id)
            if not os.path.isdir(spk_dir):
                continue

            wav_files = sorted(glob.glob(os.path.join(spk_dir, "*.wav")))
            if not wav_files:
                print(f"⚠️ {split_name}: wav 없음: {spk_dir}")
                continue

            for wav_path in wav_files:
                basename = os.path.basename(wav_path)      # K0001...
                utt_id   = os.path.splitext(basename)[0]   # 확장자 제거

                rows.append({
                    "split": split_name,
                    "age": age,
                    "speaker_id": speaker_id,
                    "utt_id": utt_id,
                    "audio_path": wav_path,
                    "text": "",     # 나중에 채움
                    "speed": 1.0,
                    "is_augmented": False,
                })

    df = pd.DataFrame(rows)
    print(f"\n✅ {split_name} base df 샘플 수:", len(df))
    if len(df) > 0:
        print("   예시:")
        display(df.head())
    return df

train_base_df = build_base_df("train", TRAIN_BASE, TARGET_AGES)
val_base_df   = build_base_df("val",   VAL_BASE,   TARGET_AGES)
test_base_df  = build_base_df("test",  TEST_BASE,  TARGET_AGES)

# ---------------------------------------------------
#  LABEL_BASE / speaker_id / utt_id.json 에서 텍스트 붙이기
#  (제로샷 때 쓰던 방식 그대로 확장)
# ---------------------------------------------------
def attach_labels(df: pd.DataFrame, label_base: str) -> pd.DataFrame:
    utt2text = {}
    not_found_json = 0

    for idx, row in df.iterrows():
        spk = row["speaker_id"]   # 예: "0282"
        utt = row["utt_id"]       # 예: "K0001..."

        json_path = os.path.join(label_base, spk, utt + ".json")

        if not os.path.exists(json_path):
            not_found_json += 1
            continue

        with open(json_path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except Exception as e:
                print("⚠️ JSON 로드 실패:", json_path, "에러:", e)
                continue

        text = extract_text_from_json(data)
        if text:
            utt2text[utt] = text

    print("✅ json에서 텍스트를 가져온 utt 개수:", len(utt2text))
    print("⚠️ json 파일이 없어서 못 매칭한 케이스 수:", not_found_json)

    # df에 text 컬럼 채우기
    def get_transcript(row):
        return utt2text.get(row["utt_id"], "")

    df = df.copy()
    df["text"] = df.apply(get_transcript, axis=1)

    no_label_mask = (df["text"].astype(str).str.strip() == "")
    print("⚠️ 라벨 못 찾은 샘플 수:", no_label_mask.sum())
    print("✅ 라벨 있는 샘플 수:", (~no_label_mask).sum())

    df_labeled = df[~no_label_mask].reset_index(drop=True)
    print("✅ 최종 사용 df 샘플 수:", len(df_labeled))
    if len(df_labeled) > 0:
        display(df_labeled.head())
    return df_labeled

train_df = attach_labels(train_base_df, LABEL_BASE)
val_df   = attach_labels(val_base_df,   LABEL_BASE)
test_df  = attach_labels(test_base_df,  LABEL_BASE)

print("\ntrain_df.shape:", train_df.shape)
print("val_df.shape   :", val_df.shape)
print("test_df.shape  :", test_df.shape)

ROOT_DIR  : /content/drive/MyDrive/childrenvoice
TRAIN_BASE: /content/drive/MyDrive/childrenvoice/trainingdata/Sourcedata
VAL_BASE  : /content/drive/MyDrive/childrenvoice/valdata
TEST_BASE : /content/drive/MyDrive/childrenvoice/testdata
LABEL_BASE: /content/drive/MyDrive/childrenvoice/trainingdata/labellingdata

✅ train base df 샘플 수: 400
   예시:


Unnamed: 0,split,age,speaker_id,utt_id,audio_path,text,speed,is_augmented
0,train,4,282,K00010282-AFG10-L1N2D1-E-K0KK-00007409,/content/drive/MyDrive/childrenvoice/trainingd...,,1.0,False
1,train,4,282,K00020282-AFG10-L1N2D1-E-K0KK-00007425,/content/drive/MyDrive/childrenvoice/trainingd...,,1.0,False
2,train,4,282,K00030282-AFG10-L1N2D1-E-K0KK-01040217,/content/drive/MyDrive/childrenvoice/trainingd...,,1.0,False
3,train,4,282,K00040282-AFG10-L1N2D1-E-K0KK-01040219,/content/drive/MyDrive/childrenvoice/trainingd...,,1.0,False
4,train,4,282,K00050282-AFG10-L1N2D1-E-K0KK-00007474,/content/drive/MyDrive/childrenvoice/trainingd...,,1.0,False



✅ val base df 샘플 수: 120
   예시:


Unnamed: 0,split,age,speaker_id,utt_id,audio_path,text,speed,is_augmented
0,val,4,172,K00010172-AFG10-L1N2D1-E-K0KK-00006450,/content/drive/MyDrive/childrenvoice/valdata/a...,,1.0,False
1,val,4,172,K00020172-AFG10-L1N2D1-E-K0KK-00006458,/content/drive/MyDrive/childrenvoice/valdata/a...,,1.0,False
2,val,4,172,K00030172-AFG10-L1N2D1-E-K0KK-00006463,/content/drive/MyDrive/childrenvoice/valdata/a...,,1.0,False
3,val,4,172,K00040172-AFG10-L1N2D1-E-K0KK-00006479,/content/drive/MyDrive/childrenvoice/valdata/a...,,1.0,False
4,val,4,172,K00050172-AFG10-L1N2D1-E-K0KK-00006487,/content/drive/MyDrive/childrenvoice/valdata/a...,,1.0,False



✅ test base df 샘플 수: 200
   예시:


Unnamed: 0,split,age,speaker_id,utt_id,audio_path,text,speed,is_augmented
0,test,4,155,K00010155-AFG13-L1N2D1-E-K0KK-01714495,/content/drive/MyDrive/childrenvoice/testdata/...,,1.0,False
1,test,4,155,K00020155-AFG13-L1N2D1-E-K0KK-00031003,/content/drive/MyDrive/childrenvoice/testdata/...,,1.0,False
2,test,4,155,K00030155-AFG13-L1N2D1-E-K0KK-00031090,/content/drive/MyDrive/childrenvoice/testdata/...,,1.0,False
3,test,4,155,K00040155-AFG13-L1N2D1-E-K0KK-00031135,/content/drive/MyDrive/childrenvoice/testdata/...,,1.0,False
4,test,4,155,K00050155-AFG13-L1N2D1-E-K0KK-01714550,/content/drive/MyDrive/childrenvoice/testdata/...,,1.0,False


✅ json에서 텍스트를 가져온 utt 개수: 400
⚠️ json 파일이 없어서 못 매칭한 케이스 수: 0
⚠️ 라벨 못 찾은 샘플 수: 0
✅ 라벨 있는 샘플 수: 400
✅ 최종 사용 df 샘플 수: 400


Unnamed: 0,split,age,speaker_id,utt_id,audio_path,text,speed,is_augmented
0,train,4,282,K00010282-AFG10-L1N2D1-E-K0KK-00007409,/content/drive/MyDrive/childrenvoice/trainingd...,왕께 인사했어요(인사해떠요).,1.0,False
1,train,4,282,K00020282-AFG10-L1N2D1-E-K0KK-00007425,/content/drive/MyDrive/childrenvoice/trainingd...,섬세한 바이올린.,1.0,False
2,train,4,282,K00030282-AFG10-L1N2D1-E-K0KK-01040217,/content/drive/MyDrive/childrenvoice/trainingd...,깜깜한 밤은 무서워요.,1.0,False
3,train,4,282,K00040282-AFG10-L1N2D1-E-K0KK-01040219,/content/drive/MyDrive/childrenvoice/trainingd...,치즈 케이크.,1.0,False
4,train,4,282,K00050282-AFG10-L1N2D1-E-K0KK-00007474,/content/drive/MyDrive/childrenvoice/trainingd...,무릎을 다쳤어요.,1.0,False


✅ json에서 텍스트를 가져온 utt 개수: 120
⚠️ json 파일이 없어서 못 매칭한 케이스 수: 0
⚠️ 라벨 못 찾은 샘플 수: 0
✅ 라벨 있는 샘플 수: 120
✅ 최종 사용 df 샘플 수: 120


Unnamed: 0,split,age,speaker_id,utt_id,audio_path,text,speed,is_augmented
0,val,4,172,K00010172-AFG10-L1N2D1-E-K0KK-00006450,/content/drive/MyDrive/childrenvoice/valdata/a...,왕께 인사했어요.,1.0,False
1,val,4,172,K00020172-AFG10-L1N2D1-E-K0KK-00006458,/content/drive/MyDrive/childrenvoice/valdata/a...,섬세한 바이올린.,1.0,False
2,val,4,172,K00030172-AFG10-L1N2D1-E-K0KK-00006463,/content/drive/MyDrive/childrenvoice/valdata/a...,깜깜한 밤은 무서워요.,1.0,False
3,val,4,172,K00040172-AFG10-L1N2D1-E-K0KK-00006479,/content/drive/MyDrive/childrenvoice/valdata/a...,치즈 케이크.,1.0,False
4,val,4,172,K00050172-AFG10-L1N2D1-E-K0KK-00006487,/content/drive/MyDrive/childrenvoice/valdata/a...,무릎을 다쳤어요.,1.0,False


✅ json에서 텍스트를 가져온 utt 개수: 200
⚠️ json 파일이 없어서 못 매칭한 케이스 수: 0
⚠️ 라벨 못 찾은 샘플 수: 0
✅ 라벨 있는 샘플 수: 200
✅ 최종 사용 df 샘플 수: 200


Unnamed: 0,split,age,speaker_id,utt_id,audio_path,text,speed,is_augmented
0,test,4,155,K00010155-AFG13-L1N2D1-E-K0KK-01714495,/content/drive/MyDrive/childrenvoice/testdata/...,문을 두드려.,1.0,False
1,test,4,155,K00020155-AFG13-L1N2D1-E-K0KK-00031003,/content/drive/MyDrive/childrenvoice/testdata/...,미끄럼을 타요.,1.0,False
2,test,4,155,K00030155-AFG13-L1N2D1-E-K0KK-00031090,/content/drive/MyDrive/childrenvoice/testdata/...,아이가 밖으로 뛰어갑니다.,1.0,False
3,test,4,155,K00040155-AFG13-L1N2D1-E-K0KK-00031135,/content/drive/MyDrive/childrenvoice/testdata/...,장난감 두 개 주세요.,1.0,False
4,test,4,155,K00050155-AFG13-L1N2D1-E-K0KK-01714550,/content/drive/MyDrive/childrenvoice/testdata/...,체리가 상했어요.,1.0,False



train_df.shape: (400, 8)
val_df.shape   : (120, 8)
test_df.shape  : (200, 8)


In [6]:
# ---------------------------------------------------
#  ffmpeg로 0.7배속 증강
# ---------------------------------------------------
def speed_perturb(in_path: str, out_path: str, factor: float):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    cmd = [
        "ffmpeg", "-y",
        "-i", in_path,
        "-filter:a", f"atempo={factor}",
        out_path,
    ]
    subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        check=True,
    )

FACTOR = 0.7
speed_tag = str(FACTOR).replace(".", "_")   # 0.7 -> "0_7"

AUG_TRAIN_BASE = os.path.join(ROOT_DIR, "trainingdata", f"Sourcedata_speed{speed_tag}")
print("AUG_TRAIN_BASE:", AUG_TRAIN_BASE)

aug_rows = []

print(f"▶ 0.7배속 증강 생성 시작 (라벨 있는 원본 train {len(train_df)}개 기준)")

for idx, row in train_df.iterrows():
    in_wav = row["audio_path"]

    # TRAIN_BASE 기준 상대 경로
    rel_path = os.path.relpath(in_wav, TRAIN_BASE)   # age4/0282/xxx.wav
    base, ext = os.path.splitext(rel_path)

    out_rel = base + f"_x{FACTOR}" + ext             # age4/0282/xxx_x0.7.wav
    out_wav = os.path.join(AUG_TRAIN_BASE, out_rel)

    if not os.path.exists(out_wav):
        speed_perturb(in_wav, out_wav, FACTOR)

    aug_rows.append({
        "split": "train",
        "age": row["age"],
        "speaker_id": row["speaker_id"],
        "utt_id": row["utt_id"],
        "audio_path": out_wav,
        "text": row["text"],
        "speed": FACTOR,
        "is_augmented": True,
    })

aug_df = pd.DataFrame(aug_rows)
print("✅ 0.7배속 증강 train 개수:", len(aug_df))
if len(aug_df) > 0:
    display(aug_df.head())

# 원본 train + 0.7배속 증강 train 합치기
train_df_full = pd.concat([train_df, aug_df], ignore_index=True)
print("train_df_full (원본+0.7) 개수:", len(train_df_full))
display(train_df_full.head())

AUG_TRAIN_BASE: /content/drive/MyDrive/childrenvoice/trainingdata/Sourcedata_speed0_7
▶ 0.7배속 증강 생성 시작 (라벨 있는 원본 train 400개 기준)
✅ 0.7배속 증강 train 개수: 400


Unnamed: 0,split,age,speaker_id,utt_id,audio_path,text,speed,is_augmented
0,train,4,282,K00010282-AFG10-L1N2D1-E-K0KK-00007409,/content/drive/MyDrive/childrenvoice/trainingd...,왕께 인사했어요(인사해떠요).,0.7,True
1,train,4,282,K00020282-AFG10-L1N2D1-E-K0KK-00007425,/content/drive/MyDrive/childrenvoice/trainingd...,섬세한 바이올린.,0.7,True
2,train,4,282,K00030282-AFG10-L1N2D1-E-K0KK-01040217,/content/drive/MyDrive/childrenvoice/trainingd...,깜깜한 밤은 무서워요.,0.7,True
3,train,4,282,K00040282-AFG10-L1N2D1-E-K0KK-01040219,/content/drive/MyDrive/childrenvoice/trainingd...,치즈 케이크.,0.7,True
4,train,4,282,K00050282-AFG10-L1N2D1-E-K0KK-00007474,/content/drive/MyDrive/childrenvoice/trainingd...,무릎을 다쳤어요.,0.7,True


train_df_full (원본+0.7) 개수: 800


Unnamed: 0,split,age,speaker_id,utt_id,audio_path,text,speed,is_augmented
0,train,4,282,K00010282-AFG10-L1N2D1-E-K0KK-00007409,/content/drive/MyDrive/childrenvoice/trainingd...,왕께 인사했어요(인사해떠요).,1.0,False
1,train,4,282,K00020282-AFG10-L1N2D1-E-K0KK-00007425,/content/drive/MyDrive/childrenvoice/trainingd...,섬세한 바이올린.,1.0,False
2,train,4,282,K00030282-AFG10-L1N2D1-E-K0KK-01040217,/content/drive/MyDrive/childrenvoice/trainingd...,깜깜한 밤은 무서워요.,1.0,False
3,train,4,282,K00040282-AFG10-L1N2D1-E-K0KK-01040219,/content/drive/MyDrive/childrenvoice/trainingd...,치즈 케이크.,1.0,False
4,train,4,282,K00050282-AFG10-L1N2D1-E-K0KK-00007474,/content/drive/MyDrive/childrenvoice/trainingd...,무릎을 다쳤어요.,1.0,False


In [12]:

# ---------------------------------------------------
#  train/val/test 라벨링 결과 점검
# ---------------------------------------------------
print(">> 라벨링된 샘플 수 확인")
print("  - train_df_full:", len(train_df_full))
print("  - val_df       :", len(val_df))
print("  - test_df      :", len(test_df))

if len(train_df_full) == 0:
    raise ValueError("train_df_full 이 0개입니다. JSON 매칭(attach_labels) 부분을 먼저 확인하세요.")
if len(val_df) == 0:
    raise ValueError("val_df 가 0개입니다. val 라벨 매칭을 확인하세요.")
if len(test_df) == 0:
    raise ValueError("test_df 가 0개입니다. test 라벨 매칭을 확인하세요.")

# ---------------------------------------------------
#  Whisper 모델 / Processor 로드
# ---------------------------------------------------
MODEL_NAME = "openai/whisper-small"   # tiny/base 등으로 바꿔도 됨

processor = WhisperProcessor.from_pretrained(
    MODEL_NAME,
    language="Korean",
    task="transcribe",
)
feature_extractor = processor.feature_extractor
tokenizer         = processor.tokenizer

model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

# 항상 한국어 transcribe로 고정
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="Korean",
    task="transcribe",
)
model.config.suppress_tokens = []
model.config.use_cache = False

print("CUDA 사용 가능 여부:", torch.cuda.is_available())

# ---------------------------------------------------
#  torchaudio로 오디오 읽기 (불량 파일은 무음으로 대체)
# ---------------------------------------------------
import numpy as np

def load_audio_16k_mono(path: str):
    """
    - torchaudio.load로 wav 읽기
    - 채널 여러 개면 mono로 평균
    - 16kHz로 리샘플
    - 문제가 생기면 1초짜리 무음 반환 (학습은 되게끔)
    """
    try:
        waveform, sr = torchaudio.load(path)  # (channels, time)
    except Exception as e:
        print("⚠️ 오디오 로드 실패, 무음으로 대체:", path, "에러:", e)
        # 1초짜리 무음(16000 샘플)
        return np.zeros(16000, dtype=np.float32)

    # mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        waveform = resampler(waveform)

    waveform = waveform.squeeze(0).numpy()
    return waveform

# ---------------------------------------------------
#  pandas → HF Dataset (Audio 타입 쓰지 않음)
# ---------------------------------------------------
from datasets import Dataset

def make_hf_dataset(df: pd.DataFrame) -> Dataset:
    # 오디오 파일은 바로 안 읽고, 경로 + 텍스트만 Dataset에 넣어둔다
    ds = Dataset.from_pandas(df[["audio_path", "text"]].copy())
    return ds

train_ds = make_hf_dataset(train_df_full)
val_ds   = make_hf_dataset(val_df)
test_ds  = make_hf_dataset(test_df)

print(">> HF Dataset 크기")
print("  - train_ds:", len(train_ds))
print("  - val_ds  :", len(val_ds))
print("  - test_ds :", len(test_ds))

# 여기서 더 이상 train_ds[0] 같은 건 안 찍는다 (빈 경우 에러 방지)

# ---------------------------------------------------
#  전처리 함수: 여기서 torchaudio로 실제 로드
# ---------------------------------------------------
import torchaudio
import numpy as np

def load_audio_16k_mono(path: str):
    """
    - torchaudio로 wav 로드
    - 채널 여러 개면 mono로 평균
    - 16kHz로 리샘플
    - 문제 생기면 1초 무음 반환 (학습은 계속 진행되게)
    """
    try:
        waveform, sr = torchaudio.load(path)  # (channels, time)

        # mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sr != 16000:
            resampler = torchaudio.transforms.Resample(sr, 16000)
            waveform = resampler(waveform)

        return waveform.squeeze(0).numpy()

    except Exception as e:
        print("⚠️ 오디오 로드 실패, 무음으로 대체:", path, "에러:", e)
        return np.zeros(16000, dtype=np.float32)  # 1초짜리 무음

def prepare_dataset(batch):
    paths = batch["audio_path"]           # 이제 경로만 들어 있음
    audio_arrays = [load_audio_16k_mono(p) for p in paths]

    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=16000,
    )
    batch["input_features"] = inputs["input_features"]

    labels = tokenizer(batch["text"])
    batch["labels"] = labels["input_ids"]
    return batch

train_ds_proc = train_ds.map(
    prepare_dataset,
    remove_columns=train_ds.column_names,
    batched=True,
    num_proc=1,
)

val_ds_proc = val_ds.map(
    prepare_dataset,
    remove_columns=val_ds.column_names,
    batched=True,
    num_proc=1,
)

test_ds_proc = test_ds.map(
    prepare_dataset,
    remove_columns=test_ds.column_names,
    batched=True,
    num_proc=1,
)

print(">> 전처리 후 Dataset 크기")
print("  - train_ds_proc:", len(train_ds_proc))
print("  - val_ds_proc  :", len(val_ds_proc))
print("  - test_ds_proc :", len(test_ds_proc))

# ---------------------------------------------------
#  DataCollator + WER/CER metric
# ---------------------------------------------------
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # 1) 오디오 특징
        input_features = [{"input_features": f["input_features"]} for f in features]
        # 2) 라벨 토큰
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.feature_extractor.pad(
            input_features,
            return_tensors="pt",
        )
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            return_tensors="pt",
        )
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch["attention_mask"].ne(1),
            -100,
        )
        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

def compute_metrics(eval_pred):
    pred_ids = eval_pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    label_ids = eval_pred.label_ids
    label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)

    pred_str  = tokenizer.batch_decode(pred_ids,  skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}

>> 라벨링된 샘플 수 확인
  - train_df_full: 800
  - val_df       : 120
  - test_df      : 200
CUDA 사용 가능 여부: True
>> HF Dataset 크기
  - train_ds: 800
  - val_ds  : 120
  - test_ds : 200


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

>> 전처리 후 Dataset 크기
  - train_ds_proc: 800
  - val_ds_proc  : 120
  - test_ds_proc : 200


In [13]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [14]:
output_dir = "/content/whisper_children_0_7"

fp16 = torch.cuda.is_available()

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    learning_rate=1e-5,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    predict_with_generate=True,   # ✅ Seq2SeqTrainingArguments라서 이제 정상
    fp16=torch.cuda.is_available(),
    remove_unused_columns=False,
    generation_max_length=128,       # (선택) 디코딩 길이
    generation_num_beams=1,          # (선택) beam search 안 쓰면 1
)
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds_proc,   # 네가 쓰는 train_ds_proc / train_proc_full 등
    eval_dataset=val_ds_proc,
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,  # 혹은 tokenizer/processor
    compute_metrics=compute_metrics,
)

print("▶ 0.7배속 구성 (train=원본+0.7, val/test=원본) 학습 시작")
train_result = trainer.train()

# 모델 저장 (옵션)
trainer.save_model(os.path.join(output_dir, "final_model"))
processor.save_pretrained(os.path.join(output_dir, "final_processor"))

# ---------------------------------------------------
#  Validation 평가
# ---------------------------------------------------
eval_metrics = trainer.evaluate()
print("\n✅ Validation 결과 (0.7배속 구성):")
print(f"WER: {eval_metrics['eval_wer']:.4f}")
print(f"CER: {eval_metrics['eval_cer']:.4f}")

# ---------------------------------------------------
#  Test set 평가 + 전체 문장 비교 + CSV 저장
# ---------------------------------------------------
print("\n▶ Test set 평가 중...")

test_outputs = trainer.predict(test_ds_proc)
pred_ids = test_outputs.predictions
if isinstance(pred_ids, tuple):
    pred_ids = pred_ids[0]

label_ids = test_outputs.label_ids
label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)

pred_str  = tokenizer.batch_decode(pred_ids,  skip_special_tokens=True)
label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

test_wer = wer_metric.compute(predictions=pred_str, references=label_str)
test_cer = cer_metric.compute(predictions=pred_str, references=label_str)

print("\n✅ Test 결과 (0.7배속 구성, train=원본+0.7 / val,test=원본):")
print(f"Test WER: {test_wer:.4f}")
print(f"Test CER: {test_cer:.4f}")

# 🔹 Test 전체 ref–pred 출력
print("\n🧩 Test 전체 샘플 REF vs PRED\n" + "-"*60)
for i in range(len(pred_str)):
    print(f"\n🔹 샘플 {i+1}")
    print(f"[REF] {label_str[i]}")
    print(f"[PRED] {pred_str[i]}")

# 🔹 CSV로 저장
compare_df = pd.DataFrame({
    "ref": label_str,
    "pred": pred_str,
})
csv_path = os.path.join(output_dir, "predictions_compare0.7(1).csv")
compare_df.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"\n✅ 전체 ref–pred 비교 CSV 저장 완료: {csv_path}")



▶ 0.7배속 구성 (train=원본+0.7, val/test=원본) 학습 시작


Epoch,Training Loss,Validation Loss,Wer,Cer
1,0.2582,0.787157,0.590244,0.35914
2,0.0498,0.798081,0.568293,0.38172
3,0.0084,0.805151,0.565854,0.351075
4,0.0022,0.839439,0.558537,0.367204
5,0.0015,0.831428,0.556098,0.389785


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}
Non-default generation parameters: {'max_length': 448, 'su


✅ Validation 결과 (0.7배속 구성):
WER: 0.5561
CER: 0.3898

▶ Test set 평가 중...

✅ Test 결과 (0.7배속 구성, train=원본+0.7 / val,test=원본):
Test WER: 0.4074
Test CER: 0.3254

🧩 Test 전체 샘플 REF vs PRED
------------------------------------------------------------

🔹 샘플 1
[REF] 문을 두드려.
[PRED] 문을 두두요.

🔹 샘플 2
[REF] 미끄럼을 타요.
[PRED]  मिष्ग्छा यो.

🔹 샘플 3
[REF] 아이가 밖으로 뛰어갑니다.
[PRED] 

🔹 샘플 4
[REF] 장난감 두 개 주세요.
[PRED] 장난감 두 개 주세요.

🔹 샘플 5
[REF] 체리가 상했어요.
[PRED] 채벨가 타겠어요.

🔹 샘플 6
[REF] 과자가 매워요.
[PRED] 과자가 매워요.

🔹 샘플 7
[REF] 물에 넣어요.
[PRED] 무에 넣어요.

🔹 샘플 8
[REF] 나는 선물을 줍니다.
[PRED] नाने सम्होरे जूमिवा.

🔹 샘플 9
[REF] 친구가 노래를 불러요.
[PRED] 친구가 노래를 불러요.

🔹 샘플 10
[REF] 필요한 것을 샀습니다.
[PRED] 필요한 곳이 펴펴입니다.

🔹 샘플 11
[REF] 풀.
[PRED]  Prøy.

🔹 샘플 12
[REF] 이슬에 젖다.
[PRED] ಕಿಕರಿಕಿಕಿ.

🔹 샘플 13
[REF] 개미에게 물어봤어요.
[PRED] 개인에게 물어봤어요.

🔹 샘플 14
[REF] 깨끗한 거울이 있어요.
[PRED] 깨끗한 거울이 있어요.

🔹 샘플 15
[REF] 놀이터가 좋아.
[PRED] 노래가 좋아.

🔹 샘플 16
[REF] 송아지가 말해요.
[PRED] ತದರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರರ

🔹 샘플 17
[REF] 강을 건너요.

In [15]:
print(len(train_ds_proc))

800
