In [1]:
!uv pip install transformers datasets evaluate jiwer torch accelerate tensorboard torchcodec soundfile librosa

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m10 packages[0m [2min 98ms[0m[0m


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
from dotenv import load_dotenv
load_dotenv("../.env")

In [7]:
import torch
import numpy as np
import soundfile as sf
from io import BytesIO
from datasets import load_dataset, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate

# Configuration
MODEL_NAME = "openai/whisper-large-v3-turbo"  # Options: tiny, base, small, medium, large
LANGUAGE = "korean"
TASK = "transcribe"
OUTPUT_DIR = "./whisper-korean-zeroth"

# Load dataset
print("Loading dataset...")
raw_datasets = load_dataset("Bingsu/zeroth-korean")

# Initialize model components
print("Loading model components...")
feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)

print("Resampling audio to 16kHz...")
# IMPORTANT: Set decode=False to prevent automatic audio decoding
raw_datasets = raw_datasets.cast_column("audio", Audio(sampling_rate=16000, decode=False))

# Data collator that handles audio processing on-the-fly
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Data collator that processes audio on-the-fly and pads features/labels"""
    processor: Any
    
    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        # Extract audio arrays and text
        audio_arrays = []
        texts = []
        
        for feature in features:
            # Manually decode audio using soundfile to avoid torchcodec
            audio_dict = feature["audio"]
            
            # Read audio from bytes using soundfile
            if "bytes" in audio_dict and audio_dict["bytes"] is not None:
                audio_bytes = audio_dict["bytes"]
                audio_array, sample_rate = sf.read(BytesIO(audio_bytes))
                
                # Resample if needed (simple resampling)
                if sample_rate != 16000:
                    # Use librosa for better resampling if available
                    try:
                        import librosa
                        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
                    except ImportError:
                        # Simple linear interpolation if librosa not available
                        from scipy import signal
                        num_samples = int(len(audio_array) * 16000 / sample_rate)
                        audio_array = signal.resample(audio_array, num_samples)
            elif "path" in audio_dict and audio_dict["path"] is not None:
                # Read from file path
                audio_array, sample_rate = sf.read(audio_dict["path"])
                if sample_rate != 16000:
                    try:
                        import librosa
                        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
                    except ImportError:
                        from scipy import signal
                        num_samples = int(len(audio_array) * 16000 / sample_rate)
                        audio_array = signal.resample(audio_array, num_samples)
            else:
                raise ValueError("Audio dict must have either 'bytes' or 'path'")
            
            audio_arrays.append(audio_array)
            texts.append(feature["text"])
        
        # Process audio to get input features
        input_features = self.processor.feature_extractor(
            audio_arrays,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features
        
        # Clone to avoid gradient computation issues
        input_features = input_features.clone()
        
        # Tokenize text to get labels
        label_features = self.processor.tokenizer(
            texts,
            return_tensors="pt",
            padding=True
        )
        
        labels = label_features["input_ids"].clone()
        
        # Replace padding with -100 to ignore in loss
        labels = labels.masked_fill(
            label_features.attention_mask.ne(1), -100
        )
        
        # Remove decoder_start_token_id if present
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
        
        return {
            "input_features": input_features,
            "labels": labels
        }

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Load evaluation metric
metric = evaluate.load("cer")

def compute_metrics(pred):
    """Compute Character Error Rate (CER)"""
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 with pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Decode predictions and labels
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute CER
    cer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"cer": cer}

# Load pre-trained model
print("Loading pre-trained model...")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

# Set language and task for generation
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.generation_config.language = LANGUAGE
model.generation_config.task = TASK

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,  # Reduced since no gradient checkpointing
    gradient_accumulation_steps=8,  # Compensate for smaller batch
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=5000,
    gradient_checkpointing=False,  # Disabled to avoid graph reuse issues
    fp16=True,
    eval_strategy="steps",
    remove_unused_columns=False,  # Keep audio and text columns for on-the-fly processing
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
    dataloader_num_workers=0,  # Disable multiprocessing in dataloader
    # -----------------------------
    # HuggingFace Hub 관련 설정
    # -----------------------------
    hub_model_id="brainer/whisper-korean-zeroth-v1",  # 만들고 싶은 repo 이름
    push_to_hub=True,  # 이것만 True로 바꾸면 Trainer가 자동으로 push
)

# Initialize trainer
print("Initializing trainer...")
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=raw_datasets["train"],
    eval_dataset=raw_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor,  # Updated from tokenizer
)

# Start training
print("Starting training...")
print("Note: Audio is processed on-the-fly, so first epoch may be slower")
trainer.train()

# Save final model
print("Saving model...")
trainer.save_model(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)

print(f"Training complete! Model saved to {OUTPUT_DIR}")

# Evaluation on test set
print("\nEvaluating on test set...")
results = trainer.evaluate()
print(f"Test CER: {results['eval_cer']:.2f}%")

# Example inference
print("\nExample inference:")
test_sample = raw_datasets["test"][0]
audio_dict = test_sample["audio"]

# Decode audio manually
if "bytes" in audio_dict and audio_dict["bytes"] is not None:
    audio_array, sample_rate = sf.read(BytesIO(audio_dict["bytes"]))
elif "path" in audio_dict and audio_dict["path"] is not None:
    audio_array, sample_rate = sf.read(audio_dict["path"])

# Resample if needed
if sample_rate != 16000:
    try:
        import librosa
        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=16000)
    except ImportError:
        from scipy import signal
        num_samples = int(len(audio_array) * 16000 / sample_rate)
        audio_array = signal.resample(audio_array, num_samples)

input_features = processor(
    audio_array, 
    sampling_rate=16000, 
    return_tensors="pt"
).input_features

# Generate transcription
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

with torch.no_grad():
    predicted_ids = model.generate(input_features.to(device))
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

print(f"Reference: {test_sample['text']}")
print(f"Prediction: {transcription}")

Loading dataset...
Loading model components...
Resampling audio to 16kHz...


Downloading builder script: 0.00B [00:00, ?B/s]

Loading pre-trained model...
Initializing trainer...
Starting training...
Note: Audio is processed on-the-fly, so first epoch may be slower


Step,Training Loss,Validation Loss,Cer
1000,0.1323,0.183128,4.95446
2000,0.0789,0.123526,3.607883
3000,0.034,0.099197,2.068938
4000,0.0403,0.078485,1.574278
5000,0.0243,0.067948,1.291614


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Saving model...


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...-zeroth/training_args.bin: 100%|##########| 6.03kB / 6.03kB            

  ...16468.a77a997ae15c.2051.0: 100%|##########| 50.0kB / 50.0kB            

  ...-zeroth/model.safetensors:   1%|1         | 41.8MB / 3.24GB            

Training complete! Model saved to ./whisper-korean-zeroth

Evaluating on test set...


Test CER: 1.29%

Example inference:
Reference: 지난해 삼 월 김 전 장관의 동료인 장동련 홍익대 교수가 민간 자문단장으로 위촉되면서 본격적인 공모와 개발 작업에 들어갔다
Prediction: 지난해 삼 월 김 전 장관의 동료인 장동련 홍익대 교수가 민간 자문단장으로 위촉되면서 본격적인 공모와 개발 작업에 들어갔다
