<a href="https://colab.research.google.com/github/Yewon9/STT_JEJU/blob/main/Wav2Vec2_Dialect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORT

In [None]:
import librosa
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Wav2Vec2

In [None]:
# The processor and the baseline model are loaded from the same public checkpoint.
BASE_CHECKPOINT = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(BASE_CHECKPOINT)
pretrained_model = Wav2Vec2ForCTC.from_pretrained(BASE_CHECKPOINT)

# NOTE(review): placeholder path -- point this at the real fine-tuned checkpoint.
# NOTE(review): no separate processor is loaded for this model; confirm that the
# fine-tuned checkpoint shares the base checkpoint's vocabulary.
FINE_TUNED_CHECKPOINT = "/path/to/fine-tuned-model"
fine_tuned_model = Wav2Vec2ForCTC.from_pretrained(FINE_TUNED_CHECKPOINT)

In [None]:
def speech_file_to_array_fn(path):
    """Load the audio file at ``path`` and return its samples.

    The file is read with ``librosa.load`` using ``sr=16000``; the sample
    rate that librosa returns alongside the samples is discarded.
    """
    return librosa.load(path, sr=16000)[0]

In [None]:
def predict_speech_pretrained(speech_array):
    """Transcribe ``speech_array`` with the pretrained model.

    The audio is converted to model inputs by the shared ``processor``
    (declared sampling rate 16000), run through ``pretrained_model`` without
    gradient tracking, and the argmax over the last logits dimension is
    decoded back to text.

    Returns whatever ``processor.batch_decode`` produces for the predicted
    ids (callers in this file index it with ``[0]``).
    """
    features = processor(
        speech_array,
        return_tensors="pt",
        sampling_rate=16000,
        padding=True,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = pretrained_model(features.input_values)
    token_ids = output.logits.argmax(dim=-1)
    return processor.batch_decode(token_ids)

In [None]:
def predict_speech_finetuned(speech_array):
    """Transcribe ``speech_array`` with the fine-tuned model.

    The audio is converted to model inputs by the shared ``processor``
    (declared sampling rate 16000), run through ``fine_tuned_model`` without
    gradient tracking, and the argmax over the last logits dimension is
    decoded back to text.

    Returns whatever ``processor.batch_decode`` produces for the predicted
    ids (callers in this file index it with ``[0]``).
    """
    features = processor(
        speech_array,
        return_tensors="pt",
        sampling_rate=16000,
        padding=True,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = fine_tuned_model(features.input_values)
    token_ids = output.logits.argmax(dim=-1)
    return processor.batch_decode(token_ids)

In [None]:
def calculate_wer(reference, hypothesis):
    """Return the word error rate of ``hypothesis`` against ``reference``.

    Both strings are split on whitespace and the word-level Levenshtein
    distance (substitutions, insertions and deletions each cost 1) is
    divided by the number of reference words.

    Args:
        reference: Ground-truth transcription.
        hypothesis: Predicted transcription.

    Returns:
        The WER as a plain ``float``. It can exceed 1.0 when the hypothesis
        has many extra words. For an empty reference the result is ``0.0``
        if the hypothesis is also empty and ``float("inf")`` otherwise, so
        no division by zero (and no ``nan``) is ever produced.
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()

    # An empty reference has no words to normalise by; define the result
    # explicitly instead of dividing by zero.
    if not ref_words:
        return 0.0 if not hyp_words else float("inf")

    # Only the previous row of the edit-distance table is needed to fill in
    # the next one, so keep two rows of ints rather than a full matrix.
    previous = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        current = [i]  # turning i reference words into nothing costs i deletions
        for j, hyp_word in enumerate(hyp_words, start=1):
            if ref_word == hyp_word:
                current.append(previous[j - 1])
            else:
                current.append(
                    1 + min(previous[j], current[j - 1], previous[j - 1])
                )
        previous = current

    return previous[-1] / len(ref_words)

In [None]:
# Accumulates one dict per evaluated audio file; the evaluation loop appends to it.
# Re-run this cell before re-running the loop, or rows will be duplicated.
results = []

In [None]:
# Transcribe every row of ``df`` with both models and record each model's WER.
for _, row in tqdm(df.iterrows(), total=len(df)):
    audio_path = row['Path']
    reference_text = row['Text']
    speech_array = speech_file_to_array_fn(audio_path)

    # The predictors return a list; only its first entry is used here.
    # Pre-trained model prediction
    pretrained_text = predict_speech_pretrained(speech_array)[0]
    # Fine-tuned model prediction
    finetuned_text = predict_speech_finetuned(speech_array)[0]

    # WER is computed against the reference text for each model.
    results.append({
        'Path': audio_path,
        'Reference Text': reference_text,
        'Pretrained Prediction': pretrained_text,
        'FineTuned Prediction': finetuned_text,
        'WER Pretrained': calculate_wer(reference_text, pretrained_text),
        'WER FineTuned': calculate_wer(reference_text, finetuned_text),
    })

In [None]:
# Collect the per-file result dicts into a table (one row per audio file).
results_df = pd.DataFrame(results)

In [None]:
# Quick sanity check: show the first few rows of the comparison table.
print(results_df.head())

In [None]:
# Persist the comparison table to the working directory, without the index column.
results_df.to_csv("model_comparison_results.csv", index=False)