In [34]:
import os
import numpy as np
import librosa
import parselmouth
from google.cloud import speech
from dotenv import load_dotenv
from scipy.interpolate import interp1d

# Google Cloud Speech-to-Text API 설정
load_dotenv()
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.getenv("KEY_PATH")
speech_client = speech.SpeechClient()

In [35]:
def preprocess_audio_in_memory(input_path, target_sr=16000):
    """
    메모리에서 오디오 샘플링 속도를 변환하여 반환.
    """
    y, sr = librosa.load(input_path, sr=None)  # 원래 샘플링 속도로 로드
    if sr != target_sr:
        y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)  # 샘플링 속도 변환
        return y_resampled, target_sr
    return y, sr


def extract_pitch_from_memory(y, sr):
    """
    메모리 상의 오디오 데이터에서 피치를 추출.
    """
    snd = parselmouth.Sound(y, sr)
    pitch = snd.to_pitch()
    times = pitch.xs()
    frequencies = pitch.selected_array['frequency']
    return times, frequencies


def recognize_speech_from_memory(y, sr, language_code="en-US"):
    """
    Google Speech-to-Text API를 이용해 메모리 상의 오디오 데이터에서 단어별 타임스탬프 추출.
    """
    import soundfile as sf
    from io import BytesIO

    # 메모리 상에서 변환된 데이터를 임시 WAV 파일로 저장
    temp_buffer = BytesIO()
    sf.write(temp_buffer, y, sr, format="WAV")
    temp_buffer.seek(0)
    audio_content = temp_buffer.read()

    # Google Speech-to-Text API 처리
    audio = speech.RecognitionAudio(content=audio_content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sr,
        language_code=language_code,
        enable_word_time_offsets=True,
    )
    response = speech_client.recognize(config=config, audio=audio)
    word_timestamps = []
    for result in response.results:
        for word_info in result.alternatives[0].words:
            word_timestamps.append({
                "word": word_info.word,
                "start_time": word_info.start_time.total_seconds(),
                "end_time": word_info.end_time.total_seconds()
            })
    return word_timestamps


def detect_pauses(y, sr):
    """
    메모리 상의 오디오 데이터를 이용해 발화 중 침묵 구간 탐지.
    """
    intervals = librosa.effects.split(y, top_db=30)  # 비침묵 구간 탐지
    pause_durations = []
    for i in range(1, len(intervals)):
        pause_start = intervals[i - 1][1] / sr
        pause_end = intervals[i][0] / sr
        pause_durations.append(pause_end - pause_start)
    return pause_durations


def analyze_audio(user_audio_path, ref_audio_path):
    """
    주요 5가지 음성 특성 분석 및 문장 단위 결과 계산.
    """
    # Step 1: 사용자와 기준 음성을 메모리에서 샘플링 속도 변환
    y_user, sr_user = preprocess_audio_in_memory(user_audio_path)
    y_ref, sr_ref = preprocess_audio_in_memory(ref_audio_path)

    # Step 2: 단어별 타임스탬프 추출
    word_timestamps_user = recognize_speech_from_memory(y_user, sr_user)
    word_timestamps_reference = recognize_speech_from_memory(y_ref, sr_ref)

    # 빈 데이터 확인 및 처리
    if not word_timestamps_user:
        print("Error: User audio contains no recognizable words.")
        return {"Error": "User audio contains no recognizable words."}

    if not word_timestamps_reference:
        print("Error: Reference audio contains no recognizable words.")
        return {"Error": "Reference audio contains no recognizable words."}

    # 단어 리스트 추출
    user_words = [w['word'] for w in word_timestamps_user]
    reference_words = [w['word'] for w in word_timestamps_reference]

    # Step 3: 피치 유사도 계산
    times_user, pitch_user = extract_pitch_from_memory(y_user, sr_user)
    times_ref, pitch_ref = extract_pitch_from_memory(y_ref, sr_ref)
    aligned_pitch_user, aligned_pitch_ref = align_pitch_vectors(times_user, pitch_user, times_ref, pitch_ref)
    pitch_similarity = cosine_similarity(aligned_pitch_user, aligned_pitch_ref)

    # Step 4: 리듬 유사도 계산
    rhythm_similarity = analyze_rhythm(word_timestamps_user, word_timestamps_reference)

    # Step 5: 발화 속도 차이 계산
    user_total_time = word_timestamps_user[-1]['end_time'] - word_timestamps_user[0]['start_time']
    ref_total_time = word_timestamps_reference[-1]['end_time'] - word_timestamps_reference[0]['start_time']
    speech_rate_user = calculate_speech_rate(word_timestamps_user, user_total_time)
    speech_rate_ref = calculate_speech_rate(word_timestamps_reference, ref_total_time)
    speech_rate_difference = abs(speech_rate_user - speech_rate_ref)

    # Step 6: 발화 중단 빈도 차이 계산
    pause_user = detect_pauses(y_user, sr_user)
    pause_ref = detect_pauses(y_ref, sr_ref)
    pause_frequency_diff = calculate_pause_frequency(pause_user, pause_ref)

    # Step 7: 단어 누락 비율 계산
    word_omission_rate = compare_word_omission(user_words, reference_words)

    # 결과 집계
    results = {
        "Pitch Contour Similarity": pitch_similarity,
        "Rhythm Similarity": rhythm_similarity,
        "Speech Rate Difference": speech_rate_difference,
        "Pause Frequency Difference": pause_frequency_diff,
        "Word Omission Rate": word_omission_rate,
    }

    return results


# 문장 단위로 분석
def analyze_sentence(user_audio_path, ref_audio_path):
    """문장 단위로 음성 특징을 계산."""
    return analyze_audio(user_audio_path, ref_audio_path)

In [27]:
# 실행
user_audio = "./test_data/child_original/jaemin_1.wav"
ref_audio = "./test_data/adult_tts/tts_1.wav"

# 분석 수행
sentence_results = analyze_sentence(user_audio, ref_audio)
print("문장 단위 음성 특징 차이:", sentence_results)

문장 단위 음성 특징 차이: {'Pitch Contour Similarity': 0.6105258202051731, 'Rhythm Similarity': 0.946590294908157, 'Speech Rate Difference': 0.22770398481973464, 'Pause Frequency Difference': 0, 'Word Omission Rate': 0.0}


In [59]:
import os
import json
import numpy as np
import librosa
import parselmouth
from scipy.interpolate import interp1d
from google.cloud import speech
from dotenv import load_dotenv
import re

# Google Cloud Speech-to-Text API 설정
load_dotenv()
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.getenv("KEY_PATH")
speech_client = speech.SpeechClient()

In [60]:
def extract_number_from_filename(filename):
    """파일 이름에서 숫자를 추출."""
    match = re.search(r'(\d+)', filename)
    return int(match.group(1)) if match else float('inf')

def preprocess_audio(input_path, target_sr=16000):
    """오디오를 읽고 샘플링 속도를 변환."""
    y, sr = librosa.load(input_path, sr=None)
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    return y, target_sr

def extract_pitch(y, sr):
    """피치를 시간별로 추출."""
    snd = parselmouth.Sound(y, sr)
    pitch = snd.to_pitch()
    times = pitch.xs()
    frequencies = pitch.selected_array['frequency']
    return times, frequencies

def recognize_speech(y, sr, language_code="en-US"):
    """Google Speech-to-Text API를 사용해 단어별 타임스탬프 추출."""
    import soundfile as sf
    from io import BytesIO

    buffer = BytesIO()
    sf.write(buffer, y, sr, format="WAV")
    buffer.seek(0)
    audio = speech.RecognitionAudio(content=buffer.read())
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sr,
        language_code=language_code,
        enable_word_time_offsets=True,
    )
    response = speech_client.recognize(config=config, audio=audio)

    word_timestamps = []
    for result in response.results:
        for word_info in result.alternatives[0].words:
            word_timestamps.append({
                "word": word_info.word,
                "start_time": word_info.start_time.total_seconds(),
                "end_time": word_info.end_time.total_seconds(),
            })
    return word_timestamps

def calculate_cosine_similarity(vec1, vec2):
    """벡터 간 코사인 유사도 계산."""
    if len(vec1) == 0 or len(vec2) == 0:
        return 0.0
    min_len = min(len(vec1), len(vec2))  # 최소 길이에 맞춤
    vec1 = vec1[:min_len]
    vec2 = vec2[:min_len]
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def calculate_word_based_pitch_similarity(word_timestamps, y, sr):
    """단어별 음조(피치) 유사도 계산."""
    pitch_similarities = []
    times, frequencies = extract_pitch(y, sr)

    for word in word_timestamps:
        start_time = word["start_time"]
        end_time = word["end_time"]

        mask = (times >= start_time) & (times <= end_time)
        word_pitch = frequencies[mask]
        if len(word_pitch) > 0:
            pitch_similarities.append(word_pitch.mean())  # 평균 피치

    return pitch_similarities

def calculate_rhythm_similarity(word_timestamps):
    """단어별 리듬 유사도 계산."""
    gaps = []
    for i in range(1, len(word_timestamps)):
        gap = word_timestamps[i]["start_time"] - word_timestamps[i - 1]["end_time"]
        gaps.append(gap)

    return gaps

def analyze_audio(user_audio_path, ref_audio_path):
    """사용자와 기준 음성을 분석하여 결과 반환."""
    # 1. 오디오 전처리 (샘플링 속도 통일)
    y_user, sr_user = preprocess_audio(user_audio_path)
    y_ref, sr_ref = preprocess_audio(ref_audio_path)

    # 2. 음성 텍스트 인식
    user_timestamps = recognize_speech(y_user, sr_user)
    ref_timestamps = recognize_speech(y_ref, sr_ref)

    if len(user_timestamps) != len(ref_timestamps):
        print(f"Warning: Word count mismatch between user ({len(user_timestamps)}) and reference ({len(ref_timestamps)}).")

    # 3. 피치 패턴 유사도
    user_pitch = calculate_word_based_pitch_similarity(user_timestamps, y_user, sr_user)
    ref_pitch = calculate_word_based_pitch_similarity(ref_timestamps, y_ref, sr_ref)
    pitch_similarity = calculate_cosine_similarity(user_pitch, ref_pitch)

    # 4. 리듬 유사도
    user_rhythm = calculate_rhythm_similarity(user_timestamps)
    ref_rhythm = calculate_rhythm_similarity(ref_timestamps)
    rhythm_similarity = calculate_cosine_similarity(user_rhythm, ref_rhythm)

    # 5. 발화 속도
    user_speed = len(user_timestamps) / (user_timestamps[-1]["end_time"] - user_timestamps[0]["start_time"])
    ref_speed = len(ref_timestamps) / (ref_timestamps[-1]["end_time"] - ref_timestamps[0]["start_time"])
    speed_ratio = abs(user_speed / ref_speed)

    # 6. 발화 중단 빈도
    user_pauses = calculate_rhythm_similarity(user_timestamps)
    ref_pauses = calculate_rhythm_similarity(ref_timestamps)
    pause_similarity = calculate_cosine_similarity(user_pauses, ref_pauses)

    # 7. 단어 누락 및 잘못 인식된 단어
    user_words = [word["word"] for word in user_timestamps]
    ref_words = [word["word"] for word in ref_timestamps]
    missed_words = list(set(ref_words) - set(user_words))
    mispronounced_words = list(set(user_words) - set(ref_words))

    return {
        "Pitch Similarity": pitch_similarity,
        "Rhythm Similarity": rhythm_similarity,
        "Speed Ratio": speed_ratio,
        "Pause Similarity": pause_similarity,
        "Missed Words": missed_words,
        "Mispronounced Words": mispronounced_words,
    }

def compare_audio_folders(user_folder, ref_folder, output_file="results.json"):
    """폴더 간 음성 파일 비교."""
    user_files = sorted(os.listdir(user_folder), key=extract_number_from_filename)
    ref_files = sorted(os.listdir(ref_folder), key=extract_number_from_filename)

    if len(user_files) != len(ref_files):
        raise ValueError("폴더 간 파일 수가 일치하지 않습니다.")

    results = []
    for user_file, ref_file in zip(user_files, ref_files):
        user_audio_path = os.path.join(user_folder, user_file)
        ref_audio_path = os.path.join(ref_folder, ref_file)
        print(f"Processing: {user_file} vs {ref_file}")
        result = analyze_audio(user_audio_path, ref_audio_path)
        result["File Pair"] = f"{user_file} vs {ref_file}"
        results.append(result)

    with open(output_file, "w") as f:
        json.dump(results, f, indent=4)
    print(f"Results saved to {output_file}")

def main():
    user_folder = "./test_data/child_original_audio_100"
    ref_folder = "./test_data/tts_tortoise_audio_100"
    compare_audio_folders(user_folder, ref_folder)

In [61]:
if __name__ == "__main__":
    main()

Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822489.wav vs sentence_1.wav


  return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))


Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822502.wav vs sentence_2.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822529.wav vs sentence_3.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822580.wav vs sentence_4.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822594.wav vs sentence_5.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822603.wav vs sentence_6.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822635.wav vs sentence_7.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822655.wav vs sentence_8.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822681.wav vs sentence_9.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822693.wav vs sentence_10.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822705.wav vs sentence_11.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822723.wav vs sentence_12.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822746.wav vs sentence_13.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-04822759.wav vs sentence_14.wav
Processing: E0001A671-BFG33-L1N2D1-E-F5NX-0482