## Import Python libraries

In [56]:
import librosa
import numpy as np
import pandas as pd
import webrtcvad 
import os
import re
import matplotlib.pyplot as plt

## Global variables

In [57]:
# Speaker labels; Speakers[i] is the label given to the clips in AudioFiles[i].
Speakers = [
    'Emily', 'Ethan', 'Olivia', 'Liam', 'Sophia',
    'Jackson', 'Ava', 'Aiden', 'Emma', 'Noah',
    'Isabella', 'Lucas', 'Mia', 'Mason', 'Harper',
    'Elijah', 'Abigail', 'Logan', 'Grace', 'Benjamin',
]
# Number of speakers handled by the pipeline (the list above holds 20 names).
NumOfSpeakers = len(Speakers)
# All loaded audio clips, one inner list per speaker: [[], [], [], ...]
AudioFiles = list()
# DataFrame covering all audio clips (filled in later).
DataFrame = pd.DataFrame()

## Read file

In [58]:
def ReadFile(AudioFiles, base_path='dataset', min_duration=10):
    """Load every sufficiently long ``.flac`` clip found under ``base_path``.

    The directory layout walked is ``base_path/<actor>/<book>/<file>.flac``.
    For each actor directory one list is appended to ``AudioFiles``; it holds
    a ``(samples, sample_rate)`` tuple for every clip that was kept.

    Entries are visited in sorted order so that the position of an actor in
    ``AudioFiles`` is reproducible between runs (``os.listdir`` returns names
    in arbitrary order). Entries that are not directories (for example a
    stray ``.DS_Store``) are skipped instead of raising
    ``NotADirectoryError``.

    Args:
        AudioFiles: List that is extended in place, one inner list per actor.
        base_path: Root directory of the dataset.
        min_duration: Clips shorter than this many seconds are discarded.

    Returns:
        None. The result is delivered through ``AudioFiles``.
    """
    for actor_id in sorted(os.listdir(base_path)):
        actor_path = os.path.join(base_path, actor_id)  # base_path/<actor>
        if not os.path.isdir(actor_path):
            continue

        clips = []
        for book_id in sorted(os.listdir(actor_path)):
            book_path = os.path.join(actor_path, book_id)  # base_path/<actor>/<book>
            if not os.path.isdir(book_path):
                continue

            for file_name in sorted(os.listdir(book_path)):
                if not file_name.endswith('.flac'):
                    continue
                file_path = os.path.join(book_path, file_name)

                # Check the length before loading so that short clips are
                # skipped without calling librosa.load on them.
                if librosa.get_duration(path=file_path) < min_duration:
                    continue

                # sr=None keeps the sampling rate stored in the file.
                samples, sample_rate = librosa.load(file_path, sr=None)
                clips.append((samples, sample_rate))
        AudioFiles.append(clips)

# Fill the module-level AudioFiles list in place with the loaded clips.
ReadFile(AudioFiles)

## VAD and Noise reduction

In [59]:
def VAD():
    """Placeholder for voice activity detection; does nothing and returns None."""

def NoiseReduction():
    """Placeholder for noise reduction; does nothing and returns None."""

## Extract Features

##### Melspectrograms

In [60]:
def Melspectrogram(y, sr):
    """Compute the mel spectrogram of an audio signal.

    Args:
        y: Audio samples, forwarded to librosa as ``y``.
        sr: Sampling rate of ``y``.

    Returns:
        The array produced by ``librosa.feature.melspectrogram`` with its
        default settings. No decibel conversion is applied to the returned
        value; apply ``librosa.power_to_db(result, ref=np.max)`` to it when a
        dB-scaled spectrogram is needed, e.g. for plotting with
        ``librosa.display.specshow(..., x_axis='time', y_axis='mel')``.
    """
    # Only the raw spectrogram is returned, so no dB-scaled copy is computed
    # here; doing so on every call would be discarded work.
    return librosa.feature.melspectrogram(y=y, sr=sr)

##### MFCC coefficients

In [61]:
def MFCC(y, sr):
    """Extract the MFCC features of an audio signal.

    Args:
        y: Audio samples, forwarded to librosa as ``y``.
        sr: Sampling rate of ``y``.

    Returns:
        The array produced by ``librosa.feature.mfcc`` with default settings.
    """
    return librosa.feature.mfcc(y=y, sr=sr)

## Build DataFrame

In [62]:
def BuildDataFrame():
    """Build a feature table with one row per loaded audio clip.

    Reads the module-level ``AudioFiles``, ``Speakers`` and ``NumOfSpeakers``.
    The clips in ``AudioFiles[i]`` are labelled with ``Speakers[i]``; at most
    ``NumOfSpeakers`` speakers are processed.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Melspectrogram``, ``MFCC``
        and ``Speaker``. Each row holds the two feature arrays of one clip
        and the name of its speaker. The frame is empty (but still has the
        three columns) when no clips are available.
    """
    columns = ['Melspectrogram', 'MFCC', 'Speaker']
    rows = []
    # zip stops at the shorter sequence, so having loaded fewer speaker
    # folders than NumOfSpeakers does not raise IndexError.
    for speaker, clips in zip(Speakers[:NumOfSpeakers], AudioFiles):
        for samples, sample_rate in clips:
            rows.append({
                'Melspectrogram': Melspectrogram(samples, sample_rate),
                'MFCC': MFCC(samples, sample_rate),
                'Speaker': speaker,
            })
    # Create the frame once at the end: concatenating inside the loop copies
    # all previously collected rows on every iteration.
    return pd.DataFrame(rows, columns=columns)

# Replace the module-level DataFrame with the extracted feature table.
DataFrame = BuildDataFrame()

In [63]:
# Preview the first five rows of the feature table.
DataFrame[:5]

Unnamed: 0,Melspectrogram,MFCC,Speaker
0,"[[0.008053505, 0.004100157, 0.0042814706, 0.00...","[[-496.41382, -488.91962, -495.58084, -495.352...",Emily
1,"[[0.011456508, 0.007573933, 0.0045426646, 0.00...","[[-366.32916, -360.35193, -398.2274, -442.2688...",Emily
2,"[[0.016795631, 0.005932196, 0.006791729, 0.005...","[[-388.48203, -367.78357, -379.08673, -408.614...",Emily
3,"[[0.014441341, 0.012013311, 0.0070638466, 0.00...","[[-366.47244, -343.3722, -358.45984, -371.5522...",Emily
4,"[[0.016517285, 0.004992439, 0.0033724483, 0.01...","[[-444.45438, -414.44543, -419.1343, -425.3680...",Emily


## Word counts

In [None]:
# Number words ZERO through TEN whose occurrences are counted in the transcripts.
target_words = ["ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE", "TEN"]
dataset_root = 'testest-code'
word_counts = {word: 0 for word in target_words}

# One whole-word, case-insensitive pattern per target word, compiled once
# here instead of being rebuilt for every transcript file.
word_patterns = {
    word: re.compile(r'\b' + word + r'\b', re.IGNORECASE)
    for word in target_words
}

actorNums = 0    # number of actor directories found under dataset_root
chapterNums = 0  # number of transcript files that were read
for actor_folder in os.listdir(dataset_root):
    actor_path = os.path.join(dataset_root, actor_folder)
    if not os.path.isdir(actor_path):
        continue
    actorNums += 1

    for script_folder in os.listdir(actor_path):
        script_path = os.path.join(actor_path, script_folder)
        if not os.path.isdir(script_path):
            continue

        # The transcript is looked up at <actor>/<script>/<actor>-<script>.trans.txt;
        # script folders without that file are not counted as chapters.
        trans_file = os.path.join(script_path, f'{actor_folder}-{script_folder}.trans.txt')
        if not os.path.exists(trans_file):
            continue
        chapterNums += 1

        with open(trans_file, 'r', encoding='utf-8') as f:
            script_text = f.read()

        for word, pattern in word_patterns.items():
            word_counts[word] += len(pattern.findall(script_text))


print(dataset_root)
print('Total actors   : ' + str(actorNums))
print('Total chapters : ' + str(chapterNums))
for word, count in word_counts.items():
    # Left-align the word in a 6-character column.
    print(f'{word:6}: {count}')