In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import tensorflow as tf
from tqdm import tqdm
from glob import glob
from google.colab import drive

import librosa
import librosa.display as dsp
import IPython.display as ipd

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Mount Google Drive at /content/drive so the data folder below is reachable.
drive.mount('/content/drive')

%cd '/content/drive/MyDrive/deep-voice/data/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/deep-voice/data


In [10]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use the GPU when one is available, otherwise the CPU

In [None]:
import random

def seed_everything(seed):
    """Seed every random number generator this notebook has in scope.

    Covers Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy's global generator, TensorFlow and PyTorch.

    Args:
        seed: Integer seed applied to all of the generators above.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read when an interpreter starts, so setting it here
    # only influences Python processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # torch is imported and used for device selection in this notebook, so
    # its generators must be seeded too for runs to be repeatable.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable

# Fix all seeds to 813 before any data handling.
seed_everything(813)

In [None]:
# Directory paths for the two groups of audio files (relative to the current working directory)
deepvoice_dir = './abnormal_data'
normal_dir = './normal_data'

In [None]:
def count_files_in_directory(directory):
    """Count the regular files located directly inside ``directory``.

    Sub-directories are not counted and are not descended into.
    """
    file_count = 0
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        if os.path.isfile(entry_path):
            file_count += 1
    return file_count

# Report how many files each data folder contains.
print(f"{deepvoice_dir} 안에 있는 파일 개수: {count_files_in_directory(deepvoice_dir)}")
print(f"{normal_dir} 안에 있는 파일 개수: {count_files_in_directory(normal_dir)}")

./abnormal_data 안에 있는 파일 개수: 1772
./normal_data 안에 있는 파일 개수: 1391


In [11]:
# Because of limited RAM, only 800 audio files per folder are used by default.

def find_max_length(directory, limit=800, sr=None):
    """Return the sample count of the longest clip among the selected files.

    Args:
        directory: Folder containing the audio files.
        limit: Maximum number of files to inspect, taken from the start of
            the name-sorted listing. ``None`` inspects every file.
        sr: Sampling rate forwarded to ``librosa.load``. ``None`` keeps each
            file's native rate.
            NOTE(review): the feature extractors in this notebook default to
            sr=16000; confirm the files are 16 kHz or pass ``sr=16000`` here.

    Returns:
        Length in samples of the longest loaded clip, or 0 when the folder
        holds no files.
    """
    # os.listdir returns names in arbitrary order, so sort before slicing to
    # make the chosen subset reproducible. Non-file entries (e.g. checkpoint
    # folders) are skipped because they cannot be loaded as audio.
    filenames = sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )[:limit]

    max_length = 0
    for filename in filenames:
        audio, _ = librosa.load(os.path.join(directory, filename), sr=sr)
        max_length = max(max_length, len(audio))
    return max_length

def load_and_pad_audio(directory, max_length, limit=800, sr=None):
    """Load audio files and force every clip to exactly ``max_length`` samples.

    Args:
        directory: Folder containing the audio files.
        max_length: Target number of samples per clip.
        limit: Maximum number of files to load, taken from the start of the
            name-sorted listing. ``None`` loads every file.
        sr: Sampling rate forwarded to ``librosa.load``. ``None`` keeps each
            file's native rate.

    Returns:
        List of arrays, one per loaded file, each ``max_length`` samples long.
    """
    # os.listdir returns names in arbitrary order, so sort before slicing to
    # make the chosen subset reproducible. Non-file entries are skipped
    # because they cannot be loaded as audio.
    filenames = sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )[:limit]

    audios = []
    for filename in filenames:
        audio, _ = librosa.load(os.path.join(directory, filename), sr=sr)
        if len(audio) > max_length:
            # Trim clips that are longer than the target length.
            audio = audio[:max_length]
        else:
            # Zero-pad the tail of clips that are shorter than the target.
            audio = np.pad(audio, (0, max_length - len(audio)), mode='constant')
        audios.append(audio)
    return audios

In [12]:
# def calculate_max_length(directory):
#     max_length = 0
#     for filename in os.listdir(directory):
#         path = os.path.join(directory, filename)
#         audio, sr = librosa.load(path, sr=16000)
#         if len(audio) > max_length:
#             max_length = len(audio)
#     return max_length

# print(calculate_max_length(deepvoice_dir))

158367


In [13]:
# def calculate_max_length(directory):
#     max_length = 0
#     for filename in os.listdir(directory):
#         path = os.path.join(directory, filename)
#         audio, sr = librosa.load(path, sr=16000)
#         if len(audio) > max_length:
#             max_length = len(audio)
#     return max_length

# print(calculate_max_length(normal_dir))

227955


In [None]:
# Find the maximum clip length in each of the two directories
max_length1 = find_max_length(deepvoice_dir)
max_length2 = find_max_length(normal_dir)
final_max_length = max(max_length1, max_length2)

# Load the audio from each directory, padding every clip to the longer of the two maxima
deepvoice_audios = load_and_pad_audio(deepvoice_dir, final_max_length)
normal_audios = load_and_pad_audio(normal_dir, final_max_length)

In [None]:
def check_dimensions(list1, list2):
    """Print whether every element of both lists has one common shape.

    Each element is converted with ``np.array`` and its ``shape`` collected.
    The "same" message is printed only when each list yields exactly one
    distinct shape and that shape is shared by both lists.
    """
    first_shapes = set()
    for element in list1:
        first_shapes.add(np.array(element).shape)
    second_shapes = set(np.array(element).shape for element in list2)

    # Both sets must hold a single shape, and it must be the same one.
    uniform = (
        len(first_shapes) == 1
        and len(second_shapes) == 1
        and first_shapes == second_shapes
    )
    if not uniform:
        print("두 리스트의 요소 중에 서로 다른 차원을 가지는 것이 있습니다.")
        return
    print("두 리스트의 모든 요소는 동일한 차원을 가집니다.")
# Confirm the clips loaded from both folders share one common shape.
check_dimensions(deepvoice_audios, normal_audios)

두 리스트의 모든 요소는 동일한 차원을 가집니다.


In [None]:
def extract_melspectrogram(audio_data, sr=16000, n_mels=128, hop_length=160, n_fft=400):
    """Compute a decibel-scaled mel spectrogram for every clip in ``audio_data``.

    Args:
        audio_data: Iterable of already-loaded audio sample arrays.
        sr: Sampling rate passed to librosa.
        n_mels: Number of mel bands.
        hop_length: Hop between successive frames, in samples.
        n_fft: FFT window size, in samples.

    Returns:
        List with one dB-scaled mel spectrogram per input clip, in input order.
    """
    def to_db_mel(signal):
        # librosa.feature.melspectrogram works directly on a loaded sample array.
        power = librosa.feature.melspectrogram(
            y=signal, sr=sr, n_mels=n_mels, hop_length=hop_length, n_fft=n_fft
        )
        # Convert to the decibel scale, with np.max as the reference.
        return librosa.power_to_db(power, ref=np.max)

    return [to_db_mel(signal) for signal in audio_data]

In [None]:
def extract_mfcc(audio_data, sr=16000, n_mfcc=100, hop_length=160, n_fft=400):
    """Compute MFCCs for every clip in ``audio_data``.

    Args:
        audio_data: Iterable of already-loaded audio sample arrays.
        sr: Sampling rate passed to librosa.
        n_mfcc: Number of MFCC coefficients requested per frame.
        hop_length: Hop between successive frames, in samples.
        n_fft: FFT window size, in samples.

    Returns:
        List with one MFCC result per input clip, in input order.
    """
    return [
        librosa.feature.mfcc(
            y=signal, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft
        )
        for signal in audio_data
    ]

In [None]:
# Feature extraction: mel spectrograms and MFCCs for each group of clips
X_deepvoice_mel, X_deepvoice_mfcc = extract_melspectrogram(deepvoice_audios), extract_mfcc(deepvoice_audios)
X_normal_mel, X_normal_mfcc = extract_melspectrogram(normal_audios), extract_mfcc(normal_audios)

In [None]:
# Persist the extracted features as .npy files. The paths are relative, so
# the files are written to the current working directory.
np.save('X_deepvoice_mel.npy', X_deepvoice_mel)
np.save('X_deepvoice_mfcc.npy', X_deepvoice_mfcc)
np.save('X_normal_mel.npy', X_normal_mel)
np.save('X_normal_mfcc.npy', X_normal_mfcc)