In [1]:
import librosa
from librosa.util import normalize

def load_and_preprocess_audio(audio_path, sr=16000):
    audio, sr = librosa.load(audio_path, sr=sr)
    audio = normalize(librosa.to_mono(audio))
    return audio, sr

In [2]:
from crepe import predict

def extract_crepe_features(audio, sr=16000, model_capacity='full'):
    time, frequency, confidence, activation = predict(audio, sr, model_capacity=model_capacity, viterbi=True)
    return frequency, confidence


In [3]:
import json
import os

def load_json_labels(json_folder_path):
    pitch_labels = {}
    for json_filename in os.listdir(json_folder_path):
        if json_filename.endswith('.json'):
            base_filename = json_filename[:-5]
            with open(os.path.join(json_folder_path, json_filename), 'r', encoding='utf-8') as file:
                data = json.load(file)
            pitch_labels[base_filename] = data['notes']
    return pitch_labels

def extract_single_label(notes):
    if notes:
        return notes[0]['midi_num']
    return None


In [4]:
def prepare_dataset(folder_path, json_folder_path, sr=16000):
    features = []
    labels = []
    pitch_labels = load_json_labels(json_folder_path)

    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith('.wav'):
            base_filename = file_name[:-4]
            label_found = False

            for label_key in pitch_labels.keys():
                if base_filename.startswith(label_key):
                    audio_path = os.path.join(folder_path, file_name)
                    audio, sample_rate = load_and_preprocess_audio(audio_path, sr)
                    if audio is not None:
                        frequency, confidence = extract_crepe_features(audio, sample_rate)
                        if len(frequency) > 0:
                            features.append(frequency)
                            extracted_label = extract_single_label(pitch_labels[label_key])
                            labels.append(extracted_label)
                        label_found = True
                        break
            if not label_found:
                print(f"No label found for {file_name}")

    features = np.array(features) if features else print("No features collected")
    labels = np.array(labels) if labels else print("No labels collected")

    return features, labels


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def build_simple_model(input_shape, num_classes):
    model = Sequential([
        Dense(128, activation='relu', input_shape=input_shape),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [6]:
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

def cross_validate_model(X, y, n_splits=5, epochs=30):
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cvscores = []
    
    for train, test in kfold.split(X, np.argmax(y, axis=1)):
        model = build_simple_model(X.shape[1:], y.shape[1])
        history = model.fit(X[train], y[train], epochs=epochs, validation_data=(X[test], y[test]), verbose=1)
        scores = model.evaluate(X[test], y[test], verbose=0)
        print(f"Fold accuracy: {scores[1] * 100}%")
        cvscores.append(scores[1] * 100)
        
    print("Average accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))


In [9]:
audio_dir = "C:\\Users\\ksma0\\University\\Speech\\Processed_Test_4\\Processed_Test_4"
json_folder_path = "C:\\Users\\ksma0\\University\\Speech\\data\\data\\Validation\\Labeling"

X, y = prepare_dataset(audio_dir, json_folder_path)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

cross_validate_model(X, y_categorical)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 929ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 951ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1s/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 857ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 748ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 726ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 742ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 674ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 714ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 716ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 748ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 736ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 732ms/step
[1m12/12[0m [32m━━━━

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (734,) + inhomogeneous part.

In [10]:
import numpy as np
import librosa
from librosa.util import normalize
import os
import json
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from crepe import predict

# JSON 파일 로드 함수
def load_json_labels(json_folder_path):
    pitch_labels = {}
    for json_filename in os.listdir(json_folder_path):
        if json_filename.endswith('.json'):
            base_filename = json_filename[:-5]
            with open(os.path.join(json_folder_path, json_filename), 'r', encoding='utf-8') as file:
                data = json.load(file)
            pitch_labels[base_filename] = data['notes']
    return pitch_labels

# 첫 번째 라벨 추출 함수
def extract_single_label(notes):
    if notes:
        return notes[0]['midi_num']
    return None

# 오디오 파일 로드 및 전처리 함수
def load_and_preprocess_audio(audio_path, sr=16000):
    audio, sr = librosa.load(audio_path, sr=sr)
    audio = normalize(librosa.to_mono(audio))
    return audio, sr

# CREPE 특징 추출 함수
def extract_crepe_features(audio, sr=16000, model_capacity='full'):
    time, frequency, confidence, activation = predict(audio, sr, model_capacity=model_capacity)
    return frequency, confidence

# 데이터셋 준비 함수
def prepare_dataset(folder_path, json_folder_path, sr=16000):
    features = []
    labels = []
    pitch_labels = load_json_labels(json_folder_path)

    for file_name in os.listdir(folder_path):
        if file_name.lower().endswith('.wav'):
            base_filename = file_name[:-4]
            label_found = False

            for label_key in pitch_labels.keys():
                if base_filename.startswith(label_key):
                    audio_path = os.path.join(folder_path, file_name)
                    audio, sample_rate = load_and_preprocess_audio(audio_path, sr)
                    if audio is not None:
                        frequency, confidence = extract_crepe_features(audio, sample_rate)
                        if len(frequency) > 0:
                            features.append(frequency)
                            extracted_label = extract_single_label(pitch_labels[label_key])
                            labels.append(extracted_label)
                        label_found = True
                        break
            if not label_found:
                print(f"No label found for {file_name}")

    if features:
        # 모든 특징 배열의 길이를 동일하게 패딩
        max_length = max([len(f) for f in features])
        features = np.array([np.pad(f, (0, max_length - len(f)), 'constant') for f in features])
    else:
        print("No features collected")

    labels = np.array(labels) if labels else print("No labels collected")

    return features, labels

# 모델 빌드 함수
def build_simple_model(input_shape, num_classes):
    model = Sequential([
        Dense(128, activation='relu', input_shape=input_shape),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# 모델 교차 검증 함수
def cross_validate_model(X, y, n_splits=5, epochs=30):
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cvscores = []
    
    for train, test in kfold.split(X, np.argmax(y, axis=1)):
        model = build_simple_model(X.shape[1:], y.shape[1])
        history = model.fit(X[train], y[train], epochs=epochs, validation_data=(X[test], y[test]), verbose=1)
        scores = model.evaluate(X[test], y[test], verbose=0)
        print(f"Fold accuracy: {scores[1] * 100}%")
        cvscores.append(scores[1] * 100)
        
    print("Average accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

# 데이터 디렉토리 설정
audio_dir = "C:\\Users\\ksma0\\University\\Speech\\Processed_Test_4\\Processed_Test_4"
json_folder_path = "C:\\Users\\ksma0\\University\\Speech\\data\\data\\Validation\\Labeling"

# 데이터 준비
X, y = prepare_dataset(audio_dir, json_folder_path)

# 레이블 인코딩 및 원핫 인코딩
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# 교차 검증을 통한 모델 평가
cross_validate_model(X, y_categorical)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 892ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 824ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 801ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 789ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 794ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 857ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 854ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 876ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 859ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 872ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 874ms/step
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 857ms/step
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 848ms/step
[1m12/12[0m [32

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.4091 - loss: 247.7553 - val_accuracy: 0.7007 - val_loss: 51.0627
Epoch 2/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5123 - loss: 65.0754 - val_accuracy: 0.6395 - val_loss: 16.0487
Epoch 3/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.3162 - loss: 34.3356 - val_accuracy: 0.6259 - val_loss: 5.0816
Epoch 4/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5423 - loss: 8.6503 - val_accuracy: 0.6667 - val_loss: 5.8253
Epoch 5/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6543 - loss: 3.2561 - val_accuracy: 0.6395 - val_loss: 4.5967
Epoch 6/30
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7245 - loss: 4.9668 - val_accuracy: 0.6803 - val_loss: 3.4988
Epoch 7/30
[1m19/19[0m [32m━━━━━━━━━━━━━━