In [None]:
import numpy as np
import librosa
from librosa.util import normalize
import librosa.display
import matplotlib.pyplot as plt
import os
import json
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, GlobalAveragePooling2D, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
def load_json_labels(json_folder_path):
    """Collect the ``'notes'`` entry of every ``.json`` file in a folder.

    Args:
        json_folder_path: Directory whose direct entries are scanned for
            file names ending in ``.json`` (case-sensitive).

    Returns:
        dict mapping each file name, with the trailing ``.json`` removed,
        to the value stored under the ``'notes'`` key of that file.

    Raises:
        KeyError: If a JSON file has no ``'notes'`` key.
    """
    suffix = '.json'
    notes_by_name = {}
    for entry in os.listdir(json_folder_path):
        if not entry.endswith(suffix):
            continue
        full_path = os.path.join(json_folder_path, entry)
        with open(full_path, 'r', encoding='utf-8') as handle:
            parsed = json.load(handle)
        # The dictionary key is the file name without its '.json' extension.
        notes_by_name[entry[:-len(suffix)]] = parsed['notes']
    return notes_by_name

def extract_single_label(notes):
    """Return the ``'midi_num'`` of the first note, or ``None`` without notes.

    Args:
        notes: Sequence of note mappings; may be empty or ``None``.

    Returns:
        The ``'midi_num'`` value of ``notes[0]``, or ``None`` when ``notes``
        is falsy.
    """
    if not notes:
        return None
    first_note = notes[0]
    return first_note['midi_num']

In [None]:
def load_and_preprocess_audio(audio_path, sr=22050, min_length=128):
    """Load an audio file, normalize it, and enforce a minimum sample count.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.
        sr: Sampling rate passed to ``librosa.load``.
        min_length: Minimum number of samples in the returned signal.

    Returns:
        Tuple ``(audio, sr)``: the signal after ``librosa.to_mono`` and
        ``librosa.util.normalize``, and the sampling rate reported by
        ``librosa.load``.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=sr)
    mono = librosa.to_mono(waveform)
    waveform = librosa.util.normalize(mono)

    # Signals shorter than min_length get zeros prepended (padding on the left).
    missing = min_length - len(waveform)
    if missing > 0:
        waveform = np.pad(waveform, pad_width=(missing, 0), mode='constant')

    return waveform, sample_rate

In [None]:
def extract_cqt_features(audio, sr=22050):
    """Compute a dB-scaled constant-Q magnitude spectrogram of a signal.

    Args:
        audio: 1-D sequence of samples (its ``len`` drives the hop size).
        sr: Sampling rate forwarded to ``librosa.cqt``.

    Returns:
        The output of ``librosa.amplitude_to_db`` (referenced to the maximum
        magnitude) applied to the absolute value of an 84-bin,
        12-bins-per-octave CQT.
    """
    n_samples = len(audio)
    # The hop is 1/32 of the signal length (never below one sample), so the
    # number of frames stays roughly constant regardless of clip duration.
    # NOTE(review): librosa documents a constraint tying hop_length to the
    # number of octaves — confirm this hop is valid for the installed version.
    hop = max(1, n_samples // 32)

    # Only an empty signal can be shorter than the hop; pad it at the end.
    shortfall = hop - n_samples
    if shortfall > 0:
        audio = np.pad(audio, pad_width=(0, shortfall), mode='constant')

    magnitude = np.abs(
        librosa.cqt(audio, sr=sr, hop_length=hop, n_bins=84, bins_per_octave=12)
    )
    return librosa.amplitude_to_db(magnitude, ref=np.max)
    
def pad_cqt_features(cqt_features, max_length, pad_value=0):
    """Right-pad a 2-D feature matrix along its second axis.

    Args:
        cqt_features: 2-D array; the second axis is the one being padded.
        max_length: Target size of the second axis.
        pad_value: Constant used for the added columns. Defaults to 0, which
            matches the previous behaviour; pass a floor value (for example
            the matrix minimum) when 0 is not a neutral value for the data.

    Returns:
        A new array with ``max_length`` columns when padding is needed,
        otherwise ``cqt_features`` itself (no copy, no truncation).
    """
    missing_frames = max_length - cqt_features.shape[1]
    if missing_frames <= 0:
        return cqt_features
    return np.pad(
        cqt_features,
        ((0, 0), (0, missing_frames)),
        mode='constant',
        constant_values=pad_value,
    )

In [None]:
def prepare_dataset(folder_path, json_folder_path, sr=22050):
    """Build padded CQT features and pitch labels from a folder of WAV files.

    Each ``.wav`` file (extension matched case-insensitively) is paired with
    the JSON label whose key is the longest prefix of the file's base name.
    Files without a matching key, or whose label has no notes, are reported
    and skipped so that features and labels stay aligned.

    Args:
        folder_path: Directory containing the WAV files.
        json_folder_path: Directory containing the JSON label files.
        sr: Sampling rate used when loading audio.

    Returns:
        Tuple ``(padded_features, labels, max_length)``:
        ``padded_features`` is an array of shape
        ``(n_samples, n_bins, max_length, 1)`` (an empty 4-D array when no
        sample was collected), ``labels`` is an array with one entry per
        sample, and ``max_length`` is the largest frame count seen.
    """
    features = []
    labels = []
    max_length = 0
    pitch_labels = load_json_labels(json_folder_path)

    # os.listdir order is arbitrary; sorting keeps the sample order (and any
    # seeded split made from it) reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.lower().endswith('.wav'):
            continue
        base_filename = os.path.splitext(file_name)[0]

        # Prefer the longest matching key so that, e.g., 'note1' does not
        # claim 'note10.wav' when a 'note10' label exists.
        candidates = [key for key in pitch_labels if base_filename.startswith(key)]
        if not candidates:
            print(f"No label found for {file_name}")
            continue
        label_key = max(candidates, key=len)

        # Resolve the label before loading audio: a sample without notes
        # would otherwise put None into the label array.
        extracted_label = extract_single_label(pitch_labels[label_key])
        if extracted_label is None:
            print(f"No notes in label '{label_key}' for {file_name}; skipping")
            continue

        audio_path = os.path.join(folder_path, file_name)
        audio, sample_rate = load_and_preprocess_audio(audio_path, sr)
        if audio is None:
            # Defensive: kept from the original flow, where a missing signal
            # ended up reported as an unlabeled file.
            print(f"No label found for {file_name}")
            continue

        cqt_features = extract_cqt_features(audio, sample_rate)
        max_length = max(max_length, cqt_features.shape[1])
        features.append(cqt_features)
        labels.append(extracted_label)

    if features:
        # Bring every matrix to the same frame count, then add a channel axis.
        padded_features = np.array(
            [pad_cqt_features(feature, max_length) for feature in features]
        )
        padded_features = np.expand_dims(padded_features, -1)
    else:
        print("No features collected")
        # Previously this path left padded_features unbound and the return
        # statement raised UnboundLocalError.
        padded_features = np.empty((0, 0, 0, 1))

    if not labels:
        print("No labels collected")
    labels = np.array(labels)

    return padded_features, labels, max_length


In [None]:
def build_cnn_model(input_shape, num_classes):
    """Create and compile the convolutional classifier.

    The network stacks four identical stages (2x2 convolution with ReLU,
    2x2 max pooling, 20% dropout) with 16, 32, 64 and 128 filters, followed
    by global average pooling and a softmax output layer.

    Args:
        input_shape: Shape of one input sample, given to the first
            convolution layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    network = Sequential()

    for stage, n_filters in enumerate((16, 32, 64, 128)):
        # Only the first convolution declares the input shape.
        extra = {'input_shape': input_shape} if stage == 0 else {}
        network.add(Conv2D(n_filters, kernel_size=2, activation='relu', **extra))
        network.add(MaxPooling2D(2))
        network.add(Dropout(0.2))

    network.add(GlobalAveragePooling2D())
    network.add(Dense(num_classes, activation='softmax'))

    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Dataset locations. Built with os.path.join so the separator matches the
# host OS; the previous hard-coded backslashes only resolved on Windows.
folder_path = os.path.join('dataset', 'Processed_Train_3')
json_folder_path = os.path.join('dataset', 'Labeling')
features, labels, max_length = prepare_dataset(folder_path, json_folder_path)

In [None]:
# Map the pitch labels to consecutive class indices, then to one-hot vectors.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
categorical_labels = to_categorical(encoded_labels)

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(features, categorical_labels, test_size=0.2, random_state=42)


# Build the model.
input_shape = X_train.shape[1:]  # shape of a single sample (batch axis dropped)
num_classes = y_train.shape[1]  # number of classes
model = build_cnn_model(input_shape, num_classes)

# Train the model.
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))
model.summary()

# Evaluate the model. This runs after fit(), so the figure is the trained
# model's accuracy on the test split (it was previously labelled
# "Pre-training").
score = model.evaluate(X_test, y_test, verbose=1)
print('Test accuracy: %.4f%%' % (100 * score[1]))