In [2]:
import sys
sys.path.append('/app')

import os

import tensorflow as tf
# import tensorflow_hub as hub
import tensorflow_io as tfio

from model import yamnet_params
from model import yamnet as yamnet_model

2024-10-07 05:29:37.070402: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-07 05:29:37.121609: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
print("GPU 사용 가능:", tf.test.is_built_with_cuda())
print("사용 가능한 GPU:", tf.config.list_physical_devices('GPU'))
print(tf.__version__)

GPU 사용 가능: True
사용 가능한 GPU: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
2.12.0


In [4]:
# GPU 1번 사용
tf.config.set_visible_devices([tf.config.list_physical_devices('GPU')[0]], 'GPU')


In [5]:
hub_dir = '/ai_hub_data/Training/01.원천데이터'
# dirs = os.listdir(hub_dir)
# my_classes = set()
# for folder_name in dirs:
#     if folder_name.split('_')[1] == 'A.층간소음':
#         my_classes.add(folder_name.split('.')[-1])
# for my_class in my_classes:
#     print(my_class)
    
my_classes = [
    "어른발걸음소리",
    "아이들발걸음소리",
    "망치질소리",
    "가구끄는소리",
    "문여닫는소리",
    "런닝머신에서뛰는소리",
    "골프퍼팅(골굴리는소리)",
    "화장실물내리는소리",
    "샤워할때물소리",
    "드럼세탁기소리",
    "통돌이세탁기소리",
    "진공청소기소리",
    "식기세척기소리",
    "바이올린연주소리",
    "피아노연주소리",
    "강아지짓는소리",
    "고양이우는소리"
]


## 목표
```
Inputlayer(audio) --> KerasLayer(yamnet) --> Sequential(my_model) --> ReduceMenaLayer(classifier)
```

In [23]:

# Make my model
map_class_to_id = {name: index for index, name in enumerate(my_classes)}


my_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1024), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(len(my_classes))
], name='home_noise_classifier')

my_model.summary()


Model: "home_noise_classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dropout_1 (Dropout)         (None, 1024)              0         
                                                                 
 dense_7 (Dense)             (None, 128)               131200    
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_8 (Dense)             (None, 17)                2193      
                                                                 
Total params: 133,393
Trainable params: 133,393
Non-trainable params: 0
_________________________________________________________________


In [24]:

my_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 optimizer="adam",
                 metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                            patience=10,
                                            restore_best_weights=True)

In [19]:
# Load yamnet
params = yamnet_params.Params(sample_rate=16000, patch_hop_seconds=0.1)
yamnet = yamnet_model.yamnet_frames_model(params)
yamnet.load_weights('/app/model/yamnet.h5')


In [20]:
# Utility functions for loading audio files and making sure the sample rate is correct.

@tf.function
def load_wav_16k_mono(filename):
    try:
        file_contents = tf.io.read_file(filename)
        wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
        wav = tf.squeeze(wav, axis=-1)
        sample_rate = tf.cast(sample_rate, dtype=tf.int64)
        wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    except Exception as e:
        print(f"파일을 읽는 중 오류 발생: {filename} - {str(e)}")
        return None
    return wav

def load_wav_for_map(filename, label):
    return load_wav_16k_mono(filename), label

# applies the embedding extraction model to a wav data
def extract_embedding(wav_data, label):
  ''' run YAMNet to extract embedding from the wav data '''
  scores, embeddings, spectrogram = yamnet(wav_data)
  num_embeddings = tf.shape(embeddings)[0]
  return (embeddings, tf.repeat(label, num_embeddings))


In [13]:
# Train dataset 생성
train_dir = '/ai_hub_data/Training/04.balanced_cropped음원'

labels = []
filenames = []
counter = 0
for folder in os.listdir(train_dir):
    for file in os.listdir(os.path.join(train_dir, folder)):
        if file.split('.')[-1] == 'wav':
            counter += 1
            labels.append(map_class_to_id[folder])
            filenames.append(os.path.join(train_dir, folder, file))
print(counter)

filenames_tensor = tf.constant(filenames)
labels_tensor = tf.constant(labels)

# 데이터 로드 및 전처리
train_ds = tf.data.Dataset.from_tensor_slices((filenames_tensor, labels_tensor))
print(train_ds.element_spec)
train_ds = train_ds.map(load_wav_for_map)

# None 값 필터링
train_ds = train_ds.filter(lambda x, y: x is not None)

print(train_ds.element_spec)

# extract embedding with yamnet
train_ds = train_ds.map(extract_embedding).unbatch()

# 데이터셋 섞기 및 배치 설정
BATCH_SIZE = 32
train_ds = train_ds.shuffle(buffer_size=100).batch(BATCH_SIZE)

# 데이터셋 캐싱 및 프리페치
train_ds = train_ds.cache().prefetch(tf.data.AUTOTUNE)

print("훈련 데이터셋 생성 완료")
print(train_ds.element_spec)

# # 데이터셋 크기 확인
# dataset_size = tf.data.experimental.cardinality(train_ds).numpy()
# print(f"데이터셋 크기: {dataset_size}")


15300
(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))
(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None))


2024-10-07 05:57:24.073648: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'args_0' with dtype string
	 [[{{node args_0}}]]


훈련 데이터셋 생성 완료
(TensorSpec(shape=(None, 1024), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))


In [28]:
# Validation dataset 생성
# val_dir = '/ai_hub_data/Validation/01.원천데이터'
val_dir = '/ai_hub_data/Validation/04.balanced_cropped음원'

labels = []
filenames = []
counter = 0
for folder in os.listdir(val_dir):
    for file in os.listdir(os.path.join(val_dir, folder)):
        if file.split('.')[-1] == 'wav':
            counter += 1
            labels.append(map_class_to_id[folder])
            filenames.append(os.path.join(val_dir, folder, file))
print(counter)

filenames_tensor = tf.constant(filenames)
labels_tensor = tf.constant(labels)

# 데이터 로드 및 전처리
val_ds = tf.data.Dataset.from_tensor_slices((filenames_tensor, labels_tensor))
val_ds = val_ds.map(load_wav_for_map)

# None 값 필터링
val_ds = val_ds.filter(lambda x, y: x is not None)

# extract embedding with yamnet
val_ds = val_ds.map(extract_embedding).unbatch()

# 데이터셋 섞기
BATCH_SIZE = 4
val_ds = val_ds.shuffle(buffer_size=100)

# small_train 셋 나누기, 배치 설정
train_size = 1500
small_train_ds = val_ds.take(train_size)
val_ds = val_ds.skip(train_size)

# 데이터셋 캐싱 및 프리페치
small_train_ds = small_train_ds.cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

print("검증 데이터셋 생성 완료")
print(val_ds.element_spec)

1700


2024-10-07 06:12:19.626268: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'args_0' with dtype string
	 [[{{node args_0}}]]


검증 데이터셋 생성 완료
(TensorSpec(shape=(None, 1024), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))


In [None]:
# 데이터셋의 형태 확인
for waveforms, labels in val_ds.take(1):
    print("파형 형태:", waveforms.shape)
    print("레이블 형태:", labels.shape)
       
print("모델 입력 형태:", my_model.input_shape)

In [29]:
# Train my_model
my_model.fit(small_train_ds, epochs=100, validation_data=val_ds, callbacks=[callback])
# my_model.fit(val_ds, epochs=10, callbacks=[callback]) # 데이터가 너무 많아서 val 셋으로 train하기
my_model.save('model/bs_home_noise_model_new1.h5')




Epoch 1/100


2024-10-07 06:12:26.481980: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_99' with dtype resource
	 [[{{node Placeholder/_99}}]]
2024-10-07 06:12:26.484797: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_85' with dtype resource
	 [[{{node Placeholder/_85}}]]


    375/Unknown - 5s 11ms/step - loss: 0.1956 - accuracy: 0.9480

2024-10-07 06:12:31.636145: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_89' with dtype resource
	 [[{{node Placeholder/_89}}]]
2024-10-07 06:12:31.638890: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_66' with dtype resource
	 [[{{node Placeholder/_66}}]]


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


In [34]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# seaborn 한글 폰트 설정
sns.set_theme(font='NanumGothic')

# my_model = tf.saved_model.load('/app/model/bs_home_noise_model.h5')
my_model = tf.keras.models.load_model('model/bs_home_noise_model_new1.h5')


# # 검증 데이터셋으로 모델 평가
# loss, accuracy = my_model.evaluate(val_ds)

# print("검증 손실:", loss)
# print("검증 정확도:", accuracy)
# 클래스별 정확도 계산
y_pred = my_model.predict(val_ds)
y_true = np.concatenate([y for x, y in val_ds], axis=0)

class_accuracies = {}
for i, class_name in enumerate(my_classes):
    class_mask = y_true == i
    class_accuracy = accuracy_score(y_true[class_mask], y_pred.argmax(axis=1)[class_mask])
    class_accuracies[class_name] = class_accuracy
    print(f"{class_name} 정확도: {class_accuracy:.4f}")

# 혼동 행렬 생성 및 시각화
cm = confusion_matrix(y_true, y_pred.argmax(axis=1))
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=my_classes, yticklabels=my_classes)
plt.title('Confusion Matrix')
plt.xlabel('예측 클래스')
plt.ylabel('실제 클래스')
plt.show()


2024-10-07 06:31:42.139559: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_3' with dtype int32 and shape [?]
	 [[{{node inputs_3}}]]
2024-10-07 06:31:42.142041: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_1' with dtype int32 and shape [3]
	 [[{{node inputs_1}}]]
2024-10-07 06:31:42.142132: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_1' with dtype int32 and shape [3]
	 [[{{node inputs_1}}]]
2024-10-07 06:31

ValueError: When input_signature is provided, all inputs to the Python function must be convertible to tensors:
  inputs: (
    <_PrefetchDataset element_spec=(TensorSpec(shape=(None, 1024), dtype=tf.float32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>)
  input_signature: (
    TensorSpec(shape=(None,), dtype=tf.float32, name=None)).

In [None]:
import librosa

# Check if the model is overfitting
loss, accuracy = my_model.evaluate(val_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

# Test
# import soundfile as sf


wav_file_name = '/ai_hub_data/Training/03.cropped음원/진공청소기소리/N-10_220915_A_3_e_12499_0.wav'
# wav_data, sr = sf.read(wav_file_name, dtype=np.int16)
# waveform = wav_data / 32768.0

waveform = librosa.load(wav_file_name, sr=16000, mono=True)[0]

print(waveform.shape)
scores, embeddings, spectrogram = yamnet(waveform)
print(embeddings.shape)
print(scores.shape)
result = my_model(embeddings).numpy()
for i in range(result.shape[0]):
    inferred_class = my_classes[result[i].argmax()]
print(result.shape)
print(result)
inferred_class = my_classes[result.mean(axis=0).argmax()]
print(f'The main sound is: {inferred_class}')
