In [9]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import os, csv

# YAMNet 모델 로드
model = hub.load('https://tfhub.dev/google/yamnet/1')

def load_wav_16k_mono(filename):
    """ 16kHz 모노 WAV 파일 로드 """
    file_contents = tf.io.read_file(filename)
    wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=1)
    wav = tf.squeeze(wav, axis=-1)
    sample_rate = tf.cast(sample_rate, dtype=tf.int64)
    wav = tfio.audio.resample(wav, rate_in=sample_rate, rate_out=16000)
    return wav

def get_embedding(wav_data):
    """ YAMNet을 사용하여 오디오 임베딩 생성 """
    scores, embeddings, log_mel_spectrogram = model(wav_data)
    return embeddings

# # 오디오 파일 로드 및 임베딩 생성
# audio_file = 'path_to_your_audio_file.wav'  # 실제 오디오 파일 경로로 변경해야 합니다
# wav_data = load_wav_16k_mono(audio_file)
# embedding = get_embedding(wav_data)
# print(f"Audio embedding shape: {embedding.shape}")


2024-10-07 06:42:44.933367: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_3' with dtype int32 and shape [?]
	 [[{{node inputs_3}}]]
2024-10-07 06:42:44.935773: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_1' with dtype int32 and shape [3]
	 [[{{node inputs_1}}]]
2024-10-07 06:42:44.935851: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_1' with dtype int32 and shape [3]
	 [[{{node inputs_1}}]]
2024-10-07 06:42

In [10]:
from urllib.request import urlopen

# YAMNet 클래스 이름 로드
class_map_path = "https://raw.githubusercontent.com/tensorflow/models/master/research/audioset/yamnet/yamnet_class_map.csv"
class_names = []
with urlopen(class_map_path) as f:
    reader = csv.DictReader(f.read().decode('utf-8').splitlines())
    for row in reader:
        class_names.append(row['display_name'])

In [11]:

my_classes = [
    "어른발걸음소리",
    "아이들발걸음소리",
    "망치질소리",
    "가구끄는소리",
    "문여닫는소리",
    "런닝머신에서뛰는소리",
    "골프퍼팅(골굴리는소리)",
    "화장실물내리는소리",
    "샤워할때물소리",
    "드럼세탁기소리",
    "통돌이세탁기소리",
    "진공청소기소리",
    "식기세척기소리",
    "바이올린연주소리",
    "피아노연주소리",
    "강아지짓는소리",
    "고양이우는소리"
]

test_dir = '/ai_hub_data/Training/04.balanced_cropped음원'

for cls in my_classes:
    files = os.listdir(os.path.join(test_dir, cls))
    print(f"Class: {cls}")
    counter = 0
    for f in files:
        counter += 1
        audio_file = os.path.join(test_dir, cls, f)
        wav_data = load_wav_16k_mono(audio_file)
        class_scores, embeddings, _ = model(wav_data)
        # 상위 5개 예측 클래스 출력
        top_classes = tf.argsort(class_scores, direction='DESCENDING')[0][:5]
        for i in top_classes:
            print(f"{counter}.{class_names[i]}: {class_scores[0][i]:.3f}")

Class: 어른발걸음소리


2024-10-07 06:42:57.100639: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'waveform' with dtype float and shape [?]
	 [[{{node waveform}}]]


1.Scrape: 0.448
1.Tearing: 0.383
1.Shuffling cards: 0.361
1.Crumpling, crinkling: 0.248
1.Scratch: 0.223
2.Inside, small room: 0.224
2.Speech: 0.152
2.Inside, large room or hall: 0.038
2.Door: 0.023
2.Wood: 0.017
3.Hands: 0.542
3.Slap, smack: 0.443
3.Bouncing: 0.412
3.Inside, small room: 0.340
3.Door: 0.211




4.Inside, small room: 0.378
4.Wood: 0.120
4.Shuffle: 0.069
4.Inside, large room or hall: 0.063
4.Zipper (clothing): 0.039




5.Typewriter: 0.942
5.Typing: 0.608
5.Rattle (instrument): 0.127
5.Inside, small room: 0.088
5.Maraca: 0.074
6.Cap gun: 0.793
6.Gunshot, gunfire: 0.652
6.Explosion: 0.442
6.Wood: 0.094
6.Arrow: 0.048
7.Tap: 0.311
7.Ping: 0.309
7.Door: 0.224
7.Shuffle: 0.095
7.Inside, large room or hall: 0.069
8.Knock: 0.757
8.Thunk: 0.635
8.Bouncing: 0.548
8.Door: 0.418
8.Thump, thud: 0.381
9.Hands: 0.714
9.Finger snapping: 0.685
9.Slap, smack: 0.202
9.Scratch: 0.121
9.Inside, small room: 0.118
10.Shuffle: 0.148
10.Inside, small room: 0.107
10.Hands: 0.068
10.Explosion: 0.061
10.Cap gun: 0.047
11.Inside, small room: 0.451
11.Domestic animals, pets: 0.167
11.Animal: 0.146
11.Dog: 0.120
11.Door: 0.112
12.Music: 0.876
12.Percussion: 0.760
12.Drum: 0.699
12.Musical instrument: 0.581
12.Snare drum: 0.447
13.Hands: 0.333
13.Whack, thwack: 0.324
13.Cap gun: 0.241
13.Slap, smack: 0.175
13.Wood: 0.122
14.Inside, small room: 0.296
14.Hands: 0.296
14.Ping: 0.195
14.Bouncing: 0.177
14.Slap, smack: 0.170
15.Inside,

KeyboardInterrupt: 