2. wav -> mel Spectrogram 변환

wav 데이터를 fold1부터 fold10까지 한 번에 한 fold씩(target_fold 값을 바꿔 가며 실행) mel Spectrogram 이미지로 변환하는 코드

In [None]:
import os
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Setting: change only this fold number (1-10) and re-run the cell per fold.
target_fold = 10  # <- edit this value before each run

# Input/output locations on the mounted Google Drive.
metadata_path = "/content/drive/MyDrive/UrbanSound8K/metadata/UrbanSound8K.csv"
audio_base_path = "/content/drive/MyDrive/UrbanSound8K/audio"
save_base_path = "/content/drive/MyDrive/UrbanSound8K/images"

# Load the metadata table.
df = pd.read_csv(metadata_path)

# Keep only the rows that belong to the selected fold.
fold_df = df[df["fold"] == target_fold]

# Make sure the per-fold output directory exists.
save_folder = os.path.join(save_base_path, f"fold{target_fold}")
os.makedirs(save_folder, exist_ok=True)

# Render one mel-spectrogram PNG per audio clip of the fold.
for file_name in fold_df["slice_file_name"]:
    audio_path = os.path.join(audio_base_path, f"fold{target_fold}", file_name)
    # splitext swaps only the trailing extension; str.replace would rewrite
    # every ".wav" occurrence inside the name.
    save_path = os.path.join(save_folder, os.path.splitext(file_name)[0] + ".png")

    try:
        y, sr = librosa.load(audio_path, sr=None)
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        S_DB = librosa.power_to_db(S, ref=np.max)

        fig = plt.figure(figsize=(3, 3))
        try:
            librosa.display.specshow(S_DB, sr=sr, x_axis=None, y_axis=None)
            plt.axis("off")
            plt.savefig(save_path, bbox_inches='tight', pad_inches=0)
        finally:
            # Close the figure even when drawing/saving fails, so failed clips
            # do not accumulate open figures over the whole fold.
            plt.close(fig)

    except Exception as e:
        # Best effort: report the failing clip and continue with the next one.
        print(f"❌ 오류 발생: {file_name} - {e}")


**3.클래스 라벨링 필터링**

클래스 3개 이상 만들기: 사이렌, 경적, 드릴링

In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/UrbanSound8K/metadata/UrbanSound8K.csv")

# Split the metadata into siren clips and everything else.
siren_df = df[df['class']=='siren']
non_siren_df = df[df['class'] !='siren']

# Balance the two groups by drawing as many non-siren clips as there are siren
# clips. The fixed seed makes the drawn subset identical on every run.
final_df = pd.concat([siren_df, non_siren_df.sample(n=len(siren_df), random_state=42)])

In [None]:
import pandas as pd

df = pd.read_csv(r"/content/drive/MyDrive/UrbanSound8K/metadata/UrbanSound8K.csv")

# Select sirens by class name. Filtering on classID == 10 matches no rows
# (UrbanSound8K class IDs run 0-9), which would label every clip 0.
siren_df = df[df['class'] == 'siren'].copy()
siren_df['binary_label'] = 1

non_siren_df = df[df['class'] != 'siren'].copy()
non_siren_df['binary_label'] = 0

# Data frame for binary classification: siren rows first, then the rest.
final_df = pd.concat([siren_df, non_siren_df], ignore_index=True)


In [None]:
# Preview the first rows of the labeled data frame.
print(final_df.head())


      slice_file_name    fsID  start        end  salience  fold  classID  \
0    100032-3-0-0.wav  100032    0.0   0.317551         1     5        3   
1  100263-2-0-117.wav  100263   58.5  62.500000         1     5        2   
2  100263-2-0-121.wav  100263   60.5  64.500000         1     5        2   
3  100263-2-0-126.wav  100263   63.0  67.000000         1     5        2   
4  100263-2-0-137.wav  100263   68.5  72.500000         1     5        2   

              class  binary_label  
0          dog_bark             0  
1  children_playing             0  
2  children_playing             0  
3  children_playing             0  
4  children_playing             0  


In [None]:
# Count how many rows carry each binary label.
print(final_df['binary_label'].value_counts())

binary_label
0    8732
Name: count, dtype: int64


In [None]:
# Show the first rows labeled as positive (binary_label == 1).
print(final_df[final_df['binary_label'] == 1].head())


Empty DataFrame
Columns: [slice_file_name, fsID, start, end, salience, fold, classID, class, binary_label]
Index: []


In [None]:
# List the column names of the metadata table.
print(df.columns)


Index(['slice_file_name', 'fsID', 'start', 'end', 'salience', 'fold',
       'classID', 'class'],
      dtype='object')


In [None]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/UrbanSound8K/metadata/UrbanSound8K.csv")
# Check which class names exist before filtering on one of them.
print(df['class'].unique())

# Positive rows: every clip whose class name is 'siren'.
siren_df = df.loc[df['class'].eq('siren')].assign(binary_label=1)

# Negative rows: all remaining clips.
non_siren_df = df.loc[df['class'].ne('siren')].assign(binary_label=0)

# Stack positives on top of negatives with a fresh 0..n-1 index and report
# the resulting label balance.
final_df = pd.concat([siren_df, non_siren_df], ignore_index=True)
label_counts = final_df['binary_label'].value_counts()
print(label_counts)


['dog_bark' 'children_playing' 'car_horn' 'air_conditioner' 'street_music'
 'gun_shot' 'siren' 'engine_idling' 'jackhammer' 'drilling']
binary_label
0    7803
1     929
Name: count, dtype: int64


4.PyTorch Dataset 구성

In [None]:
from torch.utils.data import Dataset

class SpectrogramDataset(Dataset):
    """Dataset of (log-mel spectrogram tensor, siren label) pairs.

    Args:
        dataframe: Metadata table. Item access reads its ``fold``,
            ``slice_file_name`` and ``classID`` columns.
        base_path: Directory containing the ``fold<N>`` audio sub-folders.
        transform: Optional callable applied to the spectrogram tensor before
            it is returned.
    """

    # classID of "siren" in the UrbanSound8K metadata (class IDs run 0-9).
    SIREN_CLASS_ID = 8

    def __init__(self, dataframe, base_path, transform=None):
        # Re-number rows 0..n-1 so positional access matches the index.
        self.df = dataframe.reset_index(drop=True)
        self.base_path = base_path
        self.transform = transform

    def __len__(self):
        """Return the number of clips in the metadata table."""
        return len(self.df)

    def _audio_path(self, row):
        """Return ``<base_path>/fold<N>/<slice_file_name>`` for *row*."""
        # int() keeps the folder name "fold5" even if the value arrives as 5.0.
        return f"{self.base_path}/fold{int(row['fold'])}/{row['slice_file_name']}"

    def _label(self, row):
        """Return 1 when *row* is a siren clip, otherwise 0."""
        return int(row['classID'] == self.SIREN_CLASS_ID)

    def __getitem__(self, idx):
        """Load clip *idx* and return ``(mel_tensor, label)``.

        ``mel_tensor`` is the dB-scaled mel spectrogram (128 mel bands) with a
        leading channel axis added by ``unsqueeze(0)``.
        """
        row = self.df.iloc[idx]
        y, sr = librosa.load(self._audio_path(row), sr=None)
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        mel_tensor = torch.tensor(mel_db).unsqueeze(0).float()
        if self.transform is not None:
            mel_tensor = self.transform(mel_tensor)
        return mel_tensor, self._label(row)

In [None]:
from torch.utils.data import Dataset
import torch
import librosa
import numpy as np
import os

class SpectrogramDataset(Dataset):
    """Serve audio clips as log-mel spectrogram tensors with a binary label.

    Args:
        dataframe: Metadata table; the ``fold``, ``slice_file_name`` and
            ``classID`` columns are read when an item is requested.
        base_path: Root directory holding one ``fold<N>`` folder per fold.
        transform: Optional callable; when given, it receives the spectrogram
            tensor and its return value is handed out instead.
    """

    # "siren" has classID 8 in UrbanSound8K; the dataset's IDs span 0-9.
    SIREN_CLASS_ID = 8

    def __init__(self, dataframe, base_path, transform=None):
        # Drop the incoming index so that iloc positions and labels coincide.
        self.df = dataframe.reset_index(drop=True)
        self.base_path = base_path
        self.transform = transform

    def __len__(self):
        """Return how many rows (clips) the dataset holds."""
        return len(self.df)

    def _audio_path(self, row):
        """Build the audio file path for one metadata row."""
        # Coerce to int so a float-typed fold value cannot yield "fold5.0".
        fold = f"fold{int(row['fold'])}"
        return os.path.join(self.base_path, fold, row['slice_file_name'])

    def _label(self, row):
        """Return 1 for a siren row and 0 for any other class."""
        return int(row['classID'] == self.SIREN_CLASS_ID)

    def __getitem__(self, idx):
        """Return ``(mel_tensor, label)`` for the clip at position *idx*."""
        row = self.df.iloc[idx]

        y, sr = librosa.load(self._audio_path(row), sr=None)
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        mel_tensor = torch.tensor(mel_db).unsqueeze(0).float()  # shape: (1, 128, time)

        # The transform was previously stored but never used; apply it here.
        if self.transform is not None:
            mel_tensor = self.transform(mel_tensor)

        return mel_tensor, self._label(row)


5.모델 학습

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

# Build an untrained ResNet-18. `weights=None` is the current torchvision
# spelling of the deprecated `pretrained=False` and matches the reload cell.
model = models.resnet18(weights=None)
# Swap the first convolution for one that accepts a single input channel.
model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
# Replace the classification head with a single output unit.
model.fc = nn.Linear(model.fc.in_features, 1)



In [None]:
# Print the module hierarchy of the network.
print(model)

ResNet(
  (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

6. 모델 저장

In [None]:
# Save only the model's state_dict (not the whole model object) to Google Drive.
torch.save(model.state_dict(), "/content/drive/MyDrive/resnet_siren_classifier.pth")

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
from google.colab import drive

drive.mount('/content/drive')

# Rebuild the same architecture so the saved parameter names and shapes match.
model = models.resnet18(weights=None)
model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
model.fc = nn.Linear(model.fc.in_features, 1)

# Load the saved weights from the Drive path the checkpoint was written to; a
# bare file name would be resolved against the current working directory.
# map_location="cpu" lets the checkpoint load on a runtime without a GPU.
model.load_state_dict(
    torch.load("/content/drive/MyDrive/resnet_siren_classifier.pth", map_location="cpu")
)
model.eval()  # switch to inference mode


In [None]:
import torch

# Load the saved model parameters.
state_dict = torch.load("/content/drive/MyDrive/resnet_siren_classifier.pth")

# Inspect the key list (the names of the saved parameters and buffers).
print(state_dict.keys())


odict_keys(['conv1.weight', 'bn1.weight', 'bn1.bias', 'bn1.running_mean', 'bn1.running_var', 'bn1.num_batches_tracked', 'layer1.0.conv1.weight', 'layer1.0.bn1.weight', 'layer1.0.bn1.bias', 'layer1.0.bn1.running_mean', 'layer1.0.bn1.running_var', 'layer1.0.bn1.num_batches_tracked', 'layer1.0.conv2.weight', 'layer1.0.bn2.weight', 'layer1.0.bn2.bias', 'layer1.0.bn2.running_mean', 'layer1.0.bn2.running_var', 'layer1.0.bn2.num_batches_tracked', 'layer1.1.conv1.weight', 'layer1.1.bn1.weight', 'layer1.1.bn1.bias', 'layer1.1.bn1.running_mean', 'layer1.1.bn1.running_var', 'layer1.1.bn1.num_batches_tracked', 'layer1.1.conv2.weight', 'layer1.1.bn2.weight', 'layer1.1.bn2.bias', 'layer1.1.bn2.running_mean', 'layer1.1.bn2.running_var', 'layer1.1.bn2.num_batches_tracked', 'layer2.0.conv1.weight', 'layer2.0.bn1.weight', 'layer2.0.bn1.bias', 'layer2.0.bn1.running_mean', 'layer2.0.bn1.running_var', 'layer2.0.bn1.num_batches_tracked', 'layer2.0.conv2.weight', 'layer2.0.bn2.weight', 'layer2.0.bn2.bias', '

In [None]:
print(state_dict['fc.weight'])  # weights of the final classifier layer


tensor([[ 0.0028,  0.0218,  0.0332, -0.0162,  0.0062,  0.0027, -0.0121,  0.0164,
          0.0312, -0.0044, -0.0294,  0.0063, -0.0008,  0.0404,  0.0004, -0.0357,
         -0.0439,  0.0331, -0.0352, -0.0056, -0.0091, -0.0038, -0.0125,  0.0002,
          0.0160, -0.0128,  0.0275, -0.0016,  0.0201, -0.0388,  0.0268, -0.0404,
         -0.0010, -0.0201, -0.0168,  0.0165, -0.0347,  0.0053, -0.0010, -0.0385,
          0.0160, -0.0142,  0.0280, -0.0087,  0.0254,  0.0340, -0.0193, -0.0432,
         -0.0042,  0.0002, -0.0416,  0.0134, -0.0390, -0.0201,  0.0233, -0.0146,
         -0.0008, -0.0182, -0.0212,  0.0120, -0.0207, -0.0081, -0.0253, -0.0344,
         -0.0200, -0.0283, -0.0301,  0.0420, -0.0233,  0.0065, -0.0369, -0.0151,
          0.0218,  0.0377, -0.0326, -0.0004,  0.0204, -0.0328,  0.0163,  0.0162,
          0.0005,  0.0071,  0.0135,  0.0197,  0.0045,  0.0265, -0.0280,  0.0141,
         -0.0189,  0.0083, -0.0331,  0.0095, -0.0155, -0.0144, -0.0187,  0.0333,
          0.0175,  0.0189,  