# preprocessing

In [None]:
# 用于音频分析与特征提取
import librosa
# 命令行解析参数
import argparse

import pandas as pd
import numpy as np

# 于序列化与反序列化对象，提取好的音频特征保存成 .pkl 文件
import pickle as pkl 

import torch
import torchaudio
import torchvision

#打开、编辑、转换图像数据
from PIL import Image

In [None]:
def extract_spectrogram(values, clip, entries, sampling_rate=44100):
    """Append one multi-scale log-Mel feature record per metadata entry.

    The clip is converted to three log-Mel spectrograms, one per
    (window, hop) pair, each resized to 128 x 250 and stacked along a
    leading axis. For a mono 1-D clip the stacked array is therefore
    (3, 128, 250).

    Args:
        values: List that receives the new records (mutated in place).
        clip: Audio waveform; anything accepted by ``torch.Tensor``.
        entries: Iterable of metadata dicts for this clip; each must have a
            ``"target"`` key.
        sampling_rate: Sample rate of ``clip`` in Hz.

    Returns:
        None. Each appended record is a dict with keys ``"audio"`` (the
        waveform as a NumPy array), ``"values"`` (the stacked spectrograms)
        and ``"target"`` (copied from the entry).
    """
    entries = list(entries)
    if not entries:
        # Nothing to label, so skip the (expensive) spectrogram computation.
        return

    window_sizes = [25, 50, 100]  # analysis window sizes, in milliseconds
    hop_sizes = [10, 25, 50]  # hop sizes, in milliseconds
    eps = 1e-6  # keeps log() finite on silent bins

    window_lengths = [int(round(ms * sampling_rate / 1000)) for ms in window_sizes]
    hop_lengths = [int(round(ms * sampling_rate / 1000)) for ms in hop_sizes]

    # The STFT requires win_length <= n_fft. 4410 is the historical value
    # (100 ms at 44.1 kHz); only grow it when a higher sample rate makes the
    # longest window exceed it, so existing outputs are unchanged.
    n_fft = max(4410, max(window_lengths))

    # Convert the waveform once; it is identical for every scale and entry.
    clip = torch.Tensor(clip)
    resize = torchvision.transforms.Resize((128, 250))

    specs = []
    for window_length, hop_length in zip(window_lengths, hop_lengths):
        mel = torchaudio.transforms.MelSpectrogram(
            sample_rate=sampling_rate,
            n_fft=n_fft,
            win_length=window_length,
            hop_length=hop_length,
            n_mels=128)(clip)
        log_mel = np.log(mel.numpy() + eps)
        # Round-trip through a PIL image so torchvision can resize the array.
        specs.append(np.asarray(resize(Image.fromarray(log_mel))))

    stacked = np.array(specs)
    audio = clip.numpy()

    for data in entries:
        # Copy so that records never alias each other's arrays.
        values.append({
            "audio": audio.copy(),
            "values": stacked.copy(),
            "target": data["target"],
        })

In [None]:
def extract_features(audios,sampling_rate=44100, data_dir="/Users/panzeqi/Documents/MonashUni/For_Research/ESC-50-master/audio"):
    """Load every distinct audio file listed in ``audios`` and extract features.

    Args:
        audios: DataFrame with at least a ``filename`` column; the rows for a
            file are passed on as metadata records.
        sampling_rate: Rate in Hz that each file is resampled to on load. It
            is also forwarded to ``extract_spectrogram`` so the window and hop
            lengths are computed for the rate the audio actually has.
        data_dir: Directory that contains the audio files.

    Returns:
        List of feature records produced by ``extract_spectrogram``.
    """
    values = []

    # Iterate over distinct file names so a file is decoded only once.
    for audio in audios["filename"].unique():
        clip, _ = librosa.load("{}/{}".format(data_dir, audio), sr=sampling_rate)

        # All metadata rows that refer to this file, as a list of dicts.
        entries = audios.loc[audios["filename"] == audio].to_dict(orient="records")

        # Pass the sample rate explicitly; relying on the callee's default
        # would mis-scale the spectrogram windows for any non-default rate.
        extract_spectrogram(values, clip, entries, sampling_rate=sampling_rate)

        print("Finished audio {}".format(audio))

    return values

In [None]:
# Directory holding the ESC-50 audio clips and the sample rate used for them.
# NOTE(review): neither value is passed to extract_features in this notebook;
# the calls rely on that function's defaults, which currently match.
data_dir = '/Users/panzeqi/Documents/MonashUni/For_Research/ESC-50-master/audio'
sampling_rate = 44100

# Prefix for the pickled feature files and location of the metadata CSV.
# NOTE(review): these are absolute, machine-specific paths.
store_dir = '/Users/panzeqi/Documents/MonashUni/For_Research/ESC-50-master/img_data'
csv_file = '/Users/panzeqi/Documents/MonashUni/For_Research/ESC-50-master/meta/esc50.csv'

In [47]:
# Read the metadata table, ignoring spaces that follow the delimiters.
audios = pd.read_csv(csv_file, skipinitialspace=True)

# Fold 1 is held out for validation; every other fold is used for training.
held_out = audios["fold"] == 1
training_audios = audios.loc[~held_out]
validation_audios = audios.loc[held_out]

# NOTE(review): the file name is appended to store_dir with no path
# separator, so a store_dir without a trailing slash produces
# "<store_dir>training128mel1.pkl" -- confirm this is intended.
training_values = extract_features(training_audios)
training_pkl = "{}training128mel{}.pkl".format(store_dir, 1)
with open(training_pkl, "wb") as handler:
    pkl.dump(training_values, handler, protocol=pkl.HIGHEST_PROTOCOL)

validation_values = extract_features(validation_audios)
validation_pkl = "{}validation128mel{}.pkl".format(store_dir, 1)
with open(validation_pkl, "wb") as handler:
    pkl.dump(validation_values, handler, protocol=pkl.HIGHEST_PROTOCOL)

Finished audio 2-100648-A-43.wav
Finished audio 2-100786-A-1.wav
Finished audio 2-101676-A-10.wav
Finished audio 2-85434-A-27.wav
Finished audio 2-85471-A-34.wav
Finished audio 2-85945-A-18.wav
Finished audio 2-86160-A-27.wav
Finished audio 2-87282-A-34.wav
Finished audio 2-87412-A-24.wav
Finished audio 2-87780-A-33.wav
Finished audio 2-87781-A-10.wav
Finished audio 2-87794-A-24.wav
Finished audio 2-87795-A-24.wav
Finished audio 2-87799-A-24.wav
Finished audio 2-88724-A-38.wav
Finished audio 2-89516-A-37.wav
Finished audio 2-91912-A-33.wav
Finished audio 2-91912-B-33.wav
Finished audio 2-92627-A-27.wav
Finished audio 2-92978-A-29.wav
Finished audio 2-93030-A-21.wav
Finished audio 2-94230-A-27.wav
Finished audio 2-94807-A-29.wav
Finished audio 2-95035-A-1.wav
Finished audio 2-95258-A-1.wav
Finished audio 2-95258-B-1.wav
Finished audio 2-95567-A-23.wav
Finished audio 2-96033-A-13.wav
Finished audio 2-96063-A-37.wav
Finished audio 2-96460-A-1.wav
Finished audio 2-96654-A-47.wav
Finished a

# 准备data loader 方法

In [49]:
from torch.utils.data import *
import lmdb
import torchvision
import pandas as pd
import numpy as np
import pickle
import torch
from PIL import Image

class AudioDataset(Dataset):
    """Dataset backed by a pickled sequence of feature records.

    Every record must provide ``"values"`` (an array that can be reshaped to
    ``(-1, 128, length)``) and ``"target"`` (an integer class label).
    """

    def __init__(self, pkl_dir, dataset_name, transforms=None):
        """Load all records from ``pkl_dir`` into memory.

        Args:
            pkl_dir: Path of the pickle file to read.
            dataset_name: ``"GTZAN"`` selects a last-axis size of 1500; any
                other name selects 250.
            transforms: Optional callable applied to each value tensor.
        """
        if dataset_name == "GTZAN":
            self.length = 1500
        else:
            self.length = 250
        self.transforms = transforms
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(pkl_dir, "rb") as pickle_file:
            self.data = pickle.load(pickle_file)

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(values, target)`` tensors for the record at ``idx``."""
        record = self.data[idx]
        spectrogram = torch.Tensor(record["values"].reshape(-1, 128, self.length))
        if self.transforms:
            spectrogram = self.transforms(spectrogram)
        # One-element tensor; the training loop squeezes it after batching.
        label = torch.LongTensor([record["target"]])
        return (spectrogram, label)

In [50]:
def fetch_dataloader(pkl_dir, dataset_name, batch_size, num_workers):
    """Build a shuffling DataLoader over the pickled features at ``pkl_dir``.

    Args:
        pkl_dir: Path of the pickle file handed to ``AudioDataset``.
        dataset_name: Dataset identifier handed to ``AudioDataset``.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes.

    Returns:
        A ``DataLoader`` with ``shuffle=True`` and no transforms.
    """
    return DataLoader(
        AudioDataset(pkl_dir, dataset_name),
        shuffle=True,
        batch_size=batch_size,
        num_workers=num_workers,
    )

# 训练方法

In [51]:
import torch
import torchvision
import torch.nn as nn
import numpy as np
import json
import utils
import validate
import argparse
import models.densenet
import models.resnet
import models.inception
import time
import dataloaders.datasetaug
import dataloaders.datasetnormal

from tqdm import tqdm
from tensorboardX import SummaryWriter

In [52]:
# Location of the JSON training configuration file.
# NOTE(review): absolute, machine-specific path.
config_path = '/Users/panzeqi/Documents/MonashUni/For_Research/Audio-Classification/config/esc_densenet.json'

In [53]:
def train(model, device, data_loader, optimizer, loss_fn):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Network to train; switched to training mode first.
        device: Device that each batch is moved to.
        data_loader: Iterable of batches where item 0 holds the inputs and
            item 1 the targets with a singleton dimension at index 1.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable mapping ``(outputs, target)`` to a scalar loss.

    Returns:
        The value of the ``utils.RunningAverage`` tracker after the last
        batch (presumably the mean batch loss -- confirm in ``utils``).
    """
    model.train()
    running_loss = utils.RunningAverage()

    with tqdm(total=len(data_loader)) as progress:
        for batch in data_loader:
            inputs = batch[0].to(device)
            # Drop the singleton dimension so targets are 1-D class indices.
            target = batch[1].squeeze(1).to(device)

            loss = loss_fn(model(inputs), target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss.update(loss.item())

            progress.set_postfix(loss='{:05.3f}'.format(running_loss()))
            progress.update()
    return running_loss()

In [54]:
def train_and_evaluate(model, device, train_loader, val_loader, optimizer, loss_fn, writer, params, split, scheduler=None):
    """Train for ``params.epochs`` epochs, validating and checkpointing each one.

    Args:
        model: Network to optimise.
        device: Device used for training and evaluation.
        train_loader: Loader passed to ``train``.
        val_loader: Loader passed to ``validate.evaluate``.
        optimizer: Optimizer whose state is stored in every checkpoint.
        loss_fn: Training loss callable.
        writer: Summary writer receiving per-epoch scalars. It is closed
            before this function returns.
        params: Object exposing ``epochs``, ``checkpoint_dir`` and
            ``dataset_name``.
        split: Identifier of the current split; used in checkpoint and
            scalar names.
        scheduler: Optional learning-rate scheduler stepped once per epoch.

    Returns:
        None.
    """
    best_acc = 0.0

    for epoch in range(params.epochs):
        avg_loss = train(model, device, train_loader, optimizer, loss_fn)

        acc = validate.evaluate(model, device, val_loader)
        print("Epoch {}/{} Loss:{} Valid Acc:{}".format(epoch, params.epochs, avg_loss, acc))

        # Only a strict improvement marks the checkpoint as the best so far.
        is_best = (acc > best_acc)
        if is_best:
            best_acc = acc
        if scheduler:
            scheduler.step()

        utils.save_checkpoint({"epoch": epoch + 1,
                               "model": model.state_dict(),
                               "optimizer": optimizer.state_dict()}, is_best, split, "{}".format(params.checkpoint_dir))
        writer.add_scalar("data{}/trainingLoss{}".format(params.dataset_name, split), avg_loss, epoch)
        # The logged value is the validation accuracy, so tag it as such
        # (it was previously written under a "valLoss" tag).
        writer.add_scalar("data{}/valAcc{}".format(params.dataset_name, split), acc, epoch)
    writer.close()

# 训练配置

In [76]:
# Load the run configuration from the path declared in config_path rather
# than repeating the same literal, so there is a single place to change it.
params = utils.Params(config_path)
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [77]:
# Feature pickles for split 1.
# NOTE(review): the file name is appended to params.data_dir with no path
# separator -- confirm the configured value ends the way the writer expects.
train_pkl = "{}training128mel{}.pkl".format(params.data_dir, 1)
val_pkl = "{}validation128mel{}.pkl".format(params.data_dir, 1)

train_loader = fetch_dataloader(
    train_pkl, params.dataset_name, params.batch_size, params.num_workers)
val_loader = fetch_dataloader(
    val_pkl, params.dataset_name, params.batch_size, params.num_workers)

In [78]:
# Summary writer for this run; the dataset name is passed as the comment.
writer = SummaryWriter(comment=params.dataset_name)

In [79]:
# Build the project's DenseNet from the configured dataset name and
# pretrained flag, then move it to the selected device.
model = models.densenet.DenseNet(params.dataset_name, params.pretrained).to(device)



In [80]:
# Cross-entropy loss with Adam; learning rate and weight decay come from params.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=params.lr,
    weight_decay=params.weight_decay,
)
# Optional step schedule: multiply the learning rate by 0.1 every 30 epochs.
scheduler = (
    torch.optim.lr_scheduler.StepLR(optimizer, 30, gamma=0.1)
    if params.scheduler
    else None
)

In [81]:
# Display the worker count that was passed to the data loaders.
params.num_workers

0

In [82]:
# Run the full training loop; the literal 1 is the split identifier.
train_and_evaluate(model, device, train_loader, val_loader, optimizer, loss_fn, writer, params, 1, scheduler)

100%|██████████| 3/3 [00:22<00:00,  7.36s/it, loss=3.802]


Epoch 0/70 Loss:3.8023056983947754 Valid Acc:0.0
Checkpoint Directory does not exist


100%|██████████| 3/3 [00:22<00:00,  7.43s/it, loss=2.532]


Epoch 1/70 Loss:2.5315953890482583 Valid Acc:0.0


100%|██████████| 3/3 [00:20<00:00,  6.89s/it, loss=1.711]


Epoch 2/70 Loss:1.7114782333374023 Valid Acc:20.0


100%|██████████| 3/3 [00:20<00:00,  6.67s/it, loss=1.135]


Epoch 3/70 Loss:1.134678800900777 Valid Acc:20.0


100%|██████████| 3/3 [00:19<00:00,  6.59s/it, loss=0.757]


Epoch 4/70 Loss:0.7574990590413412 Valid Acc:20.0


100%|██████████| 3/3 [00:19<00:00,  6.39s/it, loss=0.504]


Epoch 5/70 Loss:0.503511925538381 Valid Acc:20.0


100%|██████████| 3/3 [00:20<00:00,  6.75s/it, loss=0.381]


Epoch 6/70 Loss:0.3812357783317566 Valid Acc:40.0


100%|██████████| 3/3 [00:20<00:00,  6.72s/it, loss=0.233]


Epoch 7/70 Loss:0.23338399330774942 Valid Acc:40.0


100%|██████████| 3/3 [00:19<00:00,  6.57s/it, loss=0.174]


Epoch 8/70 Loss:0.1744750738143921 Valid Acc:40.0


100%|██████████| 3/3 [00:19<00:00,  6.65s/it, loss=0.136]


Epoch 9/70 Loss:0.1359978268543879 Valid Acc:40.0


100%|██████████| 3/3 [00:19<00:00,  6.51s/it, loss=0.105]


Epoch 10/70 Loss:0.105216217537721 Valid Acc:40.0


100%|██████████| 3/3 [00:20<00:00,  6.86s/it, loss=0.087]


Epoch 11/70 Loss:0.0869710569580396 Valid Acc:40.0


100%|██████████| 3/3 [00:19<00:00,  6.54s/it, loss=0.076]


Epoch 12/70 Loss:0.07639180123806 Valid Acc:40.0


100%|██████████| 3/3 [00:19<00:00,  6.60s/it, loss=0.064]


Epoch 13/70 Loss:0.06354184448719025 Valid Acc:40.0


100%|██████████| 3/3 [00:19<00:00,  6.58s/it, loss=0.060]


Epoch 14/70 Loss:0.060121419529120125 Valid Acc:40.0


100%|██████████| 3/3 [00:19<00:00,  6.57s/it, loss=0.057]


Epoch 15/70 Loss:0.05698063845435778 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.28s/it, loss=0.046]


Epoch 16/70 Loss:0.045892554024855294 Valid Acc:40.0


100%|██████████| 3/3 [00:20<00:00,  6.73s/it, loss=0.045]


Epoch 17/70 Loss:0.0452279324332873 Valid Acc:40.0


100%|██████████| 3/3 [00:20<00:00,  6.91s/it, loss=0.046]


Epoch 18/70 Loss:0.046414277205864586 Valid Acc:40.0


100%|██████████| 3/3 [00:20<00:00,  6.93s/it, loss=0.035]


Epoch 19/70 Loss:0.03464870651563009 Valid Acc:40.0


100%|██████████| 3/3 [00:20<00:00,  6.88s/it, loss=0.036]


Epoch 20/70 Loss:0.03617993742227554 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.45s/it, loss=0.035]


Epoch 21/70 Loss:0.0345105038334926 Valid Acc:40.0


100%|██████████| 3/3 [00:20<00:00,  6.71s/it, loss=0.030]


Epoch 22/70 Loss:0.030286913737654686 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.31s/it, loss=0.028]


Epoch 23/70 Loss:0.028170902902881306 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.49s/it, loss=0.030]


Epoch 24/70 Loss:0.02977628695468108 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.56s/it, loss=0.032]


Epoch 25/70 Loss:0.03230841954549154 Valid Acc:40.0


100%|██████████| 3/3 [00:23<00:00,  7.91s/it, loss=0.029]


Epoch 26/70 Loss:0.028629037241141003 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.54s/it, loss=0.026]


Epoch 27/70 Loss:0.02589004983504613 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.37s/it, loss=0.021]


Epoch 28/70 Loss:0.02133405456940333 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.30s/it, loss=0.025]


Epoch 29/70 Loss:0.024758956705530483 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.41s/it, loss=0.021]


Epoch 30/70 Loss:0.020659872020284336 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.41s/it, loss=0.022]


Epoch 31/70 Loss:0.022073788568377495 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.36s/it, loss=0.023]


Epoch 32/70 Loss:0.0230102576315403 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.26s/it, loss=0.024]


Epoch 33/70 Loss:0.023794422547022503 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.30s/it, loss=0.025]


Epoch 34/70 Loss:0.024786490947008133 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.35s/it, loss=0.021]


Epoch 35/70 Loss:0.02096675770978133 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.30s/it, loss=0.023]


Epoch 36/70 Loss:0.023161013300220173 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.54s/it, loss=0.021]


Epoch 37/70 Loss:0.020967818175752957 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.30s/it, loss=0.019]


Epoch 38/70 Loss:0.019042784969011944 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.29s/it, loss=0.026]


Epoch 39/70 Loss:0.026213090245922405 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.49s/it, loss=0.018]


Epoch 40/70 Loss:0.017950306956966717 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.21s/it, loss=0.020]


Epoch 41/70 Loss:0.0204341368128856 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.42s/it, loss=0.021]


Epoch 42/70 Loss:0.02144251236071189 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.30s/it, loss=0.020]


Epoch 43/70 Loss:0.020147614181041718 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.49s/it, loss=0.027]


Epoch 44/70 Loss:0.02688032699127992 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.52s/it, loss=0.021]


Epoch 45/70 Loss:0.020545356596509617 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.26s/it, loss=0.019]


Epoch 46/70 Loss:0.018967966238657635 Valid Acc:40.0


100%|██████████| 3/3 [00:23<00:00,  7.77s/it, loss=0.022]


Epoch 47/70 Loss:0.021995603417356808 Valid Acc:40.0


100%|██████████| 3/3 [00:23<00:00,  7.71s/it, loss=0.023]


Epoch 48/70 Loss:0.023238808537522953 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.60s/it, loss=0.025]


Epoch 49/70 Loss:0.025462886318564415 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.65s/it, loss=0.018]


Epoch 50/70 Loss:0.0177295816441377 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.33s/it, loss=0.020]


Epoch 51/70 Loss:0.020119805509845417 Valid Acc:40.0


100%|██████████| 3/3 [00:23<00:00,  7.93s/it, loss=0.021]


Epoch 52/70 Loss:0.021012553324302036 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.32s/it, loss=0.018]


Epoch 53/70 Loss:0.017670420929789543 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.30s/it, loss=0.019]


Epoch 54/70 Loss:0.018910609806577366 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.44s/it, loss=0.019]


Epoch 55/70 Loss:0.01925298012793064 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.44s/it, loss=0.018]


Epoch 56/70 Loss:0.01780678393940131 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.44s/it, loss=0.019]


Epoch 57/70 Loss:0.019017283494273823 Valid Acc:40.0


100%|██████████| 3/3 [00:23<00:00,  7.77s/it, loss=0.020]


Epoch 58/70 Loss:0.020195110390583675 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.51s/it, loss=0.019]


Epoch 59/70 Loss:0.019351980338493984 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.49s/it, loss=0.022]


Epoch 60/70 Loss:0.021605716397364933 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.58s/it, loss=0.018]


Epoch 61/70 Loss:0.018486222873131435 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.37s/it, loss=0.017]


Epoch 62/70 Loss:0.017360294858614605 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.35s/it, loss=0.018]


Epoch 63/70 Loss:0.018069473405679066 Valid Acc:40.0


100%|██████████| 3/3 [00:21<00:00,  7.25s/it, loss=0.021]


Epoch 64/70 Loss:0.021227749064564705 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.51s/it, loss=0.019]


Epoch 65/70 Loss:0.01895884796977043 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.40s/it, loss=0.018]


Epoch 66/70 Loss:0.01754080007473628 Valid Acc:40.0


100%|██████████| 3/3 [00:23<00:00,  7.73s/it, loss=0.021]


Epoch 67/70 Loss:0.021185032402475674 Valid Acc:40.0


100%|██████████| 3/3 [00:22<00:00,  7.52s/it, loss=0.017]


Epoch 68/70 Loss:0.017014289274811745 Valid Acc:40.0


100%|██████████| 3/3 [00:23<00:00,  7.81s/it, loss=0.018]


Epoch 69/70 Loss:0.01842161174863577 Valid Acc:40.0
