In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import numpy as np
import os
import glob
import librosa
import numpy as np

In [2]:
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split


In [3]:
feature, label = [], []
# Class-name -> integer id lookup: each class gets the index of its position
# in the list below.
label_dict = {
    name: index
    for index, name in enumerate([
        'aloe', 'burger', 'cabbage', 'candied_fruits', 'carrots',
        'chips', 'chocolate', 'drinks', 'fries', 'grapes',
        'gummies', 'ice-cream', 'jelly', 'noodles', 'pickles',
        'pizza', 'ribs', 'salmon', 'soup', 'wings',
    ])
}
# Reverse lookup: integer id -> class name.
label_dict_inv = dict(zip(label_dict.values(), label_dict.keys()))

In [4]:
import librosa
import numpy as np
from tqdm import tqdm
import glob
import os

def extract_features(parent_dir, sub_dirs, max_file=10, file_ext="*.wav"):
    """Build one fixed-length feature vector per audio file of each class folder.

    For every file, a 128-band mel spectrogram (converted to dB) and 40 MFCCs
    with their first- and second-order deltas are averaged over frames and
    concatenated into a single vector of 128 + 120 = 248 values.

    Args:
        parent_dir: Directory containing one sub-directory per class.
        sub_dirs: Iterable of sub-directory names. Each name that contains at
            least one matching file must be a key of the module-level
            ``label_dict``.
        max_file: Maximum number of files read from each sub-directory.
        file_ext: Glob pattern selecting the audio files.

    Returns:
        ``[feature, label]``: a list of 1-D NumPy arrays and the parallel list
        of integer class ids taken from ``label_dict``.

    Raises:
        KeyError: If a processed sub-directory name is not in ``label_dict``.
    """
    label, feature = [], []
    for sub_dir in sub_dirs:
        # glob.glob gives no ordering guarantee. Sort before truncating so the
        # `max_file` subset and the sample order (which feeds a seeded
        # train/validation split downstream) are reproducible across runs.
        pattern = os.path.join(parent_dir, sub_dir, file_ext)
        paths = sorted(glob.glob(pattern))[:max_file]
        for fn in tqdm(paths):
            # Load the raw audio (no noise augmentation).
            X, sample_rate = librosa.load(fn, res_type='kaiser_fast')

            # Mel spectrogram in dB, averaged per mel band.
            mels = librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128)
            mels_db = librosa.power_to_db(mels)
            mels_mean = np.mean(mels_db.T, axis=0)

            # MFCCs plus first- and second-order deltas, averaged per coefficient.
            mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
            mfccs_delta = librosa.feature.delta(mfccs)
            mfccs_delta2 = librosa.feature.delta(mfccs, order=2)
            mfccs_combined = np.concatenate([mfccs, mfccs_delta, mfccs_delta2], axis=0)
            mfccs_mean = np.mean(mfccs_combined.T, axis=0)

            # Feature fusion: mel means followed by MFCC/delta means.
            combined_feature = np.concatenate([mels_mean, mfccs_mean])
            feature.append(combined_feature)
            label.append(label_dict[sub_dir])

    return [feature, label]

In [5]:
# Root of the training audio and the output directory.
parent_dir = './train/'
save_dir = "./"

# One sub-directory name per class; `folds` is a second name for the same array.
sub_dirs = np.array([
    'aloe', 'burger', 'cabbage', 'candied_fruits',
    'carrots', 'chips', 'chocolate', 'drinks', 'fries',
    'grapes', 'gummies', 'ice-cream', 'jelly', 'noodles', 'pickles',
    'pizza', 'ribs', 'salmon', 'soup', 'wings',
])
folds = sub_dirs

# Read at most 1000 files per class and collect features (X) and labels (Y).
X, Y = extract_features(parent_dir, sub_dirs, max_file=1000)

100%|██████████| 340/340 [00:07<00:00, 44.95it/s]
100%|██████████| 340/340 [00:07<00:00, 44.95it/s]
100%|██████████| 372/372 [00:08<00:00, 44.23it/s]
100%|██████████| 372/372 [00:08<00:00, 44.23it/s]
100%|██████████| 329/329 [00:14<00:00, 22.38it/s]
100%|██████████| 329/329 [00:14<00:00, 22.38it/s]
100%|██████████| 499/499 [00:20<00:00, 24.25it/s]
100%|██████████| 499/499 [00:20<00:00, 24.25it/s]
100%|██████████| 413/413 [00:14<00:00, 28.09it/s]
100%|██████████| 413/413 [00:14<00:00, 28.09it/s]
100%|██████████| 446/446 [00:16<00:00, 27.37it/s]
100%|██████████| 446/446 [00:16<00:00, 27.37it/s]
100%|██████████| 178/178 [00:06<00:00, 27.94it/s]
100%|██████████| 178/178 [00:06<00:00, 27.94it/s]
100%|██████████| 191/191 [00:05<00:00, 32.92it/s]
100%|██████████| 191/191 [00:05<00:00, 32.92it/s]
100%|██████████| 405/405 [00:13<00:00, 30.28it/s]
100%|██████████| 405/405 [00:13<00:00, 30.28it/s]
100%|██████████| 345/345 [00:12<00:00, 28.09it/s]
100%|██████████| 345/345 [00:12<00:00, 28.09it/s]


In [6]:
# Hold out 30% of the samples, stratified by label, with a fixed random seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1, stratify=Y
)

# Convert the split results to NumPy arrays.
X_train, X_test = np.array(X_train), np.array(X_test)
Y_train, Y_test = np.array(Y_train), np.array(Y_test)

print("X_train.shape:", X_train.shape)  # (4900, 248)

# Zero-pad every feature vector to 256 values, then reshape to (N, 1, 16, 16).
pad_width = 256 - X_train.shape[1]
X_train = np.pad(X_train, ((0, 0), (0, pad_width)), 'constant').reshape(-1, 1, 16, 16)
X_test = np.pad(X_test, ((0, 0), (0, pad_width)), 'constant').reshape(-1, 1, 16, 16)

# Wrap features and labels as PyTorch tensor datasets.
train_dataset = TensorDataset(torch.FloatTensor(X_train), torch.LongTensor(Y_train))
test_dataset = TensorDataset(torch.FloatTensor(X_test), torch.LongTensor(Y_test))

X_train.shape: (4900, 248)


In [7]:
# Mini-batch size shared by both loaders.
batch_size = 64
# Training batches are reshuffled each epoch; the held-out loader keeps its order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
)

In [8]:
class BasicBlock(nn.Module):
    """Residual block: two 3x3 conv + batch-norm layers with a skip connection.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the block.
        downsample: When True, the first convolution and the skip projection
            use stride 2; otherwise stride 1.
    """

    def __init__(self, in_channels, out_channels, downsample=False):
        super().__init__()
        if downsample:
            stride = 2
        else:
            stride = 1

        # Main path: conv -> BN -> ReLU -> conv -> BN -> dropout.
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=stride, padding=1
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.dropout = nn.Dropout(0.25)

        self.downsample = downsample

        # Skip path: a 1x1 conv + BN projection is needed whenever the main
        # path changes the stride or the channel count; otherwise pass through.
        shape_preserved = (not downsample) and in_channels == out_channels
        if shape_preserved:
            self.residual = nn.Identity()
        else:
            self.residual = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + skip_path(x))``."""
        shortcut = self.residual(x)

        main = self.conv1(x)
        main = self.bn1(main)
        main = self.relu(main)
        main = self.conv2(main)
        main = self.bn2(main)
        main = self.dropout(main)

        main += shortcut
        return self.relu(main)


In [9]:
class ResNetAudioClassifier(nn.Module):
    """Residual CNN classifier for single-channel 2-D feature maps.

    Layout: a 3x3 conv stem (1 -> 32 channels), three stages of two
    ``BasicBlock``s each (64, 128 and 256 channels, the first block of every
    stage built with ``downsample=True``), global average pooling and a linear
    layer producing ``num_classes`` logits.
    """

    def __init__(self, num_classes=20):
        super().__init__()
        # Stem: lift the single input channel to 32 feature channels.
        self.prep = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )

        self.layer1 = self._stage(32, 64)
        self.layer2 = self._stage(64, 128)
        self.layer3 = self._stage(128, 256)

        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Sequential(nn.Linear(256, num_classes))

    @staticmethod
    def _stage(in_channels, out_channels):
        """Build one stage: a downsampling block followed by a same-width block."""
        return nn.Sequential(
            BasicBlock(in_channels, out_channels, downsample=True),
            BasicBlock(out_channels, out_channels),
        )

    def forward(self, x):
        """Map a batch of feature maps to a ``(batch, num_classes)`` logit tensor."""
        for stage in (self.prep, self.layer1, self.layer2, self.layer3, self.pool):
            x = stage(x)
        # Flatten the pooled (batch, 256, 1, 1) map before the linear head.
        flat = x.view(x.size(0), -1)
        return self.fc(flat)


In [10]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy with label smoothing.

    The per-sample loss blends the negative log-likelihood of the target class
    (weight ``1 - smoothing``) with the mean negative log-probability over all
    classes (weight ``smoothing``); the batch mean is returned.
    """

    def __init__(self, smoothing=0.1):
        super().__init__()
        self.smoothing = smoothing

    def forward(self, pred, target):
        """Return the scalar smoothed loss for logits ``pred`` and class ids ``target``."""
        log_probs = F.log_softmax(pred, dim=-1)

        # Log-probability assigned to the true class of every sample.
        target_log_prob = log_probs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
        hard_term = -target_log_prob

        # Average negative log-probability across all classes.
        uniform_term = -log_probs.mean(dim=-1)

        keep = 1 - self.smoothing
        per_sample = keep * hard_term + self.smoothing * uniform_term
        return per_sample.mean()

In [11]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#model = AudioClassifier().to(device)
model = ResNetAudioClassifier().to(device)
#optimizer = optim.Adam(model.parameters())
optimizer = optim.Adam(
    model.parameters(),
    lr=0.0001,          # learning rate; NOTE(review): the original comment called this the "default" — verify against Adam's documented default
    weight_decay=0.01  # L2 regularisation coefficient λ; NOTE(review): the original comment suggests starting in [1e-5, 1e-3], and 0.01 lies above that range
)

# Loss with label smoothing 0.1 and a cosine-annealing learning-rate schedule.
criterion = LabelSmoothingLoss(smoothing=0.1)
# NOTE(review): T_max=100, but train_model is later called with num_epochs=500 — confirm this is intended.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)


In [12]:
def _evaluate(model, loader, criterion, device):
    """Run one evaluation pass; return ``(mean_batch_loss, accuracy_percent)``."""
    model.eval()
    loss_sum, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss_sum += criterion(outputs, labels).item()
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    num_batches = len(loader)
    mean_loss = loss_sum / num_batches if num_batches else 0.0
    accuracy = 100 * correct / total if total else 0.0
    return mean_loss, accuracy


def train_model(model, train_loader, test_loader, criterion, optimizer,
                num_epochs=100, validate_every=10, show_progress_every=1,
                scheduler=None, device=None):
    """Train ``model`` and periodically report validation metrics.

    Args:
        model: Network to optimise; it is left in eval mode on return.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of passes over ``train_loader``.
        validate_every: Validate every this many epochs. The last epoch is
            always validated.
        show_progress_every: Show the tqdm progress bar every this many epochs.
        scheduler: Learning-rate scheduler stepped once per epoch. When
            omitted, a module-level variable named ``scheduler`` is used if
            one exists (this keeps existing calls working); if there is none,
            no scheduler is stepped.
        device: Device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none).

    Returns:
        None. Progress and metrics are printed.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
    if scheduler is None:
        # Backward compatibility: earlier callers relied on a global scheduler.
        scheduler = globals().get('scheduler')

    if num_epochs <= 0:
        # Nothing to train; avoids reporting metrics that were never computed.
        print("\nNo epochs requested; nothing to train.")
        return

    train_acc = 0.0
    val_acc = 0.0
    for epoch in range(num_epochs):
        # ---- training phase ----
        model.train()
        running_loss = 0.0
        train_correct = 0
        train_total = 0

        # The progress bar is only shown on selected epochs.
        show_progress = (epoch + 1) % show_progress_every == 0

        for inputs, labels in tqdm(train_loader,
                                   desc=f"Epoch {epoch+1}/{num_epochs}",
                                   unit="batch",
                                   disable=not show_progress):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

        num_batches = len(train_loader)
        train_loss = running_loss / num_batches if num_batches else 0.0
        train_acc = 100 * train_correct / train_total if train_total else 0.0

        # ---- validation phase (at the configured interval and on the last epoch) ----
        is_last_epoch = epoch == num_epochs - 1
        if (epoch + 1) % validate_every == 0 or is_last_epoch:
            val_loss, val_acc = _evaluate(model, test_loader, criterion, device)

            print(f"\nEpoch {epoch+1}/{num_epochs}")
            print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
            print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")
            print("-" * 50)
        elif show_progress:
            # Non-validation epochs only print a short summary.
            print(f"\nEpoch {epoch+1}/{num_epochs} - Train Acc: {train_acc:.2f}%")

        # Update the learning rate once per epoch.
        if scheduler is not None:
            scheduler.step()

    # Final summary. The last epoch is always validated inside the loop, so its
    # accuracy is reused here instead of running a second evaluation pass.
    print("\nTraining completed!")
    print(f"Final Train Accuracy: {train_acc:.2f}%")
    print(f"Final Validation Accuracy: {val_acc:.2f}%")


In [13]:
# Train for 500 epochs; validation and the progress bar both run every 50 epochs.
# NOTE(review): the cosine scheduler was created with T_max=100, which is shorter
# than this 500-epoch run — confirm that is intended.
train_model(model, train_loader, test_loader, criterion, optimizer, 
           num_epochs=500, validate_every=50, show_progress_every=50)


Epoch 50/500: 100%|██████████| 77/77 [00:00<00:00, 232.03batch/s]




Epoch 50/500
Train Loss: 0.6317 | Train Acc: 99.69%
Val Loss: 0.7166 | Val Acc: 95.86%
--------------------------------------------------


Epoch 100/500: 100%|██████████| 77/77 [00:00<00:00, 233.46batch/s]




Epoch 100/500
Train Loss: 0.6011 | Train Acc: 100.00%
Val Loss: 0.6852 | Val Acc: 96.67%
--------------------------------------------------


Epoch 150/500: 100%|██████████| 77/77 [00:00<00:00, 232.38batch/s]




Epoch 150/500
Train Loss: 0.6194 | Train Acc: 99.71%
Val Loss: 0.6977 | Val Acc: 96.19%
--------------------------------------------------


Epoch 200/500: 100%|██████████| 77/77 [00:00<00:00, 228.38batch/s]




Epoch 200/500
Train Loss: 0.6148 | Train Acc: 99.88%
Val Loss: 0.6925 | Val Acc: 96.62%
--------------------------------------------------


Epoch 250/500: 100%|██████████| 77/77 [00:00<00:00, 231.92batch/s]




Epoch 250/500
Train Loss: 0.6039 | Train Acc: 100.00%
Val Loss: 0.6633 | Val Acc: 97.90%
--------------------------------------------------


Epoch 300/500: 100%|██████████| 77/77 [00:00<00:00, 222.15batch/s]




Epoch 300/500
Train Loss: 0.6009 | Train Acc: 100.00%
Val Loss: 0.6540 | Val Acc: 98.14%
--------------------------------------------------


Epoch 350/500: 100%|██████████| 77/77 [00:00<00:00, 230.27batch/s]




Epoch 350/500
Train Loss: 0.6039 | Train Acc: 100.00%
Val Loss: 0.6602 | Val Acc: 98.10%
--------------------------------------------------


Epoch 400/500: 100%|██████████| 77/77 [00:00<00:00, 234.60batch/s]




Epoch 400/500
Train Loss: 0.6136 | Train Acc: 99.80%
Val Loss: 0.6828 | Val Acc: 96.90%
--------------------------------------------------


Epoch 450/500: 100%|██████████| 77/77 [00:00<00:00, 234.70batch/s]




Epoch 450/500
Train Loss: 0.6034 | Train Acc: 100.00%
Val Loss: 0.6615 | Val Acc: 97.76%
--------------------------------------------------


Epoch 500/500: 100%|██████████| 77/77 [00:00<00:00, 232.79batch/s]




Epoch 500/500
Train Loss: 0.6016 | Train Acc: 100.00%
Val Loss: 0.6497 | Val Acc: 98.43%
--------------------------------------------------

Training completed!
Final Train Accuracy: 100.00%
Final Validation Accuracy: 98.43%


In [14]:
def extract_test_features(test_dir, max_file=None, file_ext="*.wav"):
    """Extract zero-padded 256-value feature vectors for every file in ``test_dir``.

    Each file yields the frame-averaged 128-band mel spectrogram (in dB)
    followed by the frame-averaged 40 MFCCs and their first/second-order
    deltas (128 + 120 = 248 values), zero-padded to 256.

    Args:
        test_dir: Directory holding the audio files.
        max_file: Optional cap on the number of files processed.
        file_ext: Glob pattern selecting the audio files.

    Returns:
        ``(features, file_list)``: a float tensor of shape ``(N, 256)`` and the
        list of the N file paths, in the same (sorted) order as the rows.
    """
    target_dim = 256
    features = []
    # glob.glob gives no ordering guarantee; sort so the `max_file` subset and
    # the row order of the resulting submission are reproducible.
    file_list = sorted(glob.glob(os.path.join(test_dir, file_ext)))
    if max_file is not None:
        file_list = file_list[:max_file]

    for fn in tqdm(file_list):
        # Load the raw audio (no noise augmentation).
        X, sample_rate = librosa.load(fn, res_type='kaiser_fast')

        # Mel spectrogram in dB, averaged per mel band.
        mels = librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128)
        mels_db = librosa.power_to_db(mels)
        mels_mean = np.mean(mels_db.T, axis=0)

        # MFCCs plus first- and second-order deltas, averaged per coefficient.
        mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
        mfccs_delta = librosa.feature.delta(mfccs)
        mfccs_delta2 = librosa.feature.delta(mfccs, order=2)
        mfccs_combined = np.concatenate([mfccs, mfccs_delta, mfccs_delta2], axis=0)
        mfccs_mean = np.mean(mfccs_combined.T, axis=0)

        # Feature fusion (128 + 120 values), then zero-pad to 256.
        combined_feature = np.concatenate([mels_mean, mfccs_mean])
        pad_width = target_dim - combined_feature.shape[0]
        combined_feature = np.pad(combined_feature, (0, pad_width), 'constant')
        features.append(combined_feature)

    # reshape keeps the (N, 256) layout even when no file was found (N == 0).
    feature_matrix = np.array(features, dtype=np.float32).reshape(-1, target_dim)
    return torch.FloatTensor(feature_matrix), file_list


In [15]:
# Extract features for the files in ./test_a/ and reshape them to the
# (N, 1, 16, 16) layout used for the training tensors.
test_features, file_list = extract_test_features('./test_a/')
test_features = test_features.view(-1, 1, 16, 16)

100%|██████████| 2000/2000 [01:12<00:00, 27.41it/s]
100%|██████████| 2000/2000 [01:12<00:00, 27.41it/s]


In [17]:
def predict(model, features):
    """Return the predicted class name for every sample in ``features``.

    The model is switched to eval mode, the whole batch is run in a single
    forward pass on the module-level ``device`` without gradient tracking,
    and each arg-max class id is mapped to its name via ``label_dict_inv``.
    """
    model.eval()
    batch = features.to(device)
    with torch.no_grad():
        logits = model(batch)
    class_ids = logits.argmax(dim=1).cpu().tolist()
    # Report class names rather than numeric ids.
    return [label_dict_inv[class_id] for class_id in class_ids]
# Predict a class name for every test sample; `preds` is a second name for the same list.
predictions = predict(model, test_features)
preds = predictions


In [18]:
# Build the submission table from `file_list` so that names and labels stay in the same order.
result = pd.DataFrame({
    'name': list(map(os.path.basename, file_list)),
    'label': preds,
})

In [19]:
# Keep only the part after the last '/' of each name, then write the CSV without the index column.
result['name'] = result['name'].str.split('/').str[-1]
result.to_csv('submit.csv', index=False)


In [20]:
# Sanity check: compare the number of test .wav files with the line count of
# submit.csv (the CSV is expected to have one extra line for the header).
!ls ./test_a/*.wav | wc -l
!wc -l submit.csv

2000
2001 submit.csv


# 训练集验证分数高但测试集分数低的常见原因与排查建议

1. **数据分布不一致**  
   - 检查训练/验证集与测试集的分布是否一致，类别比例是否均衡。
   - 检查是否存在数据泄漏或采样方式不当。

2. **特征处理不一致**  
   - 确认训练/验证集与测试集的特征提取、padding、reshape等所有预处理步骤完全一致。
   - 包括梅尔频谱、MFCC、补零、reshape等。

3. **标签映射问题**  
   - 检查label_dict和label_dict_inv的使用，确保预测输出与提交label一致。
   - 确认predict函数输出为类别名称且顺序正确。

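4. **模型过拟合**  
   - 如果模型在训练集上表现很好、但在验证集和测试集上都明显变差，通常是过拟合。
   - 如果训练集和验证集表现都很好、只有测试集很差（本例即如此），更可能是验证集与训练集不独立（例如同一段录音的片段被随机划分到两边），或训练数据与测试数据分布不同；此时应按录音/来源分组划分验证集。
   - 可尝试增加正则化、数据增强、减少模型复杂度等。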

5. **提交文件格式问题**  
   - 检查提交的csv文件格式，name和label列是否与官方要求一致。
   - name列应为音频文件名（不含路径），label列为类别名称。

6. **建议排查步骤**  
   - 随机抽查部分测试集样本，手动对比特征与训练集一致性。
   - 检查predict函数和label映射逻辑。
   - 检查csv文件内容与格式。