# Clasificación de Sonidos ESC-50 con ResNet-50
Audio (.wav) → Mel-espectrograma → ResNet-50 → 50 clases

In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Seed the random number generators so that shuffling, dropout and the
# initialisation of the new classifier head are repeatable across
# "Restart Kernel -> Run All". GPU kernels may still introduce small
# run-to-run differences.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Release cached GPU memory that may be left over from earlier runs in this kernel
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Dispositivo: {device}")

Dispositivo: cuda


In [2]:
# Configuration: everything a reader may want to tune lives in this cell.

# --- Training hyperparameters ---
NUM_CLASES = 50
NUM_EPOCAS = 15
BATCH_SIZE = 16  # kept small to avoid out-of-memory errors
LR = 1e-3
CONGELAR_BACKBONE = False  # False = fine-tune the whole network (needs more memory)

# --- Dataset locations (relative to the notebook's working directory) ---
DATASET_PATH = "data/ESC-50-master"
META_PATH = os.path.join(DATASET_PATH, "meta", "esc50.csv")
AUDIO_DIR = os.path.join(DATASET_PATH, "audio")
SPECTROGRAM_DIR = os.path.join(DATASET_PATH, "audio_image")

In [3]:
# Dataset
class ESC50Dataset(Dataset):
    """Dataset that serves pre-computed spectrograms stored as ``.npy`` files.

    Args:
        dataframe: Metadata table with at least the columns ``filename`` and
            ``category``.
        spectrogram_dir: Directory containing one ``.npy`` file per row, named
            after ``filename`` with its extension replaced by ``.npy``.
        transform: Optional callable applied to the ``(3, H, W)`` float tensor.
        categorias: Optional iterable with the full set of category names.
            Pass the same collection to every split so that all of them share
            one label mapping. When omitted, the categories found in
            ``dataframe`` are used, which is only safe if every split contains
            every category.

    Each item is a tuple ``(tensor, label)`` where ``label`` is the integer
    index of the row's category in the sorted category list.
    """

    def __init__(self, dataframe, spectrogram_dir, transform=None, categorias=None):
        self.dataframe = dataframe.reset_index(drop=True)
        self.spectrogram_dir = spectrogram_dir
        self.transform = transform
        if categorias is None:
            categorias = dataframe['category'].unique()
        # Sorting makes the category -> index mapping independent of row order
        self.categorias = sorted(categorias)
        self.cat_to_idx = {cat: idx for idx, cat in enumerate(self.categorias)}

    def __len__(self):
        """Return the number of rows (clips) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load, normalise and return the spectrogram and label for row ``idx``."""
        fila = self.dataframe.iloc[idx]
        etiqueta = self.cat_to_idx[fila['category']]

        # Swap only the file extension; str.replace would also rewrite a
        # '.wav' that happened to appear in the middle of the name.
        nombre_base, _ = os.path.splitext(fila['filename'])
        spec_path = os.path.join(self.spectrogram_dir, nombre_base + '.npy')
        mel_spec = np.load(spec_path)

        # Per-sample min-max scaling to [0, 1]; the epsilon avoids a division
        # by zero when the spectrogram is constant.
        minimo, maximo = mel_spec.min(), mel_spec.max()
        mel_spec = (mel_spec - minimo) / (maximo - minimo + 1e-8)

        # Replicate the single 2-D channel three times to mimic an RGB image
        mel_spec = torch.tensor(mel_spec, dtype=torch.float32).unsqueeze(0).repeat(3, 1, 1)

        if self.transform:
            mel_spec = self.transform(mel_spec)

        return mel_spec, etiqueta

In [4]:
# Prepare data: folds 1-4 are used for training, fold 5 is held out for validation
df = pd.read_csv(META_PATH)
df_train = df[df['fold'].isin([1, 2, 3, 4])]
df_val = df[df['fold'] == 5]

# Resize to the 224x224 input used with the pretrained ResNet and apply the
# ImageNet channel statistics that go with the pretrained weights
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = ESC50Dataset(df_train, SPECTROGRAM_DIR, transform)
val_dataset = ESC50Dataset(df_val, SPECTROGRAM_DIR, transform)

# Each dataset builds its label mapping from its own rows. If one split lacked
# a category the indices would silently disagree, so fail loudly instead.
assert train_dataset.cat_to_idx == val_dataset.cat_to_idx, \
    "train and validation splits map categories to different label indices"
assert len(train_dataset.categorias) == NUM_CLASES, \
    f"found {len(train_dataset.categorias)} categories but NUM_CLASES = {NUM_CLASES}"

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

print(f"Train: {len(df_train)} | Val: {len(df_val)}")

Train: 1600 | Val: 400


In [5]:
# ResNet-50 with pretrained ImageNet weights; its classifier is replaced below
modelo = models.resnet50(weights='IMAGENET1K_V2')

# Optionally freeze every pretrained weight. This happens before the new head
# is attached, so the head's parameters always remain trainable.
if CONGELAR_BACKBONE:
    modelo.requires_grad_(False)

# New classification head: dropout -> hidden layer -> dropout -> class logits
dim_entrada = modelo.fc.in_features  # 2048 for ResNet-50
capas_cabeza = [
    nn.Dropout(0.5),
    nn.Linear(dim_entrada, 512),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(512, NUM_CLASES),
]
modelo.fc = nn.Sequential(*capas_cabeza)

modelo = modelo.to(device)
print(f"Params entrenables: {sum(p.numel() for p in modelo.parameters() if p.requires_grad):,}")

Params entrenables: 24,582,770


In [6]:
# Training setup: loss, optimizer and learning-rate schedule
criterio = nn.CrossEntropyLoss()

# Only parameters that still require gradients are handed to the optimizer,
# so a frozen backbone is skipped.
params_entrenables = [p for p in modelo.parameters() if p.requires_grad]
optimizador = optim.Adam(params_entrenables, lr=LR, weight_decay=1e-4)

# Halve the learning rate when the monitored value (the validation loss passed
# to scheduler.step) has not decreased for more than 3 consecutive epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizador,
    mode='min',
    factor=0.5,
    patience=3,
)

In [7]:
# Funciones de entrenamiento y evaluación
def entrenar(modelo, loader, criterio, optimizador):
    """Train ``modelo`` for one full pass over ``loader``.

    Args:
        modelo: Network to optimise; it is switched to training mode.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterio: Loss function called as ``criterio(outputs, labels)``.
        optimizador: Optimizer that updates the model's parameters.

    Returns:
        Tuple ``(mean_loss, accuracy)`` averaged over all samples seen.
    """
    modelo.train()

    # Take the target device from the model itself instead of relying on a
    # notebook-level global, so the function works wherever the model lives.
    primer_param = next(modelo.parameters(), None)
    dispositivo = primer_param.device if primer_param is not None else None

    total_loss, correctos, total = 0, 0, 0

    for imgs, labels in tqdm(loader, leave=False):
        if dispositivo is not None:
            imgs, labels = imgs.to(dispositivo), labels.to(dispositivo)

        optimizador.zero_grad()
        outputs = modelo(imgs)
        loss = criterio(outputs, labels)
        loss.backward()
        optimizador.step()

        # Weight the batch loss by the batch size so that a smaller final
        # batch does not skew the epoch average.
        total_loss += loss.item() * imgs.size(0)
        correctos += (outputs.argmax(1) == labels).sum().item()
        total += labels.size(0)

    return total_loss / total, correctos / total

def evaluar(modelo, loader, criterio):
    """Evaluate ``modelo`` on every batch of ``loader`` without gradients.

    Args:
        modelo: Network to evaluate; it is switched to evaluation mode and
            left in that mode on return.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterio: Loss function called as ``criterio(outputs, labels)``.

    Returns:
        Tuple ``(mean_loss, accuracy)`` averaged over all samples seen.
    """
    modelo.eval()

    # Take the target device from the model itself instead of relying on a
    # notebook-level global, so the function works wherever the model lives.
    primer_param = next(modelo.parameters(), None)
    dispositivo = primer_param.device if primer_param is not None else None

    total_loss, correctos, total = 0, 0, 0

    with torch.no_grad():
        for imgs, labels in loader:
            if dispositivo is not None:
                imgs, labels = imgs.to(dispositivo), labels.to(dispositivo)
            outputs = modelo(imgs)
            loss = criterio(outputs, labels)

            # Weight the batch loss by the batch size so that a smaller final
            # batch does not skew the average.
            total_loss += loss.item() * imgs.size(0)
            correctos += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)

    return total_loss / total, correctos / total

In [8]:
# Training loop: keep a checkpoint of the weights with the best validation accuracy
mejor_acc = 0.0

for epoca in range(NUM_EPOCAS):
    train_loss, train_acc = entrenar(modelo, train_loader, criterio, optimizador)
    val_loss, val_acc = evaluar(modelo, val_loader, criterio)

    # The plateau scheduler monitors the validation loss
    scheduler.step(val_loss)

    # Evaluate the improvement once; it drives both the checkpoint and the marker
    es_mejor = val_acc > mejor_acc
    if es_mejor:
        mejor_acc = val_acc
        torch.save(modelo.state_dict(), 'mejor_modelo.pth')
        marca = "*"
    else:
        marca = ""

    print(f"Ep {epoca+1:2d}/{NUM_EPOCAS} | Train: {train_acc*100:5.2f}% | Val: {val_acc*100:5.2f}% {marca}")

print(f"\nMejor accuracy: {mejor_acc*100:.2f}%")

                                                                                                                       

Ep  1/15 | Train: 15.69% | Val: 24.50% *


                                                                                                                       

Ep  2/15 | Train: 32.94% | Val: 30.25% *


                                                                                                                       

Ep  3/15 | Train: 41.88% | Val: 47.00% *


                                                                                                                       

Ep  4/15 | Train: 58.75% | Val: 44.00% 


                                                                                                                       

Ep  5/15 | Train: 62.06% | Val: 51.00% *


                                                                                                                       

Ep  6/15 | Train: 72.00% | Val: 53.50% *


                                                                                                                       

Ep  7/15 | Train: 74.12% | Val: 57.50% *


                                                                                                                       

Ep  8/15 | Train: 76.31% | Val: 60.25% *


                                                                                                                       

Ep  9/15 | Train: 78.81% | Val: 53.50% 


                                                                                                                       

Ep 10/15 | Train: 85.06% | Val: 56.50% 


                                                                                                                       

Ep 11/15 | Train: 85.19% | Val: 63.00% *


                                                                                                                       

Ep 12/15 | Train: 87.00% | Val: 57.50% 


                                                                                                                       

Ep 13/15 | Train: 85.12% | Val: 54.75% 


                                                                                                                       

Ep 14/15 | Train: 88.19% | Val: 63.75% *


                                                                                                                       

Ep 15/15 | Train: 87.38% | Val: 65.75% *

Mejor accuracy: 65.75%


In [9]:
# Final evaluation: restore the best checkpoint saved during training.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU still loads on a CPU-only machine.
estado = torch.load('mejor_modelo.pth', map_location=device)
modelo.load_state_dict(estado)
val_loss, val_acc = evaluar(modelo, val_loader, criterio)
print(f"Accuracy final: {val_acc*100:.2f}%")

Accuracy final: 65.75%
