In [None]:
import os
import random
from typing import List

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio
from sklearn.metrics import accuracy_score
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet18, resnet34, resnet50, densenet121, densenet169
from torchvision.transforms import ToTensor
from tqdm.notebook import tqdm
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
from transformers import AutoModelForSequenceClassification, Wav2Vec2ForSequenceClassification

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
# Number of target classes for the classification head.
num_labels = 50  # Replace with the number of classes in your dataset

# Load the pre-trained Wav2Vec2 encoder and let the library build the
# classification head for `num_labels` classes. The head is a projector
# followed by a classifier, and the classifier consumes the projector output
# (`config.classifier_proj_size` features). Assigning
# `nn.Linear(config.hidden_size, num_labels)` to `.classifier` by hand
# therefore produces a shape mismatch in the forward pass.
# NOTE(review): this checkpoint appears to be published for speech
# recognition, so the classification head is newly initialised - confirm.
wav2vec_model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base-960h",
    num_labels=num_labels,
)

# Move the model to the desired device (e.g., GPU)
wav2vec_model = wav2vec_model.to(device, non_blocking=True)

In [None]:

# Number of target classes for the classification head.
num_labels = 50  # Replace with the number of classes in your dataset

# NOTE(review): despite the "beats" naming, "beomi/kcbert-base" appears to be
# a BERT *text* checkpoint (it expects token ids), not the BEATs audio model.
# Confirm the intended checkpoint before training it on audio features.
#
# Passing `num_labels` builds the classifier with the right output size and
# keeps `config.num_labels` in sync with it; swapping `.classifier` by hand
# leaves the config at its default label count.
beats_model = AutoModelForSequenceClassification.from_pretrained(
    "beomi/kcbert-base",
    num_labels=num_labels,
)

# Move the model to the desired device (e.g., GPU)
beats_model = beats_model.to(device, non_blocking=True)

In [None]:
class ESC50DataWav2Vec(Dataset):
    """In-memory dataset of raw waveforms paired with integer class labels.

    Every file listed in ``df`` is loaded eagerly with ``torchaudio.load``
    when the dataset is constructed, so construction time and memory grow
    with the number of rows.

    Args:
        base: Directory that the file names in ``df[in_col]`` are relative to.
        df: DataFrame with one row per audio clip.
        in_col: Name of the column holding the audio file name.
        out_col: Name of the column holding the class name.

    Attributes:
        categories: Sorted unique class names found in ``df[out_col]``.
        c2i / i2c: Class-name -> index and index -> class-name mappings.
        data: Loaded waveforms, in row order.
        labels: Integer class index for each waveform, in row order.
    """

    def __init__(self, base, df, in_col, out_col):
        self.df = df
        self.data = []
        self.labels = []
        self.categories = sorted(df[out_col].unique())
        self.c2i = {category: i for i, category in enumerate(self.categories)}
        self.i2c = {i: category for i, category in enumerate(self.categories)}
        for ind in tqdm(range(len(df))):
            row = df.iloc[ind]
            file_path = os.path.join(base, row[in_col])
            # NOTE(review): the sample rate is discarded and the waveform is
            # stored exactly as loaded (no resampling, no channel mixing) -
            # confirm the downstream model accepts that rate and layout.
            waveform, _sample_rate = torchaudio.load(file_path)
            self.data.append(waveform)
            # Look the label up through `out_col` (the column the mapping was
            # built from) instead of a hard-coded 'category' column.
            self.labels.append(self.c2i[row[out_col]])

    def __len__(self):
        """Return the number of loaded clips."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(waveform, label)`` for ``idx``; the waveform is cast to float32."""
        waveform = self.data[idx].float()
        label = self.labels[idx]
        return waveform, label
    

In [None]:
# Read the ESC-50 metadata table and hold out fold 5 for validation.
df = pd.read_csv('/home/almogk/ESC-50-master/meta/esc50.csv')

valid = df.loc[df['fold'] == 5]
train = df.loc[df['fold'] != 5]

In [None]:
# Read the ESC-50 metadata table and hold out fold 5 for validation.
df = pd.read_csv('/home/almogk/ESC-50-master/meta/esc50.csv')

valid = df.loc[df['fold'] == 5]
train = df.loc[df['fold'] != 5]

# Raw-waveform datasets; every clip is loaded into memory up front.
train_data = ESC50DataWav2Vec(
    '/home/almogk/ESC-50-master/audio', train, in_col='filename', out_col='category'
)
valid_data = ESC50DataWav2Vec(
    '/home/almogk/ESC-50-master/audio', valid, in_col='filename', out_col='category'
)

# Both loaders serve shuffled mini-batches of 20 clips.
train_loader = DataLoader(dataset=train_data, batch_size=20, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=20, shuffle=True)

In [None]:
class ESC50Data(Dataset):
    """In-memory dataset of log-mel spectrogram images with integer labels.

    Every file listed in ``df`` is converted eagerly at construction time:
    the audio is loaded, fixed to 5 seconds, turned into a dB-scaled mel
    spectrogram, and rescaled to a ``uint8`` image with a leading channel
    axis.

    Args:
        base: Directory that the file names in ``df[in_col]`` are relative to.
        df: DataFrame with one row per audio clip.
        in_col: Name of the column holding the audio file name.
        out_col: Name of the column holding the class name.

    Attributes:
        categories: Sorted unique class names found in ``df[out_col]``.
        c2i / i2c: Class-name -> index and index -> class-name mappings.
        data: One ``(1, n_mels, frames)`` uint8 array per clip, in row order.
        labels: Integer class index for each clip, in row order.
    """

    def __init__(self, base, df, in_col, out_col):
        self.df = df
        self.data = []
        self.labels = []
        self.categories = sorted(df[out_col].unique())
        self.c2i = {category: i for i, category in enumerate(self.categories)}
        self.i2c = {i: category for i, category in enumerate(self.categories)}
        for ind in tqdm(range(len(df))):
            row = df.iloc[ind]
            file_path = os.path.join(base, row[in_col])
            image = self.spec_to_image(self.get_melspectrogram_db(file_path))
            # Add a leading channel axis so each item is (1, n_mels, frames).
            self.data.append(image[np.newaxis, ...])
            # Look the label up through `out_col` (the column the mapping was
            # built from) instead of a hard-coded 'category' column.
            self.labels.append(self.c2i[row[out_col]])

    def __len__(self):
        """Return the number of converted clips."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(spectrogram_image, label)`` for ``idx``."""
        return self.data[idx], self.labels[idx]

    def spec_to_image(self, spec, eps=1e-6):
        """Standardise ``spec`` and rescale it to a uint8 image in [0, 255].

        Args:
            spec: Spectrogram array.
            eps: Small constant added to the standard deviation to avoid
                dividing by zero.

        Returns:
            ``uint8`` array with the same shape as ``spec``. A constant input
            (no dynamic range) maps to all zeros rather than dividing by zero.
        """
        mean = spec.mean()
        std = spec.std()
        spec_norm = (spec - mean) / (std + eps)
        spec_min, spec_max = spec_norm.min(), spec_norm.max()
        span = spec_max - spec_min
        if span == 0:
            # A flat spectrogram would otherwise produce 0/0 = NaN, and
            # casting NaN to uint8 gives undefined pixel values.
            return np.zeros(spec_norm.shape, dtype=np.uint8)
        spec_scaled = 255 * (spec_norm - spec_min) / span
        return spec_scaled.astype(np.uint8)

    def get_melspectrogram_db(self, file_path, sr=None, n_fft=2048, hop_length=512, n_mels=128, fmin=24, fmax=8300, top_db=80):
        """Load ``file_path`` and return its dB-scaled mel spectrogram.

        The signal is fixed to 5 seconds first: shorter clips are
        reflect-padded on both sides, longer clips are truncated.

        Args:
            file_path: Path of the audio file to load.
            sr: Target sample rate passed to ``librosa.load`` (``None`` keeps
                the file's own rate).
            n_fft, hop_length, n_mels, fmin, fmax: Forwarded to
                ``librosa.feature.melspectrogram``.
            top_db: Forwarded to ``librosa.power_to_db``.

        Returns:
            The array returned by ``librosa.power_to_db``.
        """
        wav, sr = librosa.load(file_path, sr=sr)
        target_len = 5 * sr
        if wav.shape[0] < target_len:
            pad = int(np.ceil((target_len - wav.shape[0]) / 2))
            wav = np.pad(wav, pad, mode='reflect')
        # Symmetric padding overshoots by one sample when the deficit is odd;
        # trimming here gives every clip exactly `target_len` samples.
        wav = wav[:target_len]

        # `y` is passed by keyword: recent librosa releases no longer accept
        # the audio as a positional argument.
        spec = librosa.feature.melspectrogram(
            y=wav, sr=sr, n_fft=n_fft, hop_length=hop_length,
            n_mels=n_mels, fmin=fmin, fmax=fmax,
        )
        spec_db = librosa.power_to_db(spec, top_db=top_db)
        return spec_db

In [None]:
# Read the ESC-50 metadata table and hold out fold 5 for validation.
df = pd.read_csv('/home/almogk/ESC-50-master/meta/esc50.csv')

valid = df.loc[df['fold'] == 5]
train = df.loc[df['fold'] != 5]

In [None]:
# Spectrogram-image datasets; every clip is converted up front.
train_data = ESC50Data(
    '/home/almogk/ESC-50-master/audio', train, in_col='filename', out_col='category'
)
valid_data = ESC50Data(
    '/home/almogk/ESC-50-master/audio', valid, in_col='filename', out_col='category'
)

# Both loaders serve shuffled mini-batches of 20 spectrograms.
train_loader = DataLoader(dataset=train_data, batch_size=20, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=20, shuffle=True)

In [None]:
# DenseNet-121 with pretrained weights, adapted to 50 classes and 1-channel input.
densenet_model_121 = densenet121(pretrained=True)

# New classification head: same input width as the stock head, 50 outputs.
num_ftrs = densenet_model_121.classifier.in_features
densenet_model_121.classifier = nn.Linear(in_features=num_ftrs, out_features=50)

# New single-channel stem convolution (freshly initialised; the pretrained
# stem weights are not carried over).
densenet_model_121.features.conv0 = nn.Conv2d(
    in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3, bias=False
)

densenet_model_121 = densenet_model_121.to(device, non_blocking=True)

In [None]:
# DenseNet-169 with pretrained weights, adapted to 50 classes and 1-channel input.
densenet_model_169 = densenet169(pretrained=True)

# New classification head: same input width as the stock head, 50 outputs.
num_ftrs = densenet_model_169.classifier.in_features
densenet_model_169.classifier = nn.Linear(in_features=num_ftrs, out_features=50)

# New single-channel stem convolution (freshly initialised; the pretrained
# stem weights are not carried over).
densenet_model_169.features.conv0 = nn.Conv2d(
    in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3, bias=False
)

densenet_model_169 = densenet_model_169.to(device, non_blocking=True)

In [None]:
# ResNet-18 with pretrained weights, adapted to 50 classes and 1-channel input.
resnet_model_18 = resnet18(pretrained=True)

# New fully connected head: same input width as the stock head, 50 outputs.
num_ftrs = resnet_model_18.fc.in_features
resnet_model_18.fc = nn.Linear(in_features=num_ftrs, out_features=50)

# New single-channel stem convolution (freshly initialised; the pretrained
# stem weights are not carried over).
resnet_model_18.conv1 = nn.Conv2d(
    in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3, bias=False
)

resnet_model_18 = resnet_model_18.to(device, non_blocking=True)

In [None]:
# ResNet-34 with pretrained weights, adapted to 50 classes and 1-channel input.
resnet_model_34 = resnet34(pretrained=True)

# Size the new head from the existing one instead of hard-coding 512, the
# same way the other backbones in this notebook are adapted.
num_ftrs = resnet_model_34.fc.in_features
resnet_model_34.fc = nn.Linear(num_ftrs, 50)

# New single-channel stem convolution (freshly initialised; the pretrained
# stem weights are not carried over).
resnet_model_34.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
resnet_model_34 = resnet_model_34.to(device, non_blocking=True)

In [None]:
# ResNet-50 with pretrained weights, adapted to 50 classes and 1-channel input.
resnet_model_50 = resnet50(pretrained=True)

# New fully connected head: same input width as the stock head, 50 outputs.
num_ftrs = resnet_model_50.fc.in_features
resnet_model_50.fc = nn.Linear(in_features=num_ftrs, out_features=50)

# New single-channel stem convolution (freshly initialised; the pretrained
# stem weights are not carried over).
resnet_model_50.conv1 = nn.Conv2d(
    in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3, bias=False
)

resnet_model_50 = resnet_model_50.to(device, non_blocking=True)

In [None]:
def _extract_logits(output):
    """Return the logits tensor from a model's forward output.

    Models that return a tensor are passed through unchanged; models that
    return an output object exposing ``.logits`` have that attribute returned.
    """
    return getattr(output, "logits", output)


def train(model, loss_fn, train_loader, valid_loader, epochs, optimizer, learning_rate, train_losses, valid_losses, device, change_lr=None):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Network to optimise. Its forward output is either a logits
            tensor or an object with a ``.logits`` attribute.
        loss_fn: Callable ``loss_fn(logits, targets)`` returning a scalar loss.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        valid_loader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Number of epochs to run (epochs are numbered from 1).
        optimizer: Optimizer holding the parameters to update.
        learning_rate: Base learning rate, forwarded to ``change_lr``.
        train_losses: List that receives one list of batch losses per epoch.
        valid_losses: List that receives one list of batch losses per epoch.
        device: Device the batches are moved to (``torch.device`` or string).
        change_lr: Optional ``change_lr(optimizer, epoch, learning_rate)``
            callable invoked at the start of every epoch; it must return the
            optimizer to use.

    Returns:
        None. Results are appended to ``train_losses`` / ``valid_losses`` and
        a per-epoch summary is printed.
    """
    print('running on ' + str(device))
    torch.set_grad_enabled(True)

    # Mixed precision only applies on CUDA; disabling it elsewhere keeps the
    # scaler a transparent pass-through on CPU.
    use_amp = torch.device(device).type == 'cuda'
    scaler = GradScaler(enabled=use_amp)

    for epoch in tqdm(range(1, epochs+1)):
        model.train()
        batch_losses = []
        if change_lr:
            optimizer = change_lr(optimizer, epoch, learning_rate)
        # The batch loop runs every epoch, whether or not a learning-rate
        # hook was supplied.
        for x, y in train_loader:
            optimizer.zero_grad()
            x = x.to(device, dtype=torch.float32)
            y = y.to(device, dtype=torch.long)

            with autocast(enabled=use_amp):
                y_hat = _extract_logits(model(x))
                loss = loss_fn(y_hat, y)

            scaler.scale(loss).backward()
            batch_losses.append(loss.item())
            scaler.step(optimizer)
            scaler.update()
        train_losses.append(batch_losses)
        print(f'Epoch - {epoch} Train-Loss : {np.mean(train_losses[-1])}')

        model.eval()
        batch_losses = []
        trace_y = []
        trace_yhat = []
        # No gradients are needed for validation; skipping graph construction
        # saves memory.
        with torch.no_grad():
            for x, y in valid_loader:
                x = x.to(device, dtype=torch.float32)
                y = y.to(device, dtype=torch.long)

                y_hat = _extract_logits(model(x))
                loss = loss_fn(y_hat, y)
                trace_y.append(y.cpu().detach().numpy())
                trace_yhat.append(y_hat.cpu().detach().numpy())
                batch_losses.append(loss.item())
        valid_losses.append(batch_losses)
        if trace_y:
            trace_y = np.concatenate(trace_y)
            trace_yhat = np.concatenate(trace_yhat)
            accuracy = np.mean(trace_yhat.argmax(axis=1) == trace_y)
        else:
            # An empty validation loader has no accuracy to report.
            accuracy = float('nan')
        print(f'Epoch - {epoch} Valid-Loss : {np.mean(valid_losses[-1])} Valid-Accuracy : {accuracy}')
        

In [None]:
def setlr(optimizer, lr):
    """Set the learning rate of every parameter group to ``lr``.

    The optimizer is modified in place and also returned.
    """
    groups = optimizer.param_groups
    for position in range(len(groups)):
        groups[position]['lr'] = lr
    return optimizer

def lr_decay(optimizer, epoch, learning_rate):
    """Step-decay schedule: divide the base rate by 5 for every 5 epochs elapsed.

    The rate is only rewritten on epochs that are multiples of 5; on any
    other epoch the optimizer is returned untouched.
    """
    if epoch % 5 != 0:
        return optimizer
    steps_elapsed = epoch // 5
    new_lr = learning_rate / (5 ** steps_elapsed)
    optimizer = setlr(optimizer, new_lr)
    print(f'Changed learning rate to {new_lr}')
    return optimizer

In [None]:

# Fine-tune the Wav2Vec2 classifier.
# NOTE(review): this uses whichever `train_loader` / `valid_loader` were
# built most recently - confirm they hold waveforms rather than spectrograms.
epochs = 50
learning_rate = 1e-5
loss_fn = nn.CrossEntropyLoss()

# Only parameters that require gradients are handed to the optimizer.
trainables = [param for param in wav2vec_model.parameters() if param.requires_grad]
print('Total parameter number is : {:.3f} million'.format(sum(param.numel() for param in wav2vec_model.parameters()) / 1e6))
print('Total trainable parameter number is : {:.3f} million'.format(sum(param.numel() for param in trainables) / 1e6))
optimizer = torch.optim.Adam(trainables, lr=learning_rate, betas=(0.95, 0.999), weight_decay=5e-6)

# Per-epoch lists of batch losses, filled in place by train().
resnet_train_losses = []
resnet_valid_losses = []

train(
    model=wav2vec_model,
    loss_fn=loss_fn,
    train_loader=train_loader,
    valid_loader=valid_loader,
    epochs=epochs,
    optimizer=optimizer,
    learning_rate=learning_rate,
    train_losses=resnet_train_losses,
    valid_losses=resnet_valid_losses,
    device=device,
    change_lr=lr_decay,
)

In [None]:
# Optimizer setup for `beats_model`.
learning_rate = 1e-4

# Only parameters that require gradients are handed to the optimizer.
trainables = [param for param in beats_model.parameters() if param.requires_grad]
print('Total parameter number is : {:.3f} million'.format(sum(param.numel() for param in beats_model.parameters()) / 1e6))
print('Total trainable parameter number is : {:.3f} million'.format(sum(param.numel() for param in trainables) / 1e6))
optimizer = torch.optim.Adam(trainables, lr=learning_rate, betas=(0.95, 0.999), weight_decay=5e-7)

In [None]:
# Train `beats_model` with the optimizer configured in the previous cell.
loss_fn = nn.CrossEntropyLoss()
epochs = 50

# Per-epoch lists of batch losses, filled in place by train().
resnet_train_losses = []
resnet_valid_losses = []

train(
    model=beats_model,
    loss_fn=loss_fn,
    train_loader=train_loader,
    valid_loader=valid_loader,
    epochs=epochs,
    optimizer=optimizer,
    learning_rate=learning_rate,
    train_losses=resnet_train_losses,
    valid_losses=resnet_valid_losses,
    device=device,
    change_lr=lr_decay,
)

In [None]:
# Optimizer setup for ResNet-34.
learning_rate = 1e-4

# Only parameters that require gradients are handed to the optimizer.
trainables = [param for param in resnet_model_34.parameters() if param.requires_grad]
print('Total parameter number is : {:.3f} million'.format(sum(param.numel() for param in resnet_model_34.parameters()) / 1e6))
print('Total trainable parameter number is : {:.3f} million'.format(sum(param.numel() for param in trainables) / 1e6))
optimizer = torch.optim.Adam(trainables, lr=learning_rate, betas=(0.95, 0.999), weight_decay=5e-7)

# Disabled alternative schedule, kept for reference:
# scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, list(range(2, 1000, 5)),gamma=0.85)

In [None]:
# Train ResNet-34 with the optimizer configured in the previous cell.
loss_fn = nn.CrossEntropyLoss()
epochs = 50

# Per-epoch lists of batch losses, filled in place by train().
resnet_train_losses = []
resnet_valid_losses = []

train(
    model=resnet_model_34,
    loss_fn=loss_fn,
    train_loader=train_loader,
    valid_loader=valid_loader,
    epochs=epochs,
    optimizer=optimizer,
    learning_rate=learning_rate,
    train_losses=resnet_train_losses,
    valid_losses=resnet_valid_losses,
    device=device,
    change_lr=lr_decay,
)

In [None]:
# Fine-tune DenseNet-169.
# (A stray bare `densenet_model_169` expression used to open this cell; it
# was not the cell's last expression, so it displayed nothing and is removed.)
learning_rate = 1e-5

# Only parameters that require gradients are handed to the optimizer.
trainables = [p for p in densenet_model_169.parameters() if p.requires_grad]
print('Total parameter number is : {:.3f} million'.format(sum(p.numel() for p in densenet_model_169.parameters()) / 1e6))
print('Total trainable parameter number is : {:.3f} million'.format(sum(p.numel() for p in trainables) / 1e6))
optimizer = torch.optim.Adam(trainables, learning_rate, weight_decay=5e-6, betas=(0.95, 0.999))

epochs = 50
loss_fn = nn.CrossEntropyLoss()
# Per-epoch lists of batch losses, filled in place by train().
resnet_train_losses = []
resnet_valid_losses = []

train(densenet_model_169, loss_fn, train_loader, valid_loader, epochs, optimizer, learning_rate, resnet_train_losses, resnet_valid_losses, device, lr_decay)

In [None]:
# Fine-tune DenseNet-121.
epochs = 50
learning_rate = 1e-5
loss_fn = nn.CrossEntropyLoss()

# Only parameters that require gradients are handed to the optimizer.
trainables = [param for param in densenet_model_121.parameters() if param.requires_grad]
print('Total parameter number is : {:.3f} million'.format(sum(param.numel() for param in densenet_model_121.parameters()) / 1e6))
print('Total trainable parameter number is : {:.3f} million'.format(sum(param.numel() for param in trainables) / 1e6))
optimizer = torch.optim.Adam(trainables, lr=learning_rate, betas=(0.95, 0.999), weight_decay=5e-6)

# Per-epoch lists of batch losses, filled in place by train().
resnet_train_losses = []
resnet_valid_losses = []

train(
    model=densenet_model_121,
    loss_fn=loss_fn,
    train_loader=train_loader,
    valid_loader=valid_loader,
    epochs=epochs,
    optimizer=optimizer,
    learning_rate=learning_rate,
    train_losses=resnet_train_losses,
    valid_losses=resnet_valid_losses,
    device=device,
    change_lr=lr_decay,
)

In [None]:
# Fine-tune ResNet-18.
epochs = 50
learning_rate = 1e-4
loss_fn = nn.CrossEntropyLoss()

# Only parameters that require gradients are handed to the optimizer.
trainables = [param for param in resnet_model_18.parameters() if param.requires_grad]
print('Total parameter number is : {:.3f} million'.format(sum(param.numel() for param in resnet_model_18.parameters()) / 1e6))
print('Total trainable parameter number is : {:.3f} million'.format(sum(param.numel() for param in trainables) / 1e6))
optimizer = torch.optim.Adam(trainables, lr=learning_rate, betas=(0.95, 0.999), weight_decay=5e-6)

# Per-epoch lists of batch losses, filled in place by train().
resnet_train_losses = []
resnet_valid_losses = []

train(
    model=resnet_model_18,
    loss_fn=loss_fn,
    train_loader=train_loader,
    valid_loader=valid_loader,
    epochs=epochs,
    optimizer=optimizer,
    learning_rate=learning_rate,
    train_losses=resnet_train_losses,
    valid_losses=resnet_valid_losses,
    device=device,
    change_lr=lr_decay,
)

In [None]:
# Fine-tune ResNet-50.
epochs = 50
learning_rate = 1e-4
loss_fn = nn.CrossEntropyLoss()

# Only parameters that require gradients are handed to the optimizer.
trainables = [param for param in resnet_model_50.parameters() if param.requires_grad]
print('Total parameter number is : {:.3f} million'.format(sum(param.numel() for param in resnet_model_50.parameters()) / 1e6))
print('Total trainable parameter number is : {:.3f} million'.format(sum(param.numel() for param in trainables) / 1e6))
optimizer = torch.optim.Adam(trainables, lr=learning_rate, betas=(0.95, 0.999), weight_decay=5e-6)

# Per-epoch lists of batch losses, filled in place by train().
resnet_train_losses = []
resnet_valid_losses = []

train(
    model=resnet_model_50,
    loss_fn=loss_fn,
    train_loader=train_loader,
    valid_loader=valid_loader,
    epochs=epochs,
    optimizer=optimizer,
    learning_rate=learning_rate,
    train_losses=resnet_train_losses,
    valid_losses=resnet_valid_losses,
    device=device,
    change_lr=lr_decay,
)

In [None]:
# Flatten the per-epoch batch losses into one curve per split.
# NOTE(review): these lists hold the losses of whichever model was trained
# last, despite the `resnet_` prefix - confirm which run is being plotted.
tl = np.ravel(resnet_train_losses)
vl = np.ravel(resnet_valid_losses)

plt.figure(figsize=(12, 6))

# Left panel: training loss per batch.
plt.subplot(121)
plt.plot(tl)
plt.legend(['Train Loss'])

# Right panel: validation loss per batch.
plt.subplot(122)
plt.plot(vl, 'orange')
plt.legend(['Valid Loss'])