In [9]:
import warnings
warnings.filterwarnings('ignore')

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchaudio
import torchaudio.models as models
import torchvision.models as vmodels
import torchaudio.transforms as transforms
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import glob
import librosa
import librosa.display
import soundfile as sf
from scipy.interpolate import interp1d
import IPython.display as ipd
import ast

In [10]:
# Hyper parameters
EPOCHS = 1
TRAIN_BATCH_SIZE = 4

In [11]:
## need to work on padding tensors to have consistent shape

class MelSpecDataset(Dataset):
    def __init__(self, source_file):
        self.source = pd.read_csv(source_file)
        
        mel_specs = []
        wvs = []
        for idx in range(len(self.source)):
            ex = self.source.iloc[idx]
            waveform = ex.waveform

            if isinstance(waveform, str): 
                wv = ','.join(ex.waveform.replace('[ ', '[').split())
                wv = np.array(ast.literal_eval(wv))
                waveform = torch.from_numpy(wv).view(1, -1).to(dtype=torch.float32)

                wvs.append(waveform)

            sample_rate = int(ex.sample_rate)
            
            waveform = waveform.view(1, 1, -1)
            mel_spec = transforms.MelSpectrogram(sample_rate=sample_rate)(waveform).repeat(1, 3, 1, 1)
            #mfcc = librosa.feature.mfcc(y=wv, sr=ex.sample_rate)

            mel_specs.append(mel_spec)

        if 'mel_spec' in self.source:
            self.source = self.source.assign(mel_spec=mel_specs)
        else:
            self.source.insert(4, "mel_spec", mel_specs, True)
        if len(wvs) > 0:
            self.source = self.source.assign(waveform=wvs)
    
    def __getitem__(self, idx):
        ex = self.source.iloc[idx]
        return (ex.mel_spec, ex.species_id)
    def __len__(self):
        return len(self.source)

In [12]:
# CUDA
is_cuda = True and torch.cuda.is_available()
device = torch.device('cuda') if is_cuda else torch.device('cpu')

print ('cuda?: ', is_cuda)

torch.cuda.empty_cache()

cuda?:  True


In [13]:
trainset = MelSpecDataset('train_classified-10.csv')
trainloader = torch.utils.data.DataLoader(trainset, batch_size=TRAIN_BATCH_SIZE,shuffle=True)

In [16]:
resnet = vmodels.resnet50(pretrained=True).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(resnet.parameters(), lr=0.0001)

In [36]:
# when tensors are of the same shape from the dataset, will use trainloader instead of trainset directly

for epoch in range(EPOCHS):
    for i, data in enumerate(trainset):
        inputs, labels = data
        inputs, labels = inputs.to(device), torch.tensor([int(labels)]).to(device)
        
        optimizer.zero_grad()
                
        output = resnet(inputs)
        
        loss = criterion(output, labels)
        loss.backward()
        print (loss.item())

7.9114532470703125
7.147956371307373
7.438002586364746
7.384486198425293
7.979250907897949
7.059122085571289
8.262460708618164
7.255756378173828
7.6105055809021
7.416545867919922
8.055137634277344


In [30]:
#train_classified = pd.read_csv('train_classified-10.csv')

In [31]:
# calculate mfccs and parse waveform
# build relevant dataset for pytorch

# mel_specs = []
# wvs = []
# for idx in range(len(train_classified)):
#     ex = train_classified.iloc[idx]
#     waveform = ex.waveform
    
#     if isinstance(waveform, str): 
#         wv = ','.join(ex.waveform.replace('[ ', '[').split())
#         wv = np.array(ast.literal_eval(wv))
#         waveform = torch.from_numpy(wv).view(1, -1).to(dtype=torch.float32)

#         wvs.append(waveform)
    
#     sample_rate = int(ex.sample_rate)
    
#     mel_spec = transforms.MelSpectrogram(sample_rate=sample_rate)(waveform)
#     #mfcc = librosa.feature.mfcc(y=wv, sr=ex.sample_rate)
    
#     mel_specs.append(mel_spec)
    
# if 'mel_spec' in train_classified:
#     train_classified = train_classified.assign(mel_spec=mel_specs)
# else:
#     train_classified.insert(4, "mel_spec", mel_specs, True)
# if len(wvs) > 0:
#     train_classified = train_classified.assign(waveform=wvs)

In [32]:
#train_classified.head()

In [33]:
# resnet = vmodels.resnet50(pretrained=True).to(device)
# for idx in range(len(train_classified)):
#     ex = train_classified.iloc[idx]
    
#     mel_spec = ex.mel_spec.repeat(1, 3, 1, 1).to(device)
#     print (mel_spec.shape)
    
#     output = resnet(mel_spec)
    
#     print (output.shape)
#     break

In [34]:
# CUDA
# is_cuda = True and torch.cuda.is_available()
# device = torch.device('cuda') if is_cuda else torch.device('cpu')

# print ('cuda?: ', is_cuda)

# torch.cuda.empty_cache()

# res = vmodels.resnet50(pretrained=True).to(device)

# waveform, sample_rate = torchaudio.load('data/train\\00204008d.flac')
# waveform = waveform.view(1, 1, -1)
# #print (waveform)
# mel = transforms.MelSpectrogram(sample_rate=sample_rate)(waveform).repeat(1, 3, 1, 1)

# mel = mel.to(device)

# print (waveform.shape)
# print (mel.shape)

In [35]:
# with torch.no_grad():
#     output = res(mel)

In [7]:
# visualize

# for idx in range(len(train_classified)):
#     ex = train_classified.iloc[idx]
#     mfcc = ex.mfcc
#     fig, ax = plt.subplots()
#     img = librosa.display.specshow(mfcc, x_axis='time', ax=ax)
#     fig.colorbar(img, ax=ax)
#     ax.set(title='MFCC')