In [1]:
from data_loader import SpectLoader
import torch
from torch import nn, optim
import matplotlib.pyplot as plt
import numpy as np
import librosa
import torch.nn.functional as F
from torch.nn import ReLU
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os

In [2]:
# Both input files live under the same Kaggle dataset directory.
_DATASET_ROOT = '/kaggle/input/fma-small-mel-spectrograms'

# Spectrogram archive (HDF5) and the scaler file that accompanies it.
data_path = f'{_DATASET_ROOT}/fma_small_specs/spectrograms.h5'
spec_minmax_scaler_path = f'{_DATASET_ROOT}/scaler.pkl'

In [3]:
# File locations, keyed the way SpectLoader is given them.
paths = dict(
    data_path=data_path,
    scaler_path=spec_minmax_scaler_path,
)

# Build the loader, split the keys into train/val/test, then set up the
# scaling pipeline (load_model=True — presumably reuses the saved scaler; confirm in data_loader).
pprocessor = SpectLoader(paths, batch_size=32)
train_keys, val_keys, test_keys = pprocessor.split_data()
pprocessor.setup_pipeline(scaler_type="normalizer", load_model=True)

{'feature_range': (-1, 1), 'min_': -100.0, 'scale_': 212.16319274902344}
feature_range
min_
scale_


In [4]:
# Use the first training sample to discover the spectrogram dimensions.
first_train_key = pprocessor.train_keys[0]
sample1_shape = pprocessor.spect_data[first_train_key]['spectrogram'].shape

# Prepend a single channel axis to the two spectrogram dimensions.
input_shape = (1,) + (sample1_shape[0], sample1_shape[1])
print(input_shape)

(1, 256, 646)


In [5]:
class Reshape(nn.Module):
    """Layer that re-views its input with a fixed target shape.

    The target can be supplied as separate integers (``Reshape(-1, 4)``) or
    as one tuple (``Reshape((-1, 4))``); either form is forwarded unchanged
    to ``Tensor.view``.
    """

    def __init__(self, *shape):
        super().__init__()
        # Stored exactly as received: a flat tuple of ints, or a 1-tuple
        # wrapping the shape tuple.
        self.shape = shape

    def forward(self, x):
        """Return ``x`` viewed with the configured shape."""
        target = self.shape
        return x.view(*target)

class Trim(nn.Module):
    """Crop a 4-D tensor to a target height and width, keeping the top-left corner.

    The target is a 3-element shape whose entries at index 1 and 2 give the
    size to keep along tensor dimensions 2 and 3. Entry 0 is not used for
    cropping. Like ``Reshape``, the target may be passed either as a single
    sequence (``Trim((1, 256, 646))``) or as separate integers
    (``Trim(1, 256, 646)``).

    Raises:
        RuntimeError: from ``Tensor.narrow`` if the input is smaller than the
            target along dimension 2 or 3.
    """

    def __init__(self, *shape):
        super().__init__()
        self.shape = shape  # The target shape, stored exactly as received

    def _target(self):
        """Return the target as a flat sequence, whichever call form was used."""
        if len(self.shape) == 1 and isinstance(self.shape[0], (tuple, list)):
            # Called as Trim((c, h, w)): unwrap the single sequence argument.
            return self.shape[0]
        # Called as Trim(c, h, w).
        return self.shape

    def forward(self, x):
        """Return ``x`` narrowed to the first ``h`` rows and ``w`` columns."""
        target = self._target()
        x = x.narrow(2, 0, target[1])
        x = x.narrow(3, 0, target[2])
        return x

class EncoderConvBlock(nn.Module):
    """One encoder stage: a 3x3 convolution followed by an activation or a flatten.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the convolution.
        last_layer: when true the convolution output is flattened to
            ``(batch, features)``; otherwise it goes through ``LeakyReLU(0.01)``.
        stride: stride of the convolution (default 1).
    """

    def __init__(self, in_channels, out_channels, last_layer, stride = 1):
        super().__init__()
        convolution = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=2)
        # The final encoder stage hands a flat vector to the bottleneck, so it
        # ends in Flatten instead of a non-linearity.
        tail = nn.Flatten() if last_layer else nn.LeakyReLU(0.01)
        self.conv = nn.Sequential(convolution, tail)
        self.out_channels = out_channels

    def forward(self, x):
        """Apply the convolution and its trailing layer to ``x``."""
        return self.conv(x)

class DecoderConvBlock(nn.Module):
    """One decoder stage: a 3x3 transposed convolution, optionally followed by LeakyReLU.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels produced by the transposed convolution. Ignored
            when ``last_layer`` is true, because the final stage always emits
            a single channel.
        last_layer: when true the stage outputs one channel and applies no
            activation; otherwise it outputs ``out_channels`` channels and
            applies ``LeakyReLU(0.01)``.
        stride: stride of the transposed convolution (default 1).

    Attributes:
        out_channels: the number of channels this block actually produces
            (1 for the last layer, ``out_channels`` otherwise).
    """

    def __init__(self, in_channels, out_channels, last_layer, stride = 1):
        super().__init__()
        if last_layer:
            # The final stage is fixed to one output channel and stays linear.
            actual_out_channels = 1
            self.conv = nn.Sequential(
                            nn.ConvTranspose2d(in_channels, actual_out_channels, kernel_size = 3, stride = stride, padding = 1),
                            )
        else:
            actual_out_channels = out_channels
            self.conv = nn.Sequential(
                            nn.ConvTranspose2d(in_channels, actual_out_channels, kernel_size = 3, stride = stride, padding = 1),
                            nn.LeakyReLU(0.01))
        # Record the real output width, not the (possibly ignored) argument.
        self.out_channels = actual_out_channels

    def forward(self, x):
        """Apply the transposed convolution (and activation, if any) to ``x``."""
        out = self.conv(x)
        return out



In [17]:
class AutoEncoder(nn.Module):
    """Convolutional autoencoder with a 1024-unit linear bottleneck.

    The encoder is a stack of stride-2 ``EncoderConvBlock`` stages (the last
    one flattens its output) followed by ``Linear(latent_size, 1024)``. The
    decoder mirrors it: ``Linear(1024, latent_size)``, a reshape back to the
    encoder's final feature map, one stride-2 ``DecoderConvBlock`` per entry
    of ``channels`` (in reverse order), and a ``Trim`` that crops the result
    back to the input's height and width.

    Args:
        channels: output channel count of each encoder stage, in order. Must
            contain at least one entry.
        input_shape: ``(channels, height, width)`` of one input sample. The
            first encoder stage is built with one input channel and the last
            decoder stage emits one channel, so ``input_shape[0]`` should be 1.

    Raises:
        ValueError: if ``channels`` is empty.
    """

    def __init__(self, channels, input_shape):
        super().__init__()
        if len(channels) == 0:
            raise ValueError("channels must contain at least one entry")
        self.channels = channels
        self.input_shape = input_shape

        # encoder
        self.encoder = nn.ModuleList()
        for layer_id, out_channels in enumerate(self.channels):
            conv_layer = self._make_conv_layer(EncoderConvBlock, layer_id, out_channels=out_channels)
            self.encoder.append(conv_layer)

        # Probe the conv stack *before* the bottleneck is appended: both
        # helpers walk self.encoder and expect it to hold only conv stages.
        feature_shape = self._encoder_feature_shape()
        latent_size = self._calculate_flatten_size()

        # bottleneck
        bottleneck = nn.Linear(latent_size, 1024)
        self.encoder.append(bottleneck)

        self.decoder = nn.ModuleList()
        upscaler = nn.Sequential(
                        nn.Linear(1024, latent_size),
                        # Undo the encoder's Flatten using the probed
                        # (channels, height, width) instead of fixed numbers.
                        Reshape((-1, *feature_shape))
                        )
        self.decoder.append(upscaler)
        for layer_id, in_channels in enumerate(self.channels[::-1]):
            conv_layer = self._make_conv_layer(DecoderConvBlock, layer_id, in_channels=in_channels)
            self.decoder.append(conv_layer)
        # The transposed convolutions overshoot the input size; crop back.
        self.decoder.append(Trim(self.input_shape))

    def _make_conv_layer(self, block, layer_id, in_channels=0, out_channels=0, last_layer_bool=False, stride=2):
        """Build stage ``layer_id`` of the encoder or decoder.

        Args:
            block: ``EncoderConvBlock`` or ``DecoderConvBlock``.
            layer_id: position of the stage within its stack (0-based).
            in_channels: used as given for decoder stages; overwritten for
                encoder stages (1 for the first, otherwise the previous
                stage's output width).
            out_channels: used as given for encoder stages; for non-final
                decoder stages it is overwritten with the next entry of the
                reversed channel list.
            last_layer_bool: force the stage to be treated as the last one.
                The final position in the stack is always treated as last.
            stride: stride passed to the block.

        Returns:
            ``nn.Sequential`` wrapping the single constructed block.
        """
        is_last = last_layer_bool or layer_id == len(self.channels) - 1

        if block is EncoderConvBlock:
            # The first stage consumes the single-channel input; later stages
            # consume whatever the previous stage produced.
            in_channels = 1 if layer_id == 0 else self.channels[layer_id - 1]
        elif block is DecoderConvBlock and not is_last:
            # Mirror the encoder: emit what the next decoder stage expects.
            out_channels = self.channels[::-1][layer_id + 1]

        layers = [block(in_channels, out_channels, last_layer=is_last, stride=stride)]
        return nn.Sequential(*layers)

    def _encoder_feature_shape(self):
        """Return ``(channels, height, width)`` of the last conv output, before flattening.

        Only the ``nn.Conv2d`` modules are applied to a zero-filled dummy
        sample, since they are the only encoder layers that change the
        tensor's shape ahead of the final Flatten. Intended for use during
        construction, while the parameters are still on the CPU.
        """
        x = torch.zeros(1, *self.input_shape)
        with torch.no_grad():
            for module in self.encoder.modules():
                if isinstance(module, nn.Conv2d):
                    x = module(x)
        return tuple(x.shape[1:])

    def _calculate_flatten_size(self):
        """Return the element count produced by the current encoder for one sample.

        Runs a zero-filled dummy sample of ``input_shape`` through every layer
        currently in ``self.encoder``, so the result is the flattened conv
        size only while the bottleneck has not been appended yet.
        """
        x = torch.zeros(1, *self.input_shape)
        # Shape probing only; no autograd graph is needed.
        with torch.no_grad():
            for layer in self.encoder:
                x = layer(x)
        return x.numel()

    def forward(self, x):
        """Encode ``x`` to the 1024-d bottleneck and decode it back to the input size."""
        for layer in self.encoder:
            x = layer(x)
        for layer in self.decoder:
            x = layer(x)
        return x

In [7]:
# Pull only the first training batch and report the tensor shapes.
first_batch = next(iter(pprocessor.batch_generator(train_keys, batch_size=16)), None)
if first_batch is not None:
    x, y = first_batch
    print(x.shape, y.shape)

torch.Size([16, 1, 256, 646]) torch.Size([16])


In [18]:
# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Four encoder stages (32, 64, 64, 64 channels), moved to the chosen device.
model = AutoEncoder([32,64,64,64], input_shape)
model.to(device)

used_cuda_mb = torch.cuda.memory_allocated() / 1e6
print(f"1: Used CUDA memory: {used_cuda_mb} MB")

1: Used CUDA memory: 977.204736 MB


In [19]:
# Echo the module tree so the layer layout can be checked by eye.
model

AutoEncoder(
  (encoder): ModuleList(
    (0): Sequential(
      (0): EncoderConvBlock(
        (conv): Sequential(
          (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(2, 2))
          (1): LeakyReLU(negative_slope=0.01)
        )
      )
    )
    (1): Sequential(
      (0): EncoderConvBlock(
        (conv): Sequential(
          (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(2, 2))
          (1): LeakyReLU(negative_slope=0.01)
        )
      )
    )
    (2): Sequential(
      (0): EncoderConvBlock(
        (conv): Sequential(
          (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(2, 2))
          (1): LeakyReLU(negative_slope=0.01)
        )
      )
    )
    (3): Sequential(
      (0): EncoderConvBlock(
        (conv): Sequential(
          (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(2, 2))
          (1): Flatten(start_dim=1, end_dim=-1)
        )
      )
    )
    (4): Linear(in_features=49536, out_featur

In [26]:
# Sanity check: compare the value range of the inputs with the range of the
# model's reconstructions on the first few training batches.
N_PREVIEW_BATCHES = 10

# Inspection only: disable autograd so no graph is kept alive for each
# forward pass.
with torch.no_grad():
    for counter, (x, y) in enumerate(pprocessor.batch_generator(train_keys, batch_size=8), start=1):
        outputs = model(x)
        print(outputs.shape)
        i_min_val, i_max_val = x.min(), x.max()
        o_min_val, o_max_val = outputs.min(), outputs.max()

        print(f"Input Min value: {i_min_val.item()}, Max value: {i_max_val.item()}")
        print(f"Output Min value: {o_min_val.item()}, Max value: {o_max_val.item()}")
        if counter == N_PREVIEW_BATCHES:
            break


torch.Size([8, 1, 256, 646])
Input Min value: -0.07235373556613922, Max value: 0.8977598547935486
Output Min value: -0.3352450132369995, Max value: -0.21468940377235413
torch.Size([8, 1, 256, 646])
Input Min value: 0.06907837092876434, Max value: 0.9608559012413025
Output Min value: -0.3352465331554413, Max value: -0.21471579372882843
torch.Size([8, 1, 256, 646])
Input Min value: -0.1920861154794693, Max value: 0.8943788409233093
Output Min value: -0.3352445960044861, Max value: -0.21468240022659302
torch.Size([8, 1, 256, 646])
Input Min value: -0.12363727390766144, Max value: 0.9752476811408997
Output Min value: -0.33527040481567383, Max value: -0.2146971970796585
torch.Size([8, 1, 256, 646])
Input Min value: -0.18928714096546173, Max value: 0.9290701746940613
Output Min value: -0.335260808467865, Max value: -0.21468393504619598
torch.Size([8, 1, 256, 646])
Input Min value: -0.08061899244785309, Max value: 0.9168463349342346
Output Min value: -0.33524343371391296, Max value: -0.214701

In [27]:
batch_size = 16
num_epochs = 1  # one full pass over train_keys, as before; raise to train longer

# Reconstruction objective: the target of each batch is the batch itself.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.7, patience=1)

model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    n_batches = 0

    for x, y in pprocessor.batch_generator(train_keys, batch_size=batch_size):
        optimizer.zero_grad()

        outputs = model(x)
        loss = criterion(outputs, x)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        n_batches += 1
        print(batch_loss)

    if n_batches:
        epoch_loss = running_loss / n_batches
        # ReduceLROnPlateau only acts when it is stepped with a metric.
        # NOTE(review): this uses the mean training loss; a loss computed on
        # val_keys would be the more usual signal.
        scheduler.step(epoch_loss)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"epoch {epoch + 1}/{num_epochs}: mean loss {epoch_loss:.6f}, lr {current_lr:.2e}")

0.1634710133075714
0.19614450633525848
0.17831425368785858
0.13848836719989777
0.14491181075572968
0.1460307240486145
0.13479246199131012
0.1204211637377739
0.16324685513973236
0.16541483998298645
0.12994372844696045
0.16480106115341187
0.15121157467365265
0.15274761617183685
0.16316798329353333
0.13580799102783203
0.15387894213199615
0.12811382114887238
0.12070447206497192
0.11751474440097809
0.11143780499696732
0.12368609756231308
0.0899212434887886
0.09532462060451508
0.11038666218519211
0.08583490550518036
0.07383821159601212
0.07402022927999496
0.06818632036447525
0.07654598355293274
0.08013156056404114
0.06168756261467934
0.04997165873646736
0.05843258649110794
0.0505002960562706
0.044854868203401566
0.0414966382086277
0.04179038479924202
0.035429153591394424
0.03388303145766258
0.04706977680325508
0.028415484353899956
0.02351764589548111
0.023702073842287064
0.02468608133494854
0.02200903929769993
0.021692760288715363
0.021860728040337563
0.019796818494796753
0.01747768931090831

KeyboardInterrupt: 