In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm

from CustomDataset.Data import (
    DSD100,
    data_split,
    dataloader
)
from model.Network import UNet
from model.train_model import train

In [2]:
torch.manual_seed(100)

<torch._C.Generator at 0x2472519fc70>

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
device

device(type='cuda')

In [4]:
dataset = DSD100('../DSD100spectrogram')

In [5]:
for mixture, bass, drum, vocal, instrumental in dataset:
    print(mixture.shape)
    print(bass.shape)
    print(drum.shape)
    print(vocal.shape)
    print(instrumental.shape)
    break

torch.Size([1, 511, 127])
torch.Size([1, 511, 127])
torch.Size([1, 511, 127])
torch.Size([1, 511, 127])
torch.Size([1, 511, 127])


In [6]:
len(dataset)

10284

In [7]:
train_dataset, val_dataset = data_split(dataset, 0.7)

In [8]:
len(train_dataset)

7199

In [9]:
len(val_dataset)

3085

In [10]:
train_dataloader = dataloader(train_dataset, 16, shuffle = True)
val_dataloader = dataloader(val_dataset, 16, shuffle = False)

In [11]:
for mixture, bass, drum, vocal, instrumental in train_dataloader:
    print(mixture.shape)
    print(bass.shape)
    print(drum.shape)
    print(vocal.shape)
    print(instrumental.shape)
    break

torch.Size([16, 1, 511, 127])
torch.Size([16, 1, 511, 127])
torch.Size([16, 1, 511, 127])
torch.Size([16, 1, 511, 127])
torch.Size([16, 1, 511, 127])


UNet model

In [12]:
# model = UNet().to(device)
model = UNet()

In [13]:
summary(model.to(device), input_size = (1, 511, 127), batch_size = 16)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [16, 16, 255, 63]             416
       BatchNorm2d-2          [16, 16, 255, 63]              32
         LeakyReLU-3          [16, 16, 255, 63]               0
           encoder-4          [16, 16, 255, 63]               0
            Conv2d-5          [16, 32, 127, 31]          12,832
       BatchNorm2d-6          [16, 32, 127, 31]              64
         LeakyReLU-7          [16, 32, 127, 31]               0
           encoder-8          [16, 32, 127, 31]               0
            Conv2d-9           [16, 64, 63, 15]          51,264
      BatchNorm2d-10           [16, 64, 63, 15]             128
        LeakyReLU-11           [16, 64, 63, 15]               0
          encoder-12           [16, 64, 63, 15]               0
           Conv2d-13           [16, 128, 31, 7]         204,928
      BatchNorm2d-14           [16, 128

Training

Optimizer

In [14]:
from torch.optim import Adam

In [15]:
optimizer = Adam(model.parameters())

Loss function
- use L1 loss(MAE) as loss function
- use euclidean distance as evaluation metrices 

In [16]:
loss_fn = nn.L1Loss()

In [17]:
def euclidean_distace(true_value, predicted_value):
    return torch.sqrt(torch.sum(torch.square(true_value-predicted_value)))

In [18]:
model.train()
training_loss = []
validation_loss = []
training_distance = []
validation_distance = []
epoch = 2
for i in range(epoch):
    train_loss = []
    val_loss = []
    train_distance = []
    val_distance = []
    train_loop = tqdm(train_dataloader, leave = True)
    for mixture, _, _, vocal, _ in train_loop:
        train_loop.set_description(f"Epoch {i}")
        optimizer.zero_grad()
        y = model(mixture)
        loss = loss_fn(vocal, y)
        loss.backward()
        optimizer.step()

        # pred = torch.mul(mixture, y)
        distance = euclidean_distace(vocal, y)
        train_loss.append(loss.item())
        train_distance.append(distance.item())

        train_loop.set_postfix(
            train_loss = sum(train_loss)/len(train_loss),
            train_euclidean_distance = sum(train_distance)/len(train_distance)
        )
    
    val_loop = tqdm(val_dataloader, leave = True)
    with torch.no_grad():
        for mixture, _, _, vocal, _ in val_loop:
            y = model(mixture)
            loss = loss_fn(vocal, y)

            # pred = torch.mul(mixture, y)
            distance = euclidean_distace(vocal, y)
            val_loss.append(loss.item())
            val_distance.append(distance.item())

            val_loop.set_postfix(
                train_loss = sum(train_loss)/len(train_loss),
                train_euclidean_distance = sum(train_distance)/len(train_distance),
                val_loss = sum(val_loss)/len(val_loss),
                val_euclidean_distance = sum(val_distance)/len(val_distance)
            )
    
    training_loss.append(sum(train_loss)/len(train_loss))
    training_distance.append(sum(train_distance)/len(train_distance))
    validation_loss.append(sum(val_loss)/len(val_loss))
    validation_distance.append(sum(val_distance)/len(val_distance))

Epoch 0: 100%|██████████| 450/450 [22:52<00:00,  3.05s/it, train_euclidean_distance=29, train_loss=0.397]  
100%|██████████| 193/193 [08:54<00:00,  2.77s/it, train_euclidean_distance=29, train_loss=0.397, val_euclidean_distance=28.6, val_loss=0.337]
Epoch 1: 100%|██████████| 450/450 [13:27<00:00,  1.80s/it, train_euclidean_distance=29.2, train_loss=0.29] 
100%|██████████| 193/193 [04:40<00:00,  1.45s/it, train_euclidean_distance=29.2, train_loss=0.29, val_euclidean_distance=28.9, val_loss=0.247]


In [19]:
torch.save(model,'./model/save_model/vocal.pt')

Train using py file

In [14]:
history = train(model, train_dataloader, val_dataloader, "./model/save_model/", stem_name = "vocal", epoch = 1)

Epoch 2: 100%|██████████| 16/16 [00:40<00:00,  2.56s/it, train_euclidean_distance=15.5, train_loss=0.45] 
100%|██████████| 7/7 [00:11<00:00,  1.65s/it, train_euclidean_distance=15.5, train_loss=0.45, val_euclidean_distance=30.4, val_loss=0.447]


In [15]:
history

([0.45759459026157856, 0.45010572113096714],
 [28.555444836616516, 15.460571944713593],
 [0.45242754902158466, 0.4471405787127359],
 [30.338063648768834, 30.36661638532366])