In [6]:
import json
import os
import numpy as np
import pandas as pd
import pickle
import sys
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.utils.rnn as rnn_utils

device = torch.device("cuda:4" if torch.cuda.is_available() else "cpu")
print("use",device,"now!")

use cuda:4 now!


In [2]:
class Myrnn(nn.Module):
    def __init__(self, input_dim, hidden_size= 40): #input_dim = 23
        super(Myrnn, self).__init__()
        self.hidden_size = hidden_size

        self.Linear1 = nn.Linear(input_dim, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers= 1, bidirectional= True)
        self.Linear2 = nn.Linear(hidden_size*2, 2)
        self.Linear3 = nn.Linear(hidden_size*2, 1)
        
        ##attention layer
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(inplace=True))
    
    def attention_net(self, lstm_output):
        lstm_tmp_output = torch.chunk(lstm_output, 2, -1)
        hidden = lstm_tmp_output[0]+lstm_tmp_output[1] # hidden : [batch_size, n_step , n_hidden]
        attn_weights = self.attention(hidden) # attn_weights : [batch_size, n_step, n_hidden]
        m = nn.Tanh()(hidden) # m : [batch_size, n_step, n_hidden]
        attn_content = torch.bmm(m, attn_weights.transpose(1, 2)) # attn_content : [batch_size, n_step, n_step]
        soft_attn_weights = F.softmax(attn_content, dim=-1) # soft_attn_weights : [batch_size, n_step, n_step]
        context = torch.bmm(hidden.transpose(1, 2), soft_attn_weights) # [batch_size, n_hidden, n_step]
        context_with_attn = hidden.transpose(1, 2)+context
        result = torch.sum(context_with_attn, dim=-1) # result : [batch_size, n_hidden]
        return result 

    def forward(self, input_data):
        out = F.relu(self.Linear1(input_data))
        out, (hn, cn) = self.lstm(out, None) # out:[n_step, batch_size, n_hidden * num_directions(=2)]
        out = out.permute(1, 0, 2) # out:[batch_size, n_step, n_hidden * num_directions(=2)]
        out = self.attention_net(out)
        #out1 is for onset & offset
        out1 = torch.sigmoid(self.Linear2(out))
        #out2 is for pitch
        out2 = self.Linear3(out)
        return out1, out2

# input shape該有的樣子(沒有batch_first): seq_len, batch_size, vector_len
# inputs.shape: torch.Size([150, 14, 1])

class MyData(Data.Dataset):
    def __init__(self, xs, ys):
        self.xs = xs
        self.ys = ys

    def __len__(self): #500
        return len(self.xs)

    def __getitem__(self, idx):
        return self.xs[idx],self.ys[idx]

def collate_fn(samples):
    batch = {}
    print ('collate_fn裡面的0',samples[0]['data'].shape) #8698*23
    print('type of samples',type(samples)) 
    print ('collate_fn裡面的1',samples[1]['data'].shape)
    print ('collate_fn裡面的1',samples[2]['data'].shape) #看batch_size有幾個，smaple這個list就會有幾個
    temp= [torch.from_numpy(np.array(sample['data'], dtype= np.float32)) for sample in samples]
    padded_data = rnn_utils.pad_sequence(temp, batch_first=True, padding_value= 0)
    batch['data']= padded_data
    batch['label']= [np.array(sample['label'], dtype= np.float32) for sample in samples]

    return batch

def post_processing(output1, pitch):
    pitch= pitch.squeeze(1).squeeze(1).cpu().detach().numpy()
    print (pitch.shape)
    print (torch.mean(output1))
    threshold= 0.1
    notes= []
    this_onset= None
    this_offset= None
    this_pitch= None

    for i in range(len(output1)):
        if output1[i][0][0] > threshold and this_onset == None:
            this_onset= i
        elif output1[i][0][1] > threshold and this_onset != None and this_onset+ 1 < i and this_offset == None:
            this_offset= i
            this_pitch= int(round(np.mean(pitch[this_onset:this_offset+ 1])))
            notes.append([this_onset* 0.032+ 0.016, this_offset* 0.032+ 0.016, this_pitch])
            this_onset= None
            this_offset= None
            this_pitch= None

    print (np.array(notes))
    return notes

def testing(net, sample, device):
    net.eval()
    data = sample['data']
    data= torch.Tensor(data)

    target= sample['label']
    target= torch.Tensor(target)

    data= data.unsqueeze(1)
    target= target.unsqueeze(1)

    print (data.shape)
    print (target.shape)

    data_length= list(data.shape)[0]

    data = data.to(device, dtype=torch.float)
    target = target.to(device, dtype=torch.float)

    output1, output2 = net(data)
    print (output1.shape)
    print (output2.shape)
    #answer= post_processing(output1, output2)
    return answer


def do_training(net, loader, optimizer, device):

    num_epoch = 50
    criterion_onset= nn.BCELoss()
    criterion_pitch= nn.L1Loss()
    training_loss = []
    total_length = 0

    for epoch in range(num_epoch):
        net.train()
        total_length= 0.0
        print ("epoch %d start time: %f" %(epoch, time.time()))
        train_loss= 0.0

        for batch_idx, (data,target) in enumerate(loader):
#x             torch.Size([batchsize, 14, 23])
#y             torch.Size([batchsize, 3])            
            data = data.float()
            data= data.permute(1,0,2)
            
#             inputs = Variable(inputs).requires_grad_().to(device)
            
            target = target.float().view(-1, 3)

#             print(target.shape) (3,3) -> (1,batchsize,dim)

#             permute前: torch.Size([1, 7156, 23])
#             permute後: torch.Size([7156, 1, 23])                        
            data = data.to(device, dtype=torch.float)
            target = target.to(device, dtype=torch.float)
            
#             print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
#             print('data.shape:',data.shape)
#             print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
#             print('target.shape:',target.shape)
#             print('XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
            
            optimizer.zero_grad()
            output1, output2 = net(data)
            #print (output1)
            #print (output2)

            #print (output1.shape)
            #print (output2.shape)

            #total_loss= criterion_onset(output1, torch.narrow(target, dim= 2, start= 0, length= 2))
            #total_loss = criterion_pitch(output2, torch.narrow(target, dim= 2, start= 2, length= 1))
            total_loss = criterion_pitch(output2, target[:,2].unsqueeze(1))
            train_loss = train_loss+ total_loss.item()
            total_length = total_length+data.shape[1]
            total_loss.backward()
            optimizer.step()

#             if batch_idx % 50 == 0:
#                 print ("epoch %d, sample %d, loss %.6f" %(epoch, batch_idx, total_loss))
#                 #print ("current time: %f" %(time.time()))
#                 sys.stdin.flush()
            
            
        training_loss.append(train_loss/ total_length)
        print('epoch %d, avg loss: %.6f' %(epoch, train_loss/ total_length))
        
        if epoch==49:
            model_path= f'att_ST_{epoch}.pt'
            torch.save(net.state_dict(), model_path)  

    return net

def preprocess(data_seq, label):
    new_label= []
    for i in range(len(label)):
        label_of_one_song= []
        cur_note= 0
        cur_note_onset= label[i][cur_note][0]
        cur_note_offset= label[i][cur_note][1]
        cur_note_pitch= label[i][cur_note][2]

        for j in range(len(data_seq[i])):
            cur_time= j* 0.032+ 0.016
        
            if abs(cur_time - cur_note_onset) < 0.017:
                label_of_one_song.append(np.array([1, 0, cur_note_pitch]))

            elif cur_time < cur_note_onset or cur_note >= len(label[i]):
                label_of_one_song.append(np.array([0, 0, 0.0]))

            elif abs(cur_time - cur_note_offset) < 0.017:
                label_of_one_song.append(np.array([0, 1, cur_note_pitch]))
                cur_note= cur_note+ 1
                if cur_note < len(label[i]):
                    cur_note_onset= label[i][cur_note][0]
                    cur_note_offset= label[i][cur_note][1]
                    cur_note_pitch= label[i][cur_note][2]
            else:
                label_of_one_song.append(np.array([0, 0, cur_note_pitch]))

        new_label.append(label_of_one_song)

    return new_label

In [3]:
# # --------------------------------------------------跑過一次後就可全部註解掉
# THE_FOLDER = "./MIR-ST500"

# data_seq= []
# label= []
# index = []
# for the_dir in os.listdir(THE_FOLDER):
#     index.append(int(the_dir))
#     if not os.path.isdir(THE_FOLDER + "/" + the_dir):
#         continue

#     json_path = THE_FOLDER + "/" + the_dir+ f"/{the_dir}_feature.json"
#     gt_path= THE_FOLDER+ "/" +the_dir+ "/"+ the_dir+ "_groundtruth.txt"

#     youtube_link_path= THE_FOLDER+ "/" + the_dir+ "/"+ the_dir+ "_link.txt"

#     with open(json_path, 'r') as json_file:
#         temp = json.loads(json_file.read())

#     gtdata = np.loadtxt(gt_path)

#     data= []
#     for key, value in temp.items():
#         data.append(value)

#     data= np.array(data).T #7796*23

#     data_seq.append(data)
#     label.append(gtdata)

# label= preprocess(data_seq, label)

In [4]:
THE_FOLDER = "./MIR-ST500"
data_seq= []
label= []

for i in np.arange(1, 501):
    json_path = THE_FOLDER + "/" + str(i) + '/' + str(i) + '_feature.json'
    gt_path = THE_FOLDER + "/" + str(i) + '/' + str(i) + '_groundtruth.txt'
    
    with open(json_path, 'r') as json_file:
        temp = json.loads(json_file.read())

    gtdata = np.loadtxt(gt_path)

    data= []
    for key, value in temp.items():
        data.append(value)

    data= np.array(data).T #7796*23

    data_seq.append(data)
    label.append(gtdata)
    
label= preprocess(data_seq, label)

In [5]:
#---------------------------------開始切割成我們要的樣子---------------------------------

##### 可調參數 L:時間長度，num_of_songs是怕data太多，500首歌
L = 14
num_of_songs = 450
####

xs = []
ys = []
for j in range(num_of_songs):
    nowx = data_seq[j]
    nowy = np.array(label[j])
    for i in range(0,(len(nowx)-L)):
        x = nowx[i:(i+L)]
        xs.append(x)
        #print(x) #總共L個
        y = nowy[i+L]
        ys.append(y)

xs = np.stack(xs)
xs = torch.from_numpy(xs)
ys = np.array(ys) #(38464, 3)
ys = torch.from_numpy(ys.astype(np.int32))        

train_data = MyData(xs, ys)

In [None]:
input_dim= 23
hidden_size= 50
BATCH_SIZE= 128
loader = Data.DataLoader(dataset=train_data, batch_size= BATCH_SIZE, shuffle=True)

model = Myrnn(input_dim, hidden_size)
optimizer = optim.Adam(model.parameters(), lr= 0.001)

print("use",device,"now!")

model.to(device)
model= do_training(model, loader, optimizer, device)

#for testing

#model.load_state_dict(torch.load("ST_5.pt"))
#testing(model, train_data[0], device)


use cuda:4 now!
epoch 0 start time: 1592301461.032362
epoch 0, avg loss: 0.056221
epoch 1 start time: 1592301681.202406
epoch 1, avg loss: 0.053099
epoch 2 start time: 1592301920.197719
epoch 2, avg loss: 0.052599
epoch 3 start time: 1592302146.800921
epoch 3, avg loss: 0.052293
epoch 4 start time: 1592302372.944496
epoch 4, avg loss: 0.051906
epoch 5 start time: 1592302597.608727
epoch 5, avg loss: 0.051930
epoch 6 start time: 1592302827.438080
epoch 6, avg loss: 0.051615
epoch 7 start time: 1592303058.259145
epoch 7, avg loss: 0.051572
epoch 8 start time: 1592303283.983417
epoch 8, avg loss: 0.051437
epoch 9 start time: 1592303510.956627
epoch 9, avg loss: 0.051641
epoch 10 start time: 1592303736.696519
epoch 10, avg loss: 0.051430
epoch 11 start time: 1592303960.773017
epoch 11, avg loss: 0.051226
epoch 12 start time: 1592304182.805492
epoch 12, avg loss: 0.051217
epoch 13 start time: 1592304407.244892
epoch 13, avg loss: 0.051318
epoch 14 start time: 1592304635.768561
epoch 14, avg

In [6]:
#------------testing-------------
input_dim= 23
hidden_size= 50
BATCH_SIZE= 128
L = 14
num_of_songs = 50

model = Myrnn(input_dim, hidden_size)
model.to(device)
model.load_state_dict(torch.load("att_ST_49.pt")) #
####

for j in np.arange(num_of_songs):
    xs = []
    ys = []
    nowx = data_seq[450+j]
    nowy = np.array(label[450+j])
    for i in range(0,(len(nowx)-L)):
        x = nowx[i:(i+L)]
        xs.append(x)
        #print(x) #總共L個
        y = nowy[i+L]
        ys.append(y)

    xs = np.stack(xs)
    xs = torch.from_numpy(xs)
    ys = np.array(ys) #[14::,3]
    ys = torch.from_numpy(ys.astype(np.int32))   
    model.eval()
    xs = xs.float()
    xs= xs.permute(1,0,2)
    ys = ys.float().view(-1, 3)
    xs = xs.to(device, dtype=torch.float)
    ys = ys.to(device, dtype=torch.float)
    
    _, output2 = model(xs)
    pred = output2.cpu().detach().numpy()
    true = ys[:,2].unsqueeze(1).cpu().detach().numpy()
    data_pred = pd.DataFrame(np.hstack((pred, true)))
    data_pred.columns = ['pred', 'true']
    data_pred.to_csv('pitch_pred_attn/song'+str(450+j+1)+'.csv', index=None)

In [16]:
true = 0
data1 = pd.read_csv('pitch_pred_attn/song'+str(450+1)+'.csv')
for j in range(49):
    data2 = pd.read_csv('pitch_pred_attn/song'+str(451+j+1)+'.csv')
    data1 = pd.concat([data1, data2], axis=0)
data1.shape   

(390778, 2)

In [17]:
true = 0
data1 = np.array(data1)
for i in range(data1.shape[0]):
    if abs(data1[i,0]-data1[i,1])<=0.5:
        true += 1
print(true/data1.shape[0])

0.7060632891309132
