
# Train GRU Model from WavLM features and discrete labels

### For GTzan dataset

##### https://github.com/microsoft/unilm/tree/master/wavlm
##### https://github.com/audeering/w2v2-how-to/blob/main/notebook.ipynb

In [1]:
import pandas as pd
import numpy as np
import os

### Process Feature Files

In [2]:
!pwd

/home/etsmtl/akoerich/DEV/Music


In [3]:
# Path to feature files
path_train = 'features'
extension = 'wavlmbase4layerfeat'

train_files = [file for file in os.listdir(path_train) if file.endswith(extension)]

sorted_train_files = sorted(train_files)
sorted_train_files

['blues.00000.wavlmbase4layerfeat',
 'blues.00001.wavlmbase4layerfeat',
 'blues.00002.wavlmbase4layerfeat',
 'blues.00003.wavlmbase4layerfeat',
 'blues.00004.wavlmbase4layerfeat',
 'blues.00005.wavlmbase4layerfeat',
 'blues.00006.wavlmbase4layerfeat',
 'blues.00007.wavlmbase4layerfeat',
 'blues.00008.wavlmbase4layerfeat',
 'blues.00009.wavlmbase4layerfeat',
 'blues.00010.wavlmbase4layerfeat',
 'blues.00011.wavlmbase4layerfeat',
 'blues.00012.wavlmbase4layerfeat',
 'blues.00013.wavlmbase4layerfeat',
 'blues.00014.wavlmbase4layerfeat',
 'blues.00015.wavlmbase4layerfeat',
 'blues.00016.wavlmbase4layerfeat',
 'blues.00017.wavlmbase4layerfeat',
 'blues.00018.wavlmbase4layerfeat',
 'blues.00019.wavlmbase4layerfeat',
 'blues.00020.wavlmbase4layerfeat',
 'blues.00021.wavlmbase4layerfeat',
 'blues.00022.wavlmbase4layerfeat',
 'blues.00023.wavlmbase4layerfeat',
 'blues.00024.wavlmbase4layerfeat',
 'blues.00025.wavlmbase4layerfeat',
 'blues.00026.wavlmbase4layerfeat',
 'blues.00027.wavlmbase4laye

### All feature vectors into a single dataframe

In [4]:
dfs = []
for file in sorted_train_files:
    df = pd.read_csv(os.path.join(path_train, file))
    dfs.append(df)

df_train_feat = pd.concat(dfs, ignore_index=True)

# Drop first index column (unamed 0)
df_train_feat.drop(df_train_feat.columns[[0]], axis=1, inplace=True)

df_train_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.204498,-0.093031,-0.039956,-0.562223,0.138806,0.096212,0.176993,-0.055592,-0.219515,-0.034371,...,0.032864,0.273840,0.043254,-0.051994,-0.009695,0.203010,-0.145996,0.044121,0.116998,-0.454236
1,0.009433,-0.111695,0.138206,-0.308871,0.143759,0.035443,0.315600,-0.221085,-0.141687,-0.155910,...,0.049276,0.225967,0.035753,-0.500972,-0.127206,0.349124,-0.081059,-0.021214,0.157881,-0.250526
2,-0.212292,-0.142811,0.119149,-0.172413,0.323740,-0.006175,-0.136769,-0.127913,-0.247721,0.038505,...,0.243754,0.186927,-0.310384,-0.350090,-0.029887,-0.295870,-0.069211,0.012515,0.282609,-0.539947
3,-0.217505,-0.207070,0.157117,-0.136493,0.099840,0.300491,-0.109374,-0.290099,0.096063,-0.049180,...,0.107468,0.170206,0.228354,-0.841270,-0.096904,0.145656,0.060646,-0.117495,0.356633,-0.684860
4,0.067425,-0.095689,0.088243,0.198330,0.169520,0.297094,-0.380811,-0.165345,0.327136,0.081083,...,0.011918,-0.207443,-0.404519,-0.349134,0.058498,0.270632,-0.039725,-0.023145,0.088038,-0.565479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500430,-0.186611,-0.040473,-0.010707,-0.056597,-0.069689,-0.261930,-0.232433,-0.146833,-0.144910,0.074270,...,-0.045094,0.128217,0.129677,1.018336,-0.058187,0.269444,-0.158515,0.079553,-0.228222,1.265655
1500431,-0.255262,-0.047176,-0.005718,-0.002291,-0.046345,-0.060902,-0.154236,-0.163157,-0.093274,-0.034746,...,0.053642,0.113619,0.097742,0.858851,-0.024167,0.147876,-0.091196,-0.156619,-0.238391,0.865757
1500432,-0.177335,-0.050510,0.046610,-0.002687,0.054164,-0.095137,-0.166793,-0.071056,0.063231,-0.136739,...,0.092437,0.108632,0.052940,0.289008,0.029921,0.091171,0.023713,-0.138581,-0.210309,1.180158
1500433,-0.254128,-0.052435,-0.123181,-0.125148,-0.051198,-0.381556,-0.161859,-0.092975,-0.077006,-0.060956,...,0.022917,-0.114606,-0.354498,0.549883,-0.089569,0.157596,-0.081541,0.091145,-0.177990,1.068273


### Process label files - Train and Devel

In [5]:
!pwd

/home/etsmtl/akoerich/DEV/Music


In [6]:
import os
import pandas as pd

# Path to the GTzan dataset
gtzan_path = "GTzan_16k_Wav"
gtzan_path_features = "features"

# Dictionary to map genre names to numeric labels
genre_label_map = {
    "blues": 0,
    "classical": 1,
    "country": 2,
    "disco": 3,
    "hiphop": 4,
    "jazz": 5,
    "metal": 6,
    "pop": 7,
    "reggae": 8,
    "rock": 9
}

# Initialize an empty list to store data
data_list = []

# Iterate through the directories
for genre in sorted( os.listdir(gtzan_path) ):
    genre_path = os.path.join(gtzan_path, genre)
    if os.path.isdir(genre_path) :
        for filename in sorted( os.listdir(genre_path) ):
            #print(filename)
            # Extract genre, track number, and extension from filename
            genre_track, extension = os.path.splitext(filename)
            # Split the genre_track into genre and track number
            genre, track_number, au = genre_track.split('.')
            # Ensure it's a wav file
            if extension == ".wav":
                # Count number of lines in the corresponding feature file
                feature_filename = f"{genre}.{track_number}.wavlmbase4layerfeat"
                # feature_filepath = os.path.join(genre_path, feature_filename)
                feature_filepath = os.path.join(gtzan_path_features, feature_filename)
                if os.path.exists(feature_filepath):
                    with open(feature_filepath, 'r') as f:
                        num_lines = sum(1 for line in f)
                        num_lines = num_lines - 1
                    # Append a dictionary to the list replicated by the number of lines
                    for _ in range(num_lines):
                        data_list.append({
                            "genre": genre,
                            "track_number": int(track_number),
                            "label": genre_label_map[genre]
                        })

# Create a DataFrame from the list of dictionaries
df_train_lab = pd.DataFrame(data_list)

# Print first few rows of the DataFrame
print(df_train_lab.head())

# Optionally, save the DataFrame to a CSV file
df_train_lab.to_csv("gtzan_dataset_inflated_wavlmbase4layerfeat.csv", index=False)


   genre  track_number  label
0  blues             0      0
1  blues             0      0
2  blues             0      0
3  blues             0      0
4  blues             0      0


In [7]:
df_train_feat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.204498,-0.093031,-0.039956,-0.562223,0.138806,0.096212,0.176993,-0.055592,-0.219515,-0.034371,...,0.032864,0.273840,0.043254,-0.051994,-0.009695,0.203010,-0.145996,0.044121,0.116998,-0.454236
1,0.009433,-0.111695,0.138206,-0.308871,0.143759,0.035443,0.315600,-0.221085,-0.141687,-0.155910,...,0.049276,0.225967,0.035753,-0.500972,-0.127206,0.349124,-0.081059,-0.021214,0.157881,-0.250526
2,-0.212292,-0.142811,0.119149,-0.172413,0.323740,-0.006175,-0.136769,-0.127913,-0.247721,0.038505,...,0.243754,0.186927,-0.310384,-0.350090,-0.029887,-0.295870,-0.069211,0.012515,0.282609,-0.539947
3,-0.217505,-0.207070,0.157117,-0.136493,0.099840,0.300491,-0.109374,-0.290099,0.096063,-0.049180,...,0.107468,0.170206,0.228354,-0.841270,-0.096904,0.145656,0.060646,-0.117495,0.356633,-0.684860
4,0.067425,-0.095689,0.088243,0.198330,0.169520,0.297094,-0.380811,-0.165345,0.327136,0.081083,...,0.011918,-0.207443,-0.404519,-0.349134,0.058498,0.270632,-0.039725,-0.023145,0.088038,-0.565479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1500430,-0.186611,-0.040473,-0.010707,-0.056597,-0.069689,-0.261930,-0.232433,-0.146833,-0.144910,0.074270,...,-0.045094,0.128217,0.129677,1.018336,-0.058187,0.269444,-0.158515,0.079553,-0.228222,1.265655
1500431,-0.255262,-0.047176,-0.005718,-0.002291,-0.046345,-0.060902,-0.154236,-0.163157,-0.093274,-0.034746,...,0.053642,0.113619,0.097742,0.858851,-0.024167,0.147876,-0.091196,-0.156619,-0.238391,0.865757
1500432,-0.177335,-0.050510,0.046610,-0.002687,0.054164,-0.095137,-0.166793,-0.071056,0.063231,-0.136739,...,0.092437,0.108632,0.052940,0.289008,0.029921,0.091171,0.023713,-0.138581,-0.210309,1.180158
1500433,-0.254128,-0.052435,-0.123181,-0.125148,-0.051198,-0.381556,-0.161859,-0.092975,-0.077006,-0.060956,...,0.022917,-0.114606,-0.354498,0.549883,-0.089569,0.157596,-0.081541,0.091145,-0.177990,1.068273


## Train a GRU regression model for arousal / valence

In [8]:
import torch

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(torch.cuda.get_device_name(0))

Using device: cuda:1
NVIDIA A100-SXM4-40GB


In [9]:
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [10]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from torch.optim.lr_scheduler import (
    StepLR, ReduceLROnPlateau, MultiStepLR, ExponentialLR, CosineAnnealingLR
)

features = df_train_feat.values.astype(np.float32)
labels   = df_train_lab['label'].values.astype(np.float32)

# Normalize the features between -1 and 1 (adjust scaling based on your data)
# features = (features - np.min(features)) / (np.max(features) - np.min(features)) * 2 - 1

# Convert data to PyTorch tensors
features_tensor = torch.from_numpy(features)
labels_tensor   = torch.from_numpy(labels)

# Assuming you want a sequence length of 1
# features_tensor = features_tensor.unsqueeze(1)

######
# Reshape features tensor with sequence length of 50
sequence_length = 1
num_features    = features.shape[1]
num_samples     = features.shape[0]

# Calculate the number of sequences that can be formed
num_sequences = num_samples // sequence_length

# Truncate the tensor to fit the full sequences
features_tensor = features_tensor[:num_sequences * sequence_length, :]
labels_tensor = labels_tensor[:num_sequences * sequence_length]

# Reshape the tensor
features_tensor = features_tensor.view(num_sequences, sequence_length, num_features)

######

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_tensor, labels_tensor, test_size=0.2, random_state=42)

# Initialize the model, loss function, and optimizer
input_size   = num_features
hidden_size  = 64 #128, 64, 32, 16
num_layers   = 4
#output_size  = 1  # Single output for regression between -1 and +1
num_classes  = 10 # GTzan
dropout_prob = 0.20 
# ======================
# Define the GRU model
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_prob):
        super(GRUModel, self).__init__()
        self.gru = nn.GRU(input_size, hidden_size, dropout_prob, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        gru_out, _ = self.gru(x)
        output = self.fc(gru_out[:, -1, :])  # Take the output from the last time step
        return output

# model = GRUModel(input_size, hidden_size, output_size, dropout_prob)


# =======================
class GRUModelClassification(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(GRUModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # GRU layer
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device) 
        
        # Forward propagate GRU
        out, _ = self.gru(x, h0)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        
        # Apply softmax activation
        out = F.softmax(out, dim=1)
        return out
        
# =======================
# Define the Convolutional GRU model
class ConvGRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout_prob):
        super(ConvGRUModel, self).__init__()
        
        # GRU layer
        self.convgru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout_prob, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):

        # Initialize hidden state with zeros
        # h0 = torch.zeros(num_layers, x.size(0), hidden_size).to(x.device) 

        # Forward propagate GRU
        #gru_out, _ = self.convgru(x, h0)
        gru_out, _ = self.convgru(x)

        
        # Decode the hidden state of the last time step
        output = self.fc(gru_out[:, -1, :])  # Take the output from the last time step

        # Apply softmax activation
        output = F.softmax(output, dim=1)
        
        return output

#model = ConvGRUModel(input_size, hidden_size, num_layers, output_size, dropout_prob)
model = ConvGRUModel(input_size, hidden_size, num_layers, num_classes, dropout_prob)

#=============================
# Define the Convolutional GRU model with Tanh activation at the output
class ConvGRUModelTanh(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob):
        super(ConvGRUModelTanh, self).__init__()
        self.convgru = nn.GRU(input_size=input_size, hidden_size=hidden_size, dropout=dropout_prob, num_layers=num_layers, batch_first=True)
        self.tanh = nn.Tanh()
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        gru_out, _ = self.convgru(x)
        output = self.fc(gru_out[:, -1, :])  # Take the output from the last time step
        output = self.tanh(output)  # Apply Tanh activation
        return output

#model = ConvGRUModelTanh(input_size, hidden_size, num_layers, output_size, dropout_prob)
#=============================

# Move the model to the GPU
model = model.to(device)

# Define your loss function (criterion)
criterion = nn.CrossEntropyLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False)

optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)  # Reduce lr by 10% every 10 epochs

# Train the model
num_epochs     = 1000
batch_size     = 1500
validate_every = 2  # Validate every 2 epochs
patience       = 20  # Stop training if validation loss doesn't improve for 5 consecutive validations

train_dataset = TensorDataset(X_train, y_train.to(torch.int64))
train_loader  = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

# Initialize a list to store the training loss values
train_loss_values = []
validation_loss_values = []

best_validation_loss = float('inf')
early_stop_counter = 0
best_model_path = 'best_model_gtzan.pth'  # Define the path to save the best model

for epoch in range(num_epochs):
    
    epoch_loss = 0.0
    
    model.train()
    
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X.to(device))
        loss = criterion(outputs, batch_y.to(device))
        loss.backward()
        optimizer.step()
        # Update learning rate after each epoch (StepLR example)
        # scheduler.step()
        
        epoch_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    
    average_epoch_loss = epoch_loss / len(train_loader)
    train_loss_values.append(average_epoch_loss)
    
    # Validate the model every validate_every epochs using the test partition
    if epoch % validate_every == 0:
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test.to(device))
            validation_loss = criterion(test_outputs, y_test.to(torch.int64).to(device))  # Adjust target size

        validation_loss_values.append(validation_loss.item())
        print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {validation_loss.item():.4f}')
        
        if validation_loss < best_validation_loss:
            best_validation_loss = validation_loss
            early_stop_counter = 0
            
            # Save the model with the best validation loss
            torch.save(model.state_dict(), best_model_path)
            print(f'Saved model with best validation loss to {best_model_path}')
        else:
            early_stop_counter += 1

        if early_stop_counter >= patience:
            print(f'Early stopping at epoch {epoch+1} as validation loss has not improved for {patience} consecutive validations.')
            break
            
        model.train()  # Set the model back to training mode

Epoch [1/1000], Loss: 1.7597
Epoch [1/1000], Validation Loss: 1.7378
Saved model with best validation loss to best_model_gtzan.pth
Epoch [2/1000], Loss: 1.6999
Epoch [3/1000], Loss: 1.6563
Epoch [3/1000], Validation Loss: 1.6227
Saved model with best validation loss to best_model_gtzan.pth
Epoch [4/1000], Loss: 1.6343
Epoch [5/1000], Loss: 1.6101
Epoch [5/1000], Validation Loss: 1.5962
Saved model with best validation loss to best_model_gtzan.pth
Epoch [6/1000], Loss: 1.6268
Epoch [7/1000], Loss: 1.5979
Epoch [7/1000], Validation Loss: 1.5782
Saved model with best validation loss to best_model_gtzan.pth
Epoch [8/1000], Loss: 1.5935
Epoch [9/1000], Loss: 1.5918
Epoch [9/1000], Validation Loss: 1.5658
Saved model with best validation loss to best_model_gtzan.pth
Epoch [10/1000], Loss: 1.5923
Epoch [11/1000], Loss: 1.5663
Epoch [11/1000], Validation Loss: 1.5574
Saved model with best validation loss to best_model_gtzan.pth
Epoch [12/1000], Loss: 1.5835
Epoch [13/1000], Loss: 1.5676
Epoch 

KeyboardInterrupt: 

In [None]:
# Test the model
# Load the best model for testing
#best_model = ConvGRUModel(input_size, hidden_size, num_layers, output_size, dropout_prob)
best_model = ConvGRUModel(input_size, hidden_size, num_layers, num_classes, dropout_prob)
best_model.load_state_dict(torch.load(best_model_path))
best_model.to(device)

best_model.eval()
with torch.no_grad():
    test_outputs = best_model(X_test.to(device))
    #test_loss    = criterion(test_outputs, y_test.unsqueeze(1).to(device))
    test_loss    = criterion(test_outputs, y_test.to(torch.int64).to(device))

print(f'Test Loss: {test_loss.item():.4f}')


In [None]:
features.shape, labels.shape

### Plot Learning Curves

In [None]:
import matplotlib.pyplot as plt
# Plot the training loss values
plt.plot(train_loss_values, label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.legend()
plt.show()

In [None]:
# Plot the training and validation loss values
epochs = range(1, len(train_loss_values) + 1)
plt.plot(epochs, train_loss_values, label='Training Loss')
plt.plot(range(0, len(validation_loss_values) * validate_every, validate_every), validation_loss_values, label='Validation Loss', linestyle='--')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Over Epochs')
plt.legend()
plt.show()

### Load Test Set 

In [None]:
features2 = df_train_feat.values.astype(np.float32)
labels2   = df_train_lab['label'].values.astype(np.float32)

# Normalize the features between -1 and 1 (adjust scaling based on your data)
# features = (features - np.min(features)) / (np.max(features) - np.min(features)) * 2 - 1

# Convert data to PyTorch tensors
features_tensor2 = torch.from_numpy(features2)
labels_tensor2   = torch.from_numpy(labels2)

# Assuming you want a sequence length of 1
# features_tensor = features_tensor.unsqueeze(1)

######
# Reshape features tensor with sequence length of 50
sequence_length = 1
num_features    = features.shape[1]
num_samples     = features.shape[0]

# Calculate the number of sequences that can be formed
num_sequences = num_samples // sequence_length

# Truncate the tensor to fit the full sequences
features_tensor2 = features_tensor2[:num_sequences * sequence_length, :]
labels_tensor2   = labels_tensor2[:num_sequences * sequence_length]

# Reshape the tensor
features_tensor2 = features_tensor2.view(num_sequences, sequence_length, num_features)

In [None]:
features_tensor2.size()

In [None]:
# Test the model
# Load the best model for testing
best_model = ConvGRUModel(input_size, hidden_size, num_layers, num_classes, dropout_prob)
best_model.load_state_dict(torch.load(best_model_path))
best_model.to(device)

# A verificar
#best_model.eval()

with torch.no_grad():
    test_outputs = best_model(features_tensor2.to(device))
    test_loss    = criterion(test_outputs, labels_tensor2.to(torch.int64).to(device))

print(f'Test Loss: {test_loss.item():.4f}')

In [None]:
test_outputs.shape

In [None]:
X_test.shape

In [None]:
predict_y = test_outputs.cpu()

In [None]:
df_train_lab

In [None]:
predict_y

In [None]:
import torch

def predict_label(predictions):
  """
  Converts a 10-dimensional probability vector to the class label (index of max probability).

  Args:
      predictions: A torch.Tensor of size (batch_size, 10) containing probability predictions.

  Returns:
      A torch.Tensor of size (batch_size) containing the predicted class labels (integers).
  """
  # Get the index of the maximum probability along the dimension with 10 elements (classes)
  _, predicted_labels = torch.max(predictions, dim=1)
  return predicted_labels

# Example usage
predict_y_int = predict_label(predict_y)
print("Predicted labels:", labels)


In [None]:
import audmetric

print("Accuracy = " + str(audmetric.accuracy(df_train_lab['label'], predict_y_int)))