In [1]:
import os
from torch.utils.data import Dataset
import pandas as pd
import torchaudio
import torch
import gradio as gr

from torch import nn
from torch.utils.data import DataLoader

import time

In [2]:
class ChordTypeDataset(Dataset):
    """Dataset of audio clips paired with labels listed in an annotations CSV.

    Every item is loaded from disk, moved to ``device``, resampled to
    ``target_sample_rate`` when its own rate differs, averaged down to a single
    channel, cut or zero-padded on the right to exactly ``num_samples`` samples
    and finally passed through ``transformation``.

    Args:
        annotations_file: Path of a CSV readable by ``pandas.read_csv``. The
            clip file name is read from column position 1 and the label from
            column position 2.
        audio_dir: Directory the clip file names are joined onto.
        transformation: Callable applied to the prepared signal. It must
            support ``.to(device)`` and is moved to ``device`` on construction.
        target_sample_rate: Sample rate every clip is converted to.
        num_samples: Exact length (in samples) of every prepared signal.
        device: Device the signals, the resampler and the transformation
            live on.
        num_data_items: Upper bound on the number of rows kept. When it is
            smaller than the table, a random subset of that size is drawn.
            ``None`` keeps every row.
        random_state: Seed forwarded to ``DataFrame.sample`` for that subset.
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device, num_data_items, random_state):
        self.num_data_items = num_data_items
        self.random_state = random_state
        self.audio_dir = audio_dir
        self.device = device
        self.annotations = pd.read_csv(annotations_file)
        # Subsample only when a cap is given and it is smaller than the table.
        if num_data_items is not None and num_data_items < len(self.annotations):
            self.annotations = self.annotations.sample(self.num_data_items, random_state=self.random_state)
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        """Return the number of annotation rows kept."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load, normalise and transform the clip at positional ``index``.

        Returns:
            tuple: ``(signal, label)`` where ``signal`` is the output of
            ``transformation`` and ``label`` is the raw value from the CSV.
        """
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        # Signal -> PyTorch Tensor (num_channels, samples)
        signal, sample_rate = torchaudio.load(audio_sample_path)

        # Use the device given to the constructor, not a notebook-level global.
        signal = signal.to(self.device)

        # Uniform sample rate first, then mono.
        signal = self._resample(signal, sample_rate)
        signal = self._mixdown(signal)

        # Force the exact length: cut what is too long, right-pad what is too short.
        signal = self._cut(signal)
        signal = self._right_pad(signal)

        signal = self.transformation(signal)
        return signal, label

    def _cut(self, signal):
        """Drop everything past ``num_samples`` along dimension 1."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad(self, signal):
        """Zero-pad the end of the last dimension up to ``num_samples``."""
        signal_length = signal.shape[1]
        if signal_length < self.num_samples:
            num_missing_samples = self.num_samples - signal_length
            # (left, right) padding of the last dimension.
            end_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, end_padding)
        return signal

    def _resample(self, signal, original_sample_rate):
        """Convert ``signal`` to ``target_sample_rate`` when the rates differ."""
        if original_sample_rate != self.target_sample_rate:
            # The resampler must sit on the same device as the signal it is fed.
            resampler = torchaudio.transforms.Resample(original_sample_rate, self.target_sample_rate).to(self.device)
            signal = resampler(signal)
        return signal

    def _mixdown(self, signal):
        """Average all channels into one, keeping the channel dimension."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Join ``audio_dir`` with the file name stored in column position 1."""
        return os.path.join(self.audio_dir, self.annotations.iloc[index, 1])

    def _get_audio_sample_label(self, index):
        """Return the label stored in column position 2."""
        return self.annotations.iloc[index, 2]

In [3]:
class MyCNNNetwork(nn.Module):
    """CNN classifier: four convolutional blocks, flatten, linear, softmax.

    Each block is ``Conv2d(kernel 3, stride 1, padding 2) -> ReLU ->
    MaxPool2d(2)``, with channel widths 1 -> 16 -> 32 -> 64 -> 128. The linear
    layer takes ``128 * 5 * 9`` features, so the input must shrink to a 5 x 9
    map after the four blocks; a ``(batch, 1, 64, 126)`` input does.

    Args:
        num_classes: Size of the output dimension of the final linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Blocks are created in order conv1..conv4 and keep those attribute
        # names, so parameter names in the state dict stay the same.
        self.conv1 = self._make_block(1, 16)
        self.conv2 = self._make_block(16, 32)
        self.conv3 = self._make_block(32, 64)
        self.conv4 = self._make_block(64, 128)

        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128 * 5 * 9, num_classes)
        self.softmax = nn.Softmax(dim=1)

    @staticmethod
    def _make_block(in_channels, out_channels):
        """Build one convolution / ReLU / max-pool stage."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                padding=2,
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )

    def forward(self, x):
        """Return softmax scores over dimension 1 for the batch ``x``."""
        features = x
        for block in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = block(features)
        flat = self.flatten(features)
        return self.softmax(self.linear(flat))

In [4]:
def predict(model, sample_input, class_mapping):
    """Return the ``class_mapping`` entry for the model's top-scoring class.

    The model is switched to evaluation mode and run without gradient
    tracking. ``class_mapping`` is indexed directly with the argmax tensor
    taken over dimension 1, so with a plain list this presumably only works
    for a batch of one (a single-element index tensor).

    Args:
        model: Module called as ``model(sample_input)``.
        sample_input: Input passed to the model unchanged.
        class_mapping: Indexable collection of class names.

    Returns:
        The element of ``class_mapping`` at the predicted class index.
    """
    model.eval()
    with torch.no_grad():
        scores = model(sample_input)
        best_index = torch.argmax(scores, dim=1)
        return class_mapping[best_index]

In [5]:
def wav_preprocessor(wav_path, transformation, target_sample_rate, num_samples, device):
    """Load an audio file and turn it into a fixed-length, transformed signal.

    Steps, in order: load, move to ``device``, resample if the file's rate
    differs from ``target_sample_rate``, average the channels into one, cut to
    ``num_samples``, zero-pad on the right up to ``num_samples``, then apply
    ``transformation``.

    Args:
        wav_path: Path handed to ``torchaudio.load``.
        transformation: Callable applied to the prepared signal.
        target_sample_rate: Sample rate the signal is converted to.
        num_samples: Exact length of the signal fed to ``transformation``.
        device: Device the signal and the resampler are moved to.

    Returns:
        Whatever ``transformation`` returns for the prepared signal.
    """
    # Waveform -> PyTorch Tensor (num_channels, samples)
    waveform, source_rate = torchaudio.load(wav_path)
    waveform = waveform.to(device)

    # Bring every input to the same sample rate.
    if source_rate != target_sample_rate:
        to_target_rate = torchaudio.transforms.Resample(source_rate, target_sample_rate).to(device)
        waveform = to_target_rate(waveform)

    # Collapse multi-channel audio to mono, keeping the channel dimension.
    channel_count = waveform.shape[0]
    if channel_count > 1:
        waveform = torch.mean(waveform, dim=0, keepdims=True)

    # Too long: keep only the first num_samples samples.
    if waveform.shape[1] > num_samples:
        waveform = waveform[:, :num_samples]

    # Too short: append zeros at the end of the last dimension.
    shortfall = num_samples - waveform.shape[1]
    if shortfall > 0:
        waveform = torch.nn.functional.pad(waveform, (0, shortfall))

    return transformation(waveform)

In [6]:
def predict_chord_type(model_chord, model_root, wav_path, transformation, target_sample_rate, num_samples, type_mapping, root_mapping, device):
    """Predict the chord type and the chord root of one audio file.

    The file is preprocessed once with ``wav_preprocessor`` and the resulting
    sample is fed to both classifiers.

    Args:
        model_chord: Classifier whose output is looked up in ``type_mapping``.
        model_root: Classifier whose output is looked up in ``root_mapping``.
        wav_path: Path of the audio file to classify.
        transformation: Transformation forwarded to ``wav_preprocessor``.
        target_sample_rate: Sample rate forwarded to ``wav_preprocessor``.
        num_samples: Signal length forwarded to ``wav_preprocessor``.
        type_mapping: Class names for ``model_chord``.
        root_mapping: Class names for ``model_root``.
        device: Device forwarded to ``wav_preprocessor``.

    Returns:
        dict: Maps the module-level names ``predicted_chord_type`` and
        ``predicted_chord_root`` to the two predictions. Those names must be
        defined at module level before this function is called.
    """
    input_sample = wav_preprocessor(wav_path, transformation, target_sample_rate, num_samples, device)
    # Add a leading batch dimension of size one.
    input_sample = input_sample.unsqueeze(0)

    predicted_chord = predict(model_chord, input_sample, type_mapping)
    # The root comes from the root classifier, not from the chord-type one.
    predicted_root = predict(model_root, input_sample, root_mapping)
    return {predicted_chord_type: predicted_chord, predicted_chord_root: predicted_root}

In [7]:
# Sample Rate should be 16000 Hz
SAMPLE_RATE = 16000
# Each item should be 4 seconds long
NUM_SAMPLES = 4 * SAMPLE_RATE

# Position i in each list is the name reported for predicted class index i.
CLASS_MAPPING_CHORD = ['Major', 'Minor', 'Diminished', 'Augmented']
CLASS_MAPPING_ROOT = ['Ab', 'A', 'Bb', 'B', 'C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G']

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device {device}")

mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64).to(device)

# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU still loads when only the CPU is available.
chord_classifier = MyCNNNetwork(len(CLASS_MAPPING_CHORD)).to(device)
state_dict1 = torch.load("50EpochFullChordTypeCNN.pth", map_location=device, weights_only=True)
chord_classifier.load_state_dict(state_dict1)

root_classifier = MyCNNNetwork(len(CLASS_MAPPING_ROOT)).to(device)
state_dict2 = torch.load("ChordRootCNN.pth", map_location=device, weights_only=True)
root_classifier.load_state_dict(state_dict2)


def predict_chord(wav_path):
    """Gradio callback: classify the audio file at ``wav_path``.

    Binds the loaded classifiers, the mel-spectrogram transform and the
    constants above to ``predict_chord_type``.
    """
    # The audio component passes None when nothing was recorded or uploaded.
    if wav_path is None:
        raise gr.Error("Please record or upload an audio clip first.")
    return predict_chord_type(chord_classifier, root_classifier, wav_path, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, CLASS_MAPPING_CHORD, CLASS_MAPPING_ROOT, device)


with gr.Blocks() as demo:
    with gr.Column() as predictor:
        wav_path = gr.Audio(type="filepath")
        chord_predictor = gr.Button("Predict Chord")
        # predict_chord_type uses these two components as the keys of the
        # dict it returns, so their names must not change.
        predicted_chord_type = gr.Text(label="Chord Type")
        predicted_chord_root = gr.Text(label="Chord Root")

        chord_predictor.click(fn=predict_chord, inputs=[wav_path], outputs=[predicted_chord_type, predicted_chord_root])

demo.launch()

Using device cuda
* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


