In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import sounddevice as sd
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
from IPython.display import clear_output
import IPython.display as ipd
import librosa
import time as time
import os

import ruptures as rpt

from chord_prediction import CNN, predict

In [2]:
# --- Global configuration ---------------------------------------------------
# Torch device used as map_location when reading the checkpoints below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sr = 44100          # sample rate (Hz) shared by recording, analysis and playback
device_index = 1    # sounddevice input device index used for recording
# To list the available audio devices: print(sd.query_devices())

# Label -> class-index tables. The position in each list is the class index,
# so the order must not be changed.
_root_labels = ["NC", "A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"]
_chord_labels = ["NC", "", "m", "5", "7", "maj7", "m7", "6", "m6", "9", "m9",
                 "dim", "aug", "sus2", "sus4", "m7b5"]
root_mapping = {label: index for index, label in enumerate(_root_labels)}
chord_mapping = {label: index for index, label in enumerate(_chord_labels)}


def _load_cnn(num_classes, weights_path):
    """Build a CNN with `num_classes` outputs, load its weights, and set eval mode.

    `weights_path` is a checkpoint file path, resolved relative to the
    current working directory when it is not absolute.
    """
    model = CNN(num_classes=num_classes)
    state_dict = torch.load(weights_path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    return model


# Root-note classifier
root_model = _load_cnn(len(root_mapping), 'rootcqtcnn_asim1_1005.pth')

# Chord-quality classifier
chord_model = _load_cnn(len(chord_mapping), 'chordcqtcnn_asim1_0905.pth')

# Hide whatever the cell printed while loading.
clear_output(wait=True)

In [None]:
# Record a fixed-length mono clip from the configured input device, then
# split it into harmonic and percussive components.
print("Recording...")

# Segmentation strategy used further down in this cell:
#   "tempo"  -> onsets come from beat tracking on the percussive component
#   "cosdif" -> onsets come from change-point detection on the harmonic CQT
criteria = "tempo"
# criteria = "cosdif"

n_record_samples = int(sr * 15)  # 15 seconds at the global sample rate
recording = sd.rec(
    n_record_samples,
    samplerate=sr,
    channels=1,
    dtype='float32',
    device=device_index,
)
sd.wait()  # sd.rec returns immediately; block until the buffer is filled

clear_output(wait=True)
print("Recording finished.")

# Collapse the (frames, 1) recording into a 1-D signal.
audio_data = recording.flatten()
harm, perc = librosa.effects.hpss(audio_data)

# Derive candidate chord-change times (seconds) with the selected strategy.
if criteria == "cosdif":
    # dB-scaled, z-scored constant-Q transform of the harmonic component.
    cqt = librosa.cqt(y=harm, sr=sr, n_bins=84, bins_per_octave=12)
    cqt = librosa.amplitude_to_db(np.abs(cqt))
    cqt = (cqt - np.mean(cqt)) / (np.std(cqt) + 1e-8)  # epsilon avoids division by zero

    # Median-aggregate each frame with its cosine-nearest neighbours to smooth it.
    cqt_smooth = librosa.decompose.nn_filter(cqt, aggregate=np.median, metric='cosine')

    cpd = rpt.Pelt(model="rbf").fit(cqt_smooth.T)  # change-point detection, one sample per frame
    breakpoints = cpd.predict(pen=10)  # penalty value: tune as needed

    # The last breakpoint is dropped (ruptures reports the end of the signal
    # as the final breakpoint). hop_length=512 is librosa.cqt's default hop.
    output_onsets = librosa.frames_to_time(breakpoints[:-1], sr=sr, hop_length=512)

elif criteria == "tempo":
    tempo, beat_frames = librosa.beat.beat_track(y=perc, sr=sr, tightness=150)

    # Depending on the librosa version, `tempo` is a plain float or a
    # 1-element array; normalise it to a Python float so both work.
    tempo_bpm = float(np.atleast_1d(tempo)[0])
    print(f"Tempo: {tempo_bpm:.1f}bpm")

    # Above 100 bpm keep every second beat, otherwise keep every beat.
    beat_step = 2 if tempo_bpm > 100 else 1
    output_onsets = librosa.frames_to_time(beat_frames[::beat_step], sr=sr)

else:
    # Fail loudly instead of reusing a stale `output_onsets` from an earlier run.
    raise ValueError(f"Unknown criteria {criteria!r}; expected 'tempo' or 'cosdif'")

# Segment boundaries: clip start, detected onsets, and the exact clip end.
# The exact duration is used (not its floor) so the closing boundary can never
# fall before an onset when the clip length is not a whole number of seconds.
clip_duration = len(audio_data) / sr
final_onsets = np.unique(
    np.concatenate(([0.0], np.asarray(output_onsets, dtype=float), [clip_duration]))
)

# Classify every sufficiently long segment and print the resulting timeline.
min_duration = 0.5  # seconds; segments at or below this length are dropped
durations = np.diff(final_onsets)
mask = durations > min_duration
kept_segments = np.flatnonzero(mask)

# One slot per segment of `final_onsets`; segments that are too short are
# never sent to the models and stay None.
chord_predictions = [None] * len(durations)

for i in kept_segments:
    start, end = int(final_onsets[i] * sr), int(final_onsets[i + 1] * sr)
    window = audio_data[start:end]

    prediction, root, chord = predict(window, root_model, chord_model)
    chord_predictions[i] = prediction

# Start times and labels of the kept segments, in matching order.
filtered_onsets = final_onsets[:-1][mask]
filtered_preds = [chord_predictions[i] for i in kept_segments]

# Add last onset for display alignment
filtered_onsets = np.append(filtered_onsets, final_onsets[-1])

# zip stops at the shorter sequence, so the trailing end marker is not printed.
for onset, label in zip(filtered_onsets, filtered_preds):
    print(f"{onset:.2f} - {label}")

Recording finished.
Tempo: 136.0bpm
0.00 - NC
1.38 - D#maj7
2.22 - D#maj7
3.03 - D#maj7
3.90 - D#7
4.78 - Cm
5.67 - Cm7
6.54 - Cm
7.43 - Cm
8.31 - Fm
9.24 - Fm7
10.12 - Fm
11.03 - Fm
11.92 - A#7
12.78 - A#7
13.63 - A#7
14.49 - NC


In [16]:
# Audible check of the segmentation: synthesize a click at every kept
# boundary, mix it over the recording, and show an audio player.
clicks = librosa.clicks(
    times=filtered_onsets,
    sr=sr,
    length=len(audio_data),  # same length as the recording so the arrays can be summed
)
audio_with_clicks = audio_data + clicks
ipd.Audio(audio_with_clicks, rate=sr)