In [None]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.io.wavfile import read as read_wav
from scipy import fft,signal
from sklearn.preprocessing import minmax_scale
import warnings
from tqdm import tqdm

# Reference pitch in Hz: relative note number 0 corresponds to this frequency.
TONE_A = 440
# Pitch-class names in semitone order starting at A; a name's index in this
# list is the relative note number used by the helpers below.
NOTES = 'A A# B C C# D D# E F F# G G#'.split()

# TODO (idea only; none of the functions in this cell implement it):
#   - extract features from the intervals of the first two chords;
#   - find the 10 most frequent intervals and use them as the features.

def freq_to_rnote(freq):
    """Convert a frequency to a relative note number.

    The result is the (possibly fractional) number of semitones between
    ``freq`` and ``TONE_A``: 0 at ``TONE_A``, +12 one octave above it and
    -12 one octave below it. ``freq`` may be a scalar or a numpy array
    (the conversion is elementwise).
    """
    semitones_per_octave = 12.0
    return semitones_per_octave * np.log2(freq / TONE_A)

def rnote_to_freq(r):
    """Convert a relative note number to a frequency.

    ``r`` is a (possibly fractional) number of semitones relative to
    ``TONE_A``; every 12 semitones doubles the frequency. Returns the
    frequency in the same unit as ``TONE_A``.
    """
    octaves = r/12
    return TONE_A*2**octaves

def get_note_volume(rnote,fft_image,fft_freq,rnote_epsilon=0.2):
    """Return the peak magnitude of the spectrum around a single note.

    Parameters
    ----------
    rnote : str or number
        Note name from ``NOTES`` (converted to its index) or a relative note
        number in semitones, as understood by ``rnote_to_freq``.
    fft_image : array-like
        Magnitudes of the Fourier image of the signal.
    fft_freq : array-like
        Frequency of each entry of ``fft_image`` (same length).
    rnote_epsilon : float
        Half-width of the inspected window, in semitones.

    Returns
    -------
    The largest value of ``fft_image`` whose frequency lies in
    ``[rnote_to_freq(rnote - rnote_epsilon), rnote_to_freq(rnote + rnote_epsilon)]``,
    or ``0.`` when no frequency bin falls inside that window.

    Raises
    ------
    ValueError
        If ``rnote`` is a string that is not in ``NOTES``.
    IndexError
        If ``fft_image`` and ``fft_freq`` do not line up. Earlier versions
        swallowed every exception and reported a volume of 0 instead, which
        hid genuine input errors.
    """
    if isinstance(rnote,str):
        rnote = NOTES.index(rnote)

    fft_image = np.asarray(fft_image)
    fft_freq = np.asarray(fft_freq)

    f_low = rnote_to_freq(rnote-rnote_epsilon)
    f_high = rnote_to_freq(rnote+rnote_epsilon)
    in_window = (fft_freq>=f_low)&(fft_freq<=f_high)

    # The only expected "failure" is an empty window (e.g. a note above the
    # Nyquist frequency); handle it explicitly rather than via a broad except.
    if not in_window.any():
        return 0.

    return np.max(fft_image[in_window])

def get_notes_volume(rnote,fft_image,fft_freq,rnote_epsilon=0.2,oct_range_from=-4.,oct_range_to=8.):
    """Return the peak magnitude of a note taken over a range of octaves.

    Parameters
    ----------
    rnote : str or number
        Note name from ``NOTES`` (converted to its index) or a relative note
        number in semitones.
    fft_image, fft_freq : array-like
        Spectrum magnitudes and their frequencies, passed through to
        ``get_note_volume``.
    rnote_epsilon : float
        Half-width, in semitones, of the window inspected around each octave
        of the note. It is forwarded to ``get_note_volume``.
        NOTE(review): this argument used to be accepted but never forwarded,
        so the effective width was always ``get_note_volume``'s default of
        0.2 although the signature advertised 0.5. The default is now 0.2 so
        that calls relying on the default produce exactly the same numbers
        as before; confirm whether 0.5 was the intended width.
    oct_range_from, oct_range_to : float
        Octave offsets relative to ``rnote``; the octaves
        ``rnote + 12*oct_range_from``, stepping by 12 semitones up to (but
        excluding) ``rnote + 12*oct_range_to``, are inspected.

    Returns
    -------
    The maximum of ``get_note_volume`` over the inspected octaves, or ``0.``
    when the octave range is empty.
    """
    if isinstance(rnote,str):
        rnote = NOTES.index(rnote)

    octave_rnotes = np.arange(rnote+12.*oct_range_from,rnote+12.*oct_range_to,12.0)
    if octave_rnotes.size == 0:
        # np.max of an empty list raises; "nothing inspected" means volume 0,
        # matching what get_note_volume returns for an empty window.
        return 0.

    return np.max([
        get_note_volume(rn,fft_image,fft_freq,rnote_epsilon=rnote_epsilon)
        for rn in octave_rnotes
    ])

# (label, semitones from root to third, semitones from root to fifth).
# The order doubles as the tie-break priority when two qualities score equally.
_CHORD_INTERVALS = (
    ('maj', 4, 7),
    ('min', 3, 7),
    ('dim', 3, 6),
    ('aug', 4, 8),
)


def _mix_to_mono(data_raw):
    """Return a float32 mono signal from the sample array of a WAV file."""
    samples = np.asarray(data_raw)
    if samples.ndim == 1:
        # already mono
        return samples.astype(np.float32)
    if samples.shape[1] == 1:
        return samples[:,0].astype(np.float32)
    # Widen BEFORE adding: summing int16/uint8 channels in their native dtype
    # silently wraps around for loud signals. Only the first two channels are
    # mixed, as before.
    mixed = samples[:,0].astype(np.float64)+samples[:,1].astype(np.float64)
    return mixed.astype(np.float32)


def chord_quality(fileName):
    """Guess the triad quality and root note of the chord in a WAV file.

    The file is read, mixed down to mono, rescaled to [-1, 1] and transformed
    with a real FFT. For each of the 12 pitch classes the peak spectral
    magnitude over all octaves is taken (``get_notes_volume``). Every
    root/quality combination is scored as the sum of the volumes of its three
    notes, and the best-scoring combination wins.

    Parameters
    ----------
    fileName : path of a WAV file (mono or multi-channel; for multi-channel
        files the first two channels are mixed).

    Returns
    -------
    (quality, root) : tuple of str
        ``quality`` is one of ``'maj'``, ``'min'``, ``'dim'``, ``'aug'`` and
        ``root`` is a name from ``NOTES``. On equal scores the earlier quality
        in that list wins, and within a quality the first root in ``NOTES``
        order wins.
    """
    rate, data_raw = read_wav(fileName)
    data = _mix_to_mono(data_raw)
    data = minmax_scale(data,(-1.,1.))
    fft_image = np.abs(fft.rfft(data,norm='forward'))
    fft_freq = fft.rfftfreq(len(data),1./rate)

    # One volume per pitch class; every chord score below is a sum of three
    # of these, so there is no need to recompute them for each candidate.
    note_vol = [get_notes_volume(rnote,fft_image,fft_freq) for rnote in range(12)]

    candidates = []
    for quality, third, fifth in _CHORD_INTERVALS:
        scores = [
            note_vol[root] + note_vol[(root+third)%12] + note_vol[(root+fifth)%12]
            for root in range(12)
        ]
        top = max(scores)
        candidates.append((top, quality, NOTES[scores.index(top)]))

    overall = max(score for score, _, _ in candidates)
    for score, quality, root in candidates:
        if score == overall:
            return quality, root

    # Only reachable when no score compares equal to the maximum (NaN scores);
    # fall back to the last candidate, as the original if/elif chain did.
    _, quality, root = candidates[-1]
    return quality, root

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Load your chord classification model
# NOTE(review): load_chord_classification_model is not defined in the visible
# notebook cells, so this line raises NameError on a fresh kernel until the
# function is implemented. The loaded model is also not referenced again in
# this cell.
chord_classification_model = load_chord_classification_model()  # Implement this function

# Define the CNN model for root note prediction with multiple inputs
# NOTE(review): (161, 1501, 1) is presumably (frequency bins, time frames,
# channels) of a spectrogram - TODO confirm against the code that builds it.
input_spectrogram = layers.Input(shape=(161, 1501, 1), name='input_spectrogram')
input_chord_classification = layers.Input(shape=(4,), name='input_chord_classification')  # Assuming 4 chord classes

# Spectrogram processing: two Conv2D + MaxPooling2D stages, then flatten.
# NOTE(review): assuming Keras' default 'valid' padding, the flattened vector
# has 38 * 373 * 64 = 907,136 features, so the Dense(128) layer below holds
# roughly 116M weights and the 16 chord features are a tiny fraction of its
# input - confirm this is intended.
x = layers.Conv2D(32, (3, 3), activation='relu')(input_spectrogram)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Conv2D(64, (3, 3), activation='relu')(x)
x = layers.MaxPooling2D((2, 2))(x)
x = layers.Flatten()(x)

# Chord classification processing: project the 4-value chord vector to 16 features
y = layers.Dense(16, activation='relu')(input_chord_classification)

# Concatenate the outputs of both paths
concatenated = layers.concatenate([x, y])

# Additional dense layers for joint processing
z = layers.Dense(128, activation='relu')(concatenated)
z = layers.Dropout(0.5)(z)
z = layers.Dense(12, activation='softmax')(z)  # Assuming 12 root notes

# Create the model
model = models.Model(inputs=[input_spectrogram, input_chord_classification], outputs=z)

# Compile the model
# NOTE(review): categorical_crossentropy expects one-hot targets, so
# root_note_labels presumably has shape (n_samples, 12) - confirm.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model using your dataset
# NOTE(review): root_note_data_spectrogram, root_note_data_chord_classification
# and root_note_labels are not defined in the visible notebook cells; they must
# be created before this cell runs. The dict keys match the Input layer names
# declared above.
model.fit({'input_spectrogram': root_note_data_spectrogram, 'input_chord_classification': root_note_data_chord_classification},
          root_note_labels, epochs=10, batch_size=32)
