## Stage 3: CNN Classifier
The manual approach, yielded some positive results
Let's see if a machine learning algorithm using similar information might also work

**Plan:**

Chord Qualities:
1. Compute the summed frequency array, and note bin arrays
2. Put these into a CNN (try different combinations)
3. Try a few other models, including:
    - random forest
    - hmm
    - knn

Root Notes:
1. Given the calculated quality and frequency volume, try to work out what the root note is

In [1]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy.io.wavfile import read as read_wav
from scipy import fft,signal
from sklearn.preprocessing import minmax_scale
import warnings
from tqdm import tqdm

TONE_A = 440 
NOTES = ['A','A#','B','C','C#','D','D#','E','F','F#','G','G#'] 

In [2]:
def freq_to_rnote(freq):
    r = 12.0*np.log2(freq/TONE_A)
    return r

def rnote_to_freq(r):
    f = TONE_A*2**(r/12)
    return f

def get_note_volume(rnote,fft_image,fft_freq,rnote_epsilon=0.2):
    """ rnote - name or number of note,fft_image - fourier image of signal,
    fft_freq - frequencies in fft_image,rnote_epsilon - halfwide of window to inspect
    return maximum volume(magnitude) of signal in freq window for rnote """
    if isinstance(rnote,str):
        rnote = NOTES.index(rnote)
    try:
        f0 = rnote_to_freq(rnote-rnote_epsilon)
        f1 = rnote_to_freq(rnote+rnote_epsilon)
        f_idx = np.where((fft_freq>=f0)&(fft_freq<=f1)) 
        maxVol = np.max((fft_image[f_idx]))
    except Exception:
        return 0.
    
    return maxVol

def get_notes_volume(rnote,fft_image,fft_freq,rnote_epsilon=0.5,oct_range_from=-4.,oct_range_to=8.):
    if isinstance(rnote,str):
        rnote = NOTES.index(rnote)
    rnotes = np.arange(rnote+12.*oct_range_from,rnote+12.*oct_range_to,12.0)
    vol = []
    for rn in rnotes:
        vol.append(get_note_volume(rn,fft_image,fft_freq))
        
    return np.max(vol)

def plot_notes(fileName):
    """convert the fft image from file to notes notations and plot on"""
    #print(fileName)
    rate,data_raw = read_wav(fileName)
    data = (data_raw[:,0]+data_raw[:,1]).astype(np.float32) # stereo of any type -> mono of float32
    data = minmax_scale(data,(-1.,1.)) 
    fft_image = np.abs(fft.rfft(data,norm='forward')) 
    fft_freq = fft.rfftfreq(len(data),1./rate)
    vol_matrix = np.ndarray(shape=(12),dtype=np.float32)
    for rnote in range(12):
        vol_matrix[rnote] = get_notes_volume(rnote,fft_image,fft_freq)
        
    plt.bar(NOTES, vol_matrix)
    plt.show()

In [3]:
# try it out on a few of the files in /data/train_set.csv:
train_set = pd.read_csv('data/train_set.csv')
test_set = pd.read_csv('data/test_set.csv')
print(train_set.head())

# remove the chords that are not maj or min in the Quality column
train_set = train_set[train_set['Quality'].isin(['maj', 'min', 'dim', 'aug'])]
test_set = test_set[test_set['Quality'].isin(['maj', 'min', 'dim', 'aug'])]

                                    File Path Root Note  Octave Quality  \
0        data/chords/dim/Eb-7-dim-chord-1.wav        Eb       7     dim   
1   data/chords/min7b5/C-3-min7b5-chord-1.wav         C       3  min7b5   
2       data/chords/dim7/E-6-dim7-chord-0.wav         E       6    dim7   
3        data/chords/min/Bb-5-min-chord-0.wav        Bb       5     min   
4  data/chords/maj7_2/Ab-5-maj7_2-chord-0.wav        Ab       5  maj7_2   

   Inversion  
0          1  
1          1  
2          0  
3          0  
4          0  


In [4]:
print(train_set.groupby(['Quality']).count())
print(test_set.groupby(['Quality']).count())

         File Path  Root Note  Octave  Inversion
Quality                                         
aug            179        179     179        179
dim            178        178     178        178
maj            178        178     178        178
min            178        178     178        178
         File Path  Root Note  Octave  Inversion
Quality                                         
aug             76         76      76         76
dim             77         77      77         77
maj             77         77      77         77
min             77         77      77         77


In [5]:
len(train_set)

713

In [6]:
# preprocessing:
def preprocess(fileName):
    rate, data_raw = read_wav(fileName)
    data = (data_raw[:,0]+data_raw[:,1]).astype(np.float32) # stereo of any type -> mono of float32
    data = minmax_scale(data,(-1.,1.))
    fft_image = np.abs(fft.rfft(data,norm='forward'))
    fft_freq = fft.rfftfreq(len(data),1./rate)
    note_volumes = []
    for note in NOTES:
        note_volumes.append(get_note_volume(note,fft_image,fft_freq))
    
    return note_volumes


In [7]:
train_set_note_volumes = []
test_set_note_volumes = []
for i in tqdm(range(len(train_set))):
    fileName = train_set['File Path'].iloc[i]
    train_set_note_volumes.append(preprocess(fileName))

for i in tqdm(range(len(test_set))):
    fileName = test_set['File Path'].iloc[i]
    test_set_note_volumes.append(preprocess(fileName))

# save the new data to a csv, with some reference to the original data
train_set_note_volumes = pd.DataFrame(train_set_note_volumes)
train_set_note_volumes['File Path'] = train_set['File Path']
train_set_note_volumes['Quality'] = train_set['Quality']
# train_set_note_volumes['Chord'] = train_set['Chord']
train_set_note_volumes['Root Note'] = train_set['Root Note']

test_set_note_volumes = pd.DataFrame(test_set_note_volumes)
test_set_note_volumes['File Path'] = test_set['File Path']
test_set_note_volumes['Quality'] = test_set['Quality']
# test_set_note_volumes['Chord'] = test_set['Chord']
test_set_note_volumes['Root Note'] = test_set['Root Note']

train_set_note_volumes.to_csv('data/train_set_note_volumes.csv')
test_set_note_volumes.to_csv('data/test_set_note_volumes.csv')

  0%|          | 0/713 [00:00<?, ?it/s]

100%|██████████| 713/713 [00:01<00:00, 397.44it/s]
100%|██████████| 307/307 [00:00<00:00, 398.18it/s]


In [8]:
print(train_set.head())

                               File Path Root Note  Octave Quality  Inversion
0   data/chords/dim/Eb-7-dim-chord-1.wav        Eb       7     dim          1
3   data/chords/min/Bb-5-min-chord-0.wav        Bb       5     min          0
20  data/chords/maj/Ab-3-maj-chord-1.wav        Ab       3     maj          1
28  data/chords/aug/Ab-2-aug-chord-0.wav        Ab       2     aug          0
32   data/chords/aug/G-4-aug-chord-1.wav         G       4     aug          1


In [19]:
# run a CNN on the data, using the note volumes as the input and the chord quality as the output
# Path: stage_3_cnn_classifier_part_2.ipynb

from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

num_classes = 4
height = 12
width = 1
channels = 1

model = Sequential()

# Add Convolutional layers
model.add(Conv1D(16, 3, activation='relu', input_shape=(height, channels)))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(16, 3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# Flatten the output to feed into dense layers
model.add(Flatten())

# Add Dense layers
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [20]:

# train_reshaped = train_set_note_volumes.reshape(train_set_note_volumes.shape[0], height, channels)
# test_reshaped = test_set_note_volumes.reshape(train_set_note_volumes.shape[0], height, channels)

from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Convert string inputs to numerical labels
label_encoder = LabelEncoder()
train_set_encoded = label_encoder.fit_transform(train_set["Quality"])
test_set_encoded = label_encoder.transform(test_set["Quality"])

# One-hot encode the numerical labels
onehot_encoder = OneHotEncoder(sparse=False)
train_set_one_hot = onehot_encoder.fit_transform(train_set_encoded.reshape(-1, 1))
test_set_one_hot = onehot_encoder.transform(test_set_encoded.reshape(-1, 1))

# fit the model using train_set_note_volumes and train_set_one_hot, but exclude the File Path, Quality, and Root Note columns
model.fit(train_set_note_volumes.drop(columns=['File Path', 'Quality', 'Root Note']), train_set_one_hot, epochs=3, batch_size=100)
model.fit(train_set_note_volumes.drop(columns=['File Path', 'Quality', 'Root Note']), train_set_one_hot, epochs=3, batch_size=100)



ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [None]:
accuracy = model.evaluate(test_reshaped, test_set_one_hot)[1]
print(f"Test Accuracy: {accuracy * 100:.2f}%")
