<h1>Note Detection using Discrete Fast Fourier Transforms</h1>
<h2>This notebook uses fast Fourier transforms to analyse an audio file frame by frame and extract the fundamental frequency of each frame.</h2>

In [None]:
import matplotlib.pyplot as plt
from scipy.fftpack import fft
from scipy.io import wavfile
import os
import numpy as np
import tqdm

In [204]:
# Configuration

def configuration(audio_file,audio_name):
    """Load a WAV file and derive the framing parameters for the analysis.

    Parameters
    ----------
    audio_file : str
        Path of the WAV file, passed to ``scipy.io.wavfile.read``.
    audio_name : str
        Label of the recording. Accepted so existing callers keep working;
        it is not used inside this function.

    Returns
    -------
    tuple
        ``(NOTE_NAMES, FFT_WINDOW_SIZE, fs, FRAME_OFFSET,
        MAGNITUDE_THRESHOLD, FRAME_COUNT, audio)`` where ``audio`` is the
        first channel of the file as a 1-D array and ``fs`` is the sample
        rate reported by the file.
    """
    FFT_WINDOW_SECONDS = 0.75  # how many seconds of audio make up an FFT window
    MAGNITUDE_THRESHOLD = 0.1  # threshold to ignore low magnitude signals

    # Names of the notes, indexed by (note number % 12)
    NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

    fs, data = wavfile.read(audio_file)  # load the data

    # wavfile.read yields a 1-D array for mono files and an
    # (n_samples, n_channels) array otherwise. Indexing a mono array with
    # data.T[0] would return a single sample, so only slice when a channel
    # axis is actually present.
    audio = data if data.ndim == 1 else data[:, 0]

    FFT_WINDOW_SIZE = int(fs * FFT_WINDOW_SECONDS)  # samples per FFT window
    AUDIO_LENGTH = len(audio) / fs  # total audio length in seconds
    FRAME_COUNT = int(AUDIO_LENGTH / FFT_WINDOW_SECONDS)  # one frame per whole FFT window
    FRAME_OFFSET = FFT_WINDOW_SIZE  # frames advance by a full window (no overlap)

    return NOTE_NAMES, FFT_WINDOW_SIZE, fs, FRAME_OFFSET,MAGNITUDE_THRESHOLD,FRAME_COUNT,audio




In [205]:
def freq_to_number(f):
    """Convert a frequency in Hz to a (fractional) note number, with 440 Hz mapped to 69."""
    octaves_from_a440 = np.log2(f/440.0)
    return 69 + 12*octaves_from_a440
def number_to_freq(n):
    """Convert a note number to its frequency in Hz, with 69 mapped to 440 Hz."""
    semitones_from_a440 = n-69
    return 440 * 2.0**(semitones_from_a440/12.0)
def note_name(n,NOTE_NAMES):
    """Return the name plus octave (e.g. ``"A4"``) for integer note number ``n``.

    ``NOTE_NAMES`` is the list of the 12 pitch-class names starting at C.
    Note number 60 maps to octave 4.
    """
    # Floor division keeps the octave consistent with ``n % 12`` for note
    # numbers below 12 (and negative ones); truncating ``n/12 - 1`` toward
    # zero would put e.g. note 5 in octave 0 instead of -1.
    octave = int(n // 12) - 1
    return NOTE_NAMES[n % 12] + str(octave)

# Hanning window function

def hanning_window(FFT_WINDOW_SIZE,fs):
    """Build the Hann taper and the frequency axis for one FFT window.

    Returns ``(window, xf)``: ``window`` holds ``FFT_WINDOW_SIZE`` taper
    coefficients and ``xf`` holds the frequency of every ``rfft`` bin for a
    signal sampled at ``fs``.
    """
    # Phase runs over [0, 2*pi) with the endpoint excluded.
    phase = np.linspace(0, 2*np.pi, FFT_WINDOW_SIZE, False)
    taper = 0.5 * (1 - np.cos(phase))

    sample_spacing = 1/fs
    bin_frequencies = np.fft.rfftfreq(FFT_WINDOW_SIZE, sample_spacing)

    return taper, bin_frequencies

In [206]:
def extract_sample(audio, frame_number,FRAME_OFFSET,FFT_WINDOW_SIZE):
  """Return the audio window that ends at ``frame_number * FRAME_OFFSET``.

  The window starts ``FFT_WINDOW_SIZE`` samples before that end position.
  Whatever part of it lies before the start of ``audio`` is filled with
  zeros, so frame 0 is entirely silent.
  """
  stop = frame_number * FRAME_OFFSET
  start = int(stop - FFT_WINDOW_SIZE)

  # Common case: the whole window lies inside the recording.
  if stop != 0 and start >= 0:
    return audio[start:stop]

  # The window reaches back past the first sample: pad the front with zeros.
  silence = np.zeros((np.abs(start)),dtype=float)
  if stop == 0:
    # Very beginning, no audio yet.
    return silence
  return np.concatenate([silence,audio[0:stop]])

In [207]:
def find_top_notes(fft,MAGNITUDE_THRESHOLD,xf,NOTE_NAMES):
    """Find the lowest-frequency bin whose magnitude reaches the threshold.

    Parameters
    ----------
    fft : array-like
        Magnitude per FFT bin (only the real part is inspected).
    MAGNITUDE_THRESHOLD : float
        Minimum magnitude for a bin to count as a note.
    xf : array-like
        Frequency of each bin; must have the same shape as ``fft``.
    NOTE_NAMES : list of str
        The 12 pitch-class names, forwarded to ``note_name``.

    Returns
    -------
    list
        ``[(frequency, note_name, magnitude)]`` for the selected bin, or
        ``[]`` when no bin qualifies (including empty input).
    """
    magnitudes = np.asarray(fft).real
    freqs = np.asarray(xf)

    # Bins at or below 0 Hz (the DC bin) have no pitch: log2(0) is -inf and
    # rounding that to a note number raises OverflowError, so they are
    # excluded from the candidates.
    candidates = np.flatnonzero((magnitudes >= MAGNITUDE_THRESHOLD) & (freqs > 0))
    if candidates.size == 0:
        return []

    # Keep only the lowest qualifying frequency (treated as the fundamental).
    # argmin returns the first occurrence on ties, i.e. the lowest bin index.
    i = candidates[np.argmin(freqs[candidates])]
    f = freqs[i]  # frequency of this FFT bin
    n0 = int(round(freq_to_number(f)))
    return [(f, note_name(n0,NOTE_NAMES), magnitudes[i])]


In [208]:
# Pass 1, find out the maximum amplitude so we can scale.

def first_pass(FRAME_COUNT,window,audio,FRAME_OFFSET,FFT_WINDOW_SIZE):
  """Scan every frame and return the largest FFT magnitude seen.

  Each frame is fetched with ``extract_sample``, multiplied by ``window``
  and transformed with ``np.fft.rfft``. The returned peak is what the
  caller later uses to normalise the spectra. It is 0 when there are no
  frames.
  """
  peak = 0
  for frame_index in range(FRAME_COUNT):
    windowed = extract_sample(audio, frame_index,FRAME_OFFSET,FFT_WINDOW_SIZE) * window
    spectrum = np.abs(np.fft.rfft(windowed))
    frame_peak = np.max(spectrum)
    peak = max(frame_peak,peak)

  return peak

In [209]:
def print_notes(audio, MAGNITUDE_THRESHOLD, xf, NOTE_NAMES, FRAME_COUNT,mx,window,FRAME_OFFSET, FFT_WINDOW_SIZE,name):
    """Detect one note per frame and print the resulting sequence.

    For every frame the windowed spectrum magnitude is divided by ``mx``
    and handed to ``find_top_notes``; frames that yield no note are skipped.
    The function prints ``name`` on its own line, followed by the detected
    note names separated by single spaces.

    Parameters
    ----------
    audio : array
        Samples to analyse.
    MAGNITUDE_THRESHOLD, xf, NOTE_NAMES
        Forwarded unchanged to ``find_top_notes``.
    FRAME_COUNT : int
        Number of frames to analyse.
    mx : float
        Peak magnitude used for normalisation.
    window : array
        Taper multiplied onto each frame before the FFT.
    FRAME_OFFSET, FFT_WINDOW_SIZE : int
        Forwarded unchanged to ``extract_sample``.
    name : str
        Heading printed before the notes.

    Returns
    -------
    list of str
        The detected note names in frame order (also printed).
    """
    total_notes = []

    # A peak of 0 means no frame carries any energy. Normalising by it would
    # only produce NaNs and RuntimeWarnings, and no note could be found, so
    # the analysis is skipped in that case.
    if mx > 0:
        for frame_no in range(FRAME_COUNT):
            sample = extract_sample(audio, frame_no,FRAME_OFFSET,FFT_WINDOW_SIZE)
            fft = np.abs(np.fft.rfft(sample * window)) / mx  # normalise with max amplitude

            # Keep only the note name of the detected fundamental, if any.
            notes = find_top_notes(fft,MAGNITUDE_THRESHOLD,xf,NOTE_NAMES)
            if notes:
                total_notes.append(notes[0][1])

    print(name)
    for note in total_notes:
        print(note,end=' ')

    return total_notes

In [210]:
# (path, label) pairs: the Kanakangi recordings to analyse and the heading
# printed above each detected note sequence.
audio_files = [
    ('Recordings/001.Kanakangi/1 kanakAngi_freq40_2.wav', 'KANAKANGI 40'),
    ('Recordings/001.Kanakangi/5 kanakAngi_freq80_2.wav', 'KANAKANGI 80'),
    ('Recordings/001.Kanakangi/9 kanakAngi_freq120_2.wav', 'KANAKANGI 120'),
    ('Recordings/001.Kanakangi/13 kanakAngi_freq160_2.wav', 'KANAKANGI 160'),
    ('Recordings/001.Kanakangi/17 kanakAngi_freq200_2.wav', 'KANAKANGI 200'),
    ('Recordings/001.Kanakangi/21 kanakAngi_freq240_2.wav', 'KANAKANGI 240'),
]

# Run the full pipeline once per recording: load, build the window,
# find the normalisation peak, then print the detected notes.
for file, name in audio_files:
    (NOTE_NAMES, FFT_WINDOW_SIZE, fs, FRAME_OFFSET,
     MAGNITUDE_THRESHOLD, FRAME_COUNT, audio) = configuration(file, name)
    window, xf = hanning_window(FFT_WINDOW_SIZE, fs)
    mx = first_pass(FRAME_COUNT, window, audio, FRAME_OFFSET, FFT_WINDOW_SIZE)
    print_notes(audio, MAGNITUDE_THRESHOLD, xf, NOTE_NAMES, FRAME_COUNT, mx, window, FRAME_OFFSET, FFT_WINDOW_SIZE, name)
    print('\n')
    
    


KANAKANGI 40
C4 C4 C#4 C#4 D4 D4 F4 G4 G4 G#4 G#4 A4 A4 C5 C5 C5 C5 A4 A4 G#4 G#4 G4 G4 F4 D4 D4 C#4 C#4 C4 C4 

KANAKANGI 80
C4 C#4 D4 F4 G4 G#4 A4 C5 C5 A4 G#4 G4 F4 D4 C#4 C4 

KANAKANGI 120
C4 C#4 D4 G4 A4 C5 G#4 G4 D4 C#4 C4 

KANAKANGI 160
C4 C#4 G4 G#4 A4 G4 D4 C4 C4 

KANAKANGI 200
C4 D4 G4 A4 G4 C#4 C4 

KANAKANGI 240
C4 D4 G#4 G4 C#4 C4 

