# Build a Dataset of Finger Tapping Single Shots

In [None]:
import essentia
import essentia.standard as estd
import IPython
import numpy as np
import os
import sys
from pathlib import Path
from freesound import FreesoundClient
from tempfile import TemporaryDirectory

## [WIP] Retrieve Sounds from Freesound.org

**Downloading sounds from Freesound requires OAuth2 authentication.**

See this gist for details: https://gist.github.com/ffont/3607ba4af9814f3877cd42894a564222

In [None]:
# Search Freesound for sounds matching `tags` and download the first page of
# results into `output_dir`.
#
# The credential is read from the FREESOUND_API_KEY environment variable so
# that no secret is stored in the notebook itself.
FREESOUND_API_KEY = os.environ.get("FREESOUND_API_KEY")
tags = ["finger-tapping"]

output_dir = "../data/finger_tapping_downloads/"
os.makedirs(output_dir, exist_ok=True)

api_key = FREESOUND_API_KEY
if not api_key:  # covers both an unset and an empty variable
    print("You need to set your API key as an environment variable")
    print("named FREESOUND_API_KEY")
    sys.exit(-1)

freesound_client = FreesoundClient()
# NOTE(review): downloading original files needs an OAuth2 access token
# (see the note above this cell); a plain API key is only enough for
# searching -- confirm which kind of token is supplied here.
freesound_client.set_token(api_key)

results_pager = freesound_client.text_search(query=" ".join(tags), page_size=5)

for sound in results_pager:
    print(f"Downloading sound with id: {sound.id}")
    # `retrieve` takes the target directory and the file name separately.
    # Path separators in the sound's title are replaced so the file always
    # lands directly inside `output_dir`.
    safe_name = sound.name.replace("/", "_")
    sound.retrieve(output_dir, name=safe_name + ".wav")

## Onset-based Audio Segmentation with Essentia: Cutting and Saving Segments

In [None]:
# Load the recording to be segmented as a mono signal.
filename = "../data/mono_kick_seq.wav"

loader = estd.MonoLoader(filename=filename)
audio = loader()

In [None]:
# 1. Compute the onset detection functions (ODFs).
# Two detection methods are evaluated side by side: 'hfc' and 'complex'.
od_hfc = estd.OnsetDetection(method='hfc')
od_complex = estd.OnsetDetection(method='complex')

# Auxiliary algorithms needed to obtain magnitude and phase for every frame.
w = estd.Windowing(type='hann')
fft = estd.FFT()  # Outputs a complex FFT vector.
c2p = estd.CartesianToPolar()  # Converts it into a pair of magnitude and phase vectors.

# Run both detectors frame by frame and collect their values in a Pool,
# one entry per frame under 'odf.hfc' and 'odf.complex'.
pool = essentia.Pool()
frames = estd.FrameGenerator(audio, frameSize=1024, hopSize=512)
for frame in frames:
    spectrum = fft(w(frame))
    magnitude, phase = c2p(spectrum)
    for key, detector in (('odf.hfc', od_hfc), ('odf.complex', od_complex)):
        pool.add(key, detector(magnitude, phase))

# help(estd.FrameCutter)

In [None]:
# 2. Detect onset locations from each detection function.
onsets = estd.Onsets()

# The algorithm expects a matrix rather than a vector, so each ODF is wrapped
# in a one-row array. A weight must be given as well, but with a single ODF
# its value does not actually matter.
hfc_matrix = essentia.array([pool['odf.hfc']])
onsets_hfc = onsets(hfc_matrix, [1])

complex_matrix = essentia.array([pool['odf.complex']])
onsets_complex = onsets(complex_matrix, [1])

# Mark the onsets audibly and save the result to files.
# Beeps (rather than white noise) on a separate channel are more distinctive:
# they are rendered onto silence and then muxed with the original audio into
# a stereo signal.
silence = [0.] * len(audio)

mark_hfc = estd.AudioOnsetsMarker(onsets=onsets_hfc, type='beep')
mark_complex = estd.AudioOnsetsMarker(onsets=onsets_complex, type='beep')
beeps_hfc = mark_hfc(silence)
beeps_complex = mark_complex(silence)

mux_hfc = estd.StereoMuxer()
mux_complex = estd.StereoMuxer()
audio_hfc = mux_hfc(audio, beeps_hfc)
audio_complex = mux_complex(audio, beeps_complex)

# Write both stereo signals as mp3 files into a temporary directory.
temp_dir = TemporaryDirectory()
hfc_mp3_path = temp_dir.name + '/tmp_onsets_hfc_stereo.mp3'
complex_mp3_path = temp_dir.name + '/tmp_onsets_complex_stereo.mp3'
estd.AudioWriter(filename=hfc_mp3_path, format='mp3')(audio_hfc)
estd.AudioWriter(filename=complex_mp3_path, format='mp3')(audio_complex)

In [None]:
# Play back the stereo file with the HFC onset beeps that was written to the
# temporary directory above.
IPython.display.Audio(temp_dir.name + '/tmp_onsets_hfc_stereo.mp3')

In [None]:
# Play back the stereo file with the complex-ODF onset beeps that was written
# to the temporary directory above.
IPython.display.Audio(temp_dir.name + '/tmp_onsets_complex_stereo.mp3')

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

n_frames = len(pool['odf.hfc'])
frames_position_samples = np.array(range(n_frames)) * 512

fig, ((ax1, ax2, ax3, ax4)) = plt.subplots(4, 1, sharex=True, sharey=False, figsize=(15, 16))

ax1.set_title('HFC ODF')
ax1.plot(frames_position_samples, pool['odf.hfc'], color='magenta')

ax2.set_title('Complex ODF')
ax2.plot(frames_position_samples, pool['odf.complex'], color='red')

ax3.set_title('Audio waveform and the estimated onset positions (HFC ODF)')
ax3.plot(audio)
for onset in onsets_hfc:
    ax3.axvline(x=onset*44100, color='magenta')

ax4.set_title('Audio waveform and the estimated onset positions (complex ODF)')
ax4.plot(audio)
for onset in onsets_complex:
    ax4.axvline(x=onset*44100, color='red')

In [None]:
output_dir = "../data/output_onsets/"
os.makedirs(output_dir, exist_ok=True)

# Convert numpy float32 arrays to Python lists
onset_times_hfc_list = list(onsets_hfc)
onset_times_complex_list = list(onsets_complex)


def cut_audio_around_onsets(audio, onset_times, output_dir, prefix="onset",
                            sample_rate=44100, padding=0.2):
    """Cut `audio` into one clip per onset and write each clip as a WAV file.

    Clip ``i`` starts `padding` seconds before onset ``i`` and ends `padding`
    seconds before onset ``i + 1``, so that it does not contain the start of
    the following event. The last clip runs to the end of the signal.
    Boundaries are clamped to the signal; clips that would be empty are
    skipped.

    Args:
        audio: The signal to cut. A 1-D array is written with
            ``estd.MonoWriter``; a 2-D (samples x channels) array is written
            with ``estd.AudioWriter``.
        onset_times: Onset positions in seconds, in increasing order.
        output_dir: Directory that receives the files; created if missing.
        prefix: File name prefix; files are named ``{prefix}_{n}.wav`` with
            ``n`` starting at 1.
        sample_rate: Sampling rate of `audio` in Hz, used to convert seconds
            to sample indices.
        padding: Lead-in kept before each onset, in seconds.
    """
    os.makedirs(output_dir, exist_ok=True)

    n_samples = len(audio)
    # Work in seconds throughout; the signal length is converted once here.
    duration = n_samples / sample_rate
    writer_cls = estd.AudioWriter if np.ndim(audio) == 2 else estd.MonoWriter
    onset_times = [float(t) for t in onset_times]

    for i, onset_time in enumerate(onset_times):
        start_time = max(0.0, onset_time - padding)
        if i + 1 < len(onset_times):
            # Stop where the next clip's lead-in begins.
            end_time = onset_times[i + 1] - padding
        else:
            end_time = duration
        end_time = min(duration, end_time)

        start_sample = int(start_time * sample_rate)
        end_sample = int(end_time * sample_rate)
        if end_sample <= start_sample:
            # Onsets closer together than the padding leave nothing to save.
            continue

        # Extract the portion of audio
        onset_audio = audio[start_sample:end_sample]

        # Save the onset audio to a file
        output_filename = os.path.join(output_dir, f'{prefix}_{i + 1}.wav')
        writer_cls(filename=output_filename, format='wav')(onset_audio)


# Cut audio around onsets for both methods.
# The clean mono recording is used (not the stereo signals that carry the
# marker beeps), and each method gets its own file prefix so the second call
# does not overwrite the clips written by the first.
cut_audio_around_onsets(audio, onset_times_hfc_list, output_dir, prefix='onset_hfc')
cut_audio_around_onsets(audio, onset_times_complex_list, output_dir, prefix='onset_complex')