In [1]:
import scipy.io.wavfile as wav
import numpy as np
import matplotlib.pyplot as plt
import os
import math

from collections import namedtuple
from scipy.signal import find_peaks

# Directory that receives the segmented digits, one "<index>.wav" file each.
TARGET_DIR = "segmented_digits/"

# Lightweight record for an audio clip: samples (`y`), sampling rate (`fs`)
# and the base name of the file it was read from (`file`).
Signal = namedtuple("Signal", "y fs file")


def signal_from_file(filename):
    """Load a WAV file and wrap it in a `Signal`.

    The samples are divided by 2**15, i.e. the file is expected to hold
    16-bit integer audio.

    Parameters
    ----------
    filename : str
        Path of the WAV file to read.

    Returns
    -------
    Signal
        `y` holds the scaled samples, `fs` the sampling rate reported by
        `scipy.io.wavfile.read`, and `file` the base name of `filename`.
    """
    sample_rate, samples = wav.read(filename)

    # scale factor for 16-bit integer samples
    int16_scale = 2**15

    return Signal(
        y=samples / int16_scale,
        fs=sample_rate,
        file=os.path.basename(filename),
    )


def plot_signal(y, fs, title, peaks=None, vline=None):
    """Plot `y` against time and display the figure.

    Parameters
    ----------
    y : array-like
        Values to plot; sample `k` is drawn at time `k / fs`.
    fs : float
        Sampling rate used to convert sample indices to seconds.
    title : str
        Title of the figure.
    peaks : numpy.ndarray of int, optional
        Indices into `y` to highlight with "x" markers.
    vline : iterable of float, optional
        X positions (same units as the time axis) at which red vertical
        lines are drawn.
    """
    time_axis = np.arange(len(y)) / fs
    plt.plot(time_axis, y)

    # overlay markers on the requested sample indices
    if peaks is not None:
        plt.plot(peaks / fs, y[peaks], "x")

    # draw one red vertical line per requested position
    line_positions = () if vline is None else vline
    for x_pos in line_positions:
        plt.axvline(x_pos, color="r")

    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")
    plt.title(title)
    plt.show()


def non_overlapping_frames(signal, frame_duration):
    """Split a signal into consecutive, non-overlapping frames.

    Parameters
    ----------
    signal : Signal
        Object exposing the samples as `y` and the sampling rate as `fs`.
    frame_duration : float
        Duration of one frame in seconds.

    Returns
    -------
    numpy.ndarray
        Array of shape `(frame_len, total_frames)` in which column `j`
        holds the samples of frame `j`. Trailing samples that do not fill
        a whole frame are dropped.
    """
    samples_per_frame = round(frame_duration * signal.fs)
    n_frames = len(signal.y) // samples_per_frame

    # keep only the samples that fill complete frames
    usable = signal.y[: n_frames * samples_per_frame]

    # one frame per row, then transpose so that each column is a frame
    return usable.reshape(n_frames, samples_per_frame).T

In [2]:
# --- Load the recording and show the raw waveform --------------------------
signal = signal_from_file("digitos.wav")
plot_signal(signal.y, signal.fs, signal.file)

# --- Short-time energy on non-overlapping frames ---------------------------
frame_duration = 0.02  # frame length in seconds
frames = non_overlapping_frames(signal, frame_duration)
frame_len, total_frames = frames.shape
frames_fs = 1 / frame_duration  # number of frames per second

# every column of `frames` is one frame, so this yields one energy per frame
energy_frames = np.sum(frames**2, axis=0)

energy_threshold = 0.3  # minimum frame energy for a peak to be kept
peaks, _ = find_peaks(energy_frames, height=energy_threshold)

# --- Group neighbouring peaks into bursts ----------------------------------
# Consecutive peaks closer than `max_peak_gap` seconds belong to the same
# burst. Building explicit groups keeps `mid_values` and `dispersion` aligned
# (one entry per burst) and also copes with `peaks` being empty.
max_peak_gap = 0.5  # seconds
peak_groups = []
for peak in peaks:
    if peak_groups and peak - peak_groups[-1][-1] < max_peak_gap * frames_fs:
        peak_groups[-1].append(peak)
    else:
        peak_groups.append([peak])

# centre (mean frame index, truncated to int) and spread of every burst
mid_values = np.array([np.mean(group) for group in peak_groups]).astype(int)
dispersion = [np.std(group) for group in peak_groups]

# burst centres converted from frame indices to seconds
signal_mid_values = mid_values / frames_fs

plot_signal(energy_frames, frames_fs, "Energy", peaks=peaks)
plot_signal(energy_frames, frames_fs, "Energy", peaks=mid_values)
plot_signal(signal.y, signal.fs, "Energy", vline=signal_mid_values)

# --- Cut one clip per burst and save it ------------------------------------
half_window = 0.5  # seconds of audio kept on each side of a burst centre
empty_signal_duration = 0.5  # empty signal duration in seconds
empty_signal_len = round(empty_signal_duration * signal.fs)  # empty signal samples
empty_signal = np.zeros(empty_signal_len)

# wav.write does not create missing directories
os.makedirs(TARGET_DIR, exist_ok=True)

for i, value in enumerate(signal_mid_values):
    # clamp at 0: a negative start index would wrap around to the end of the
    # array and select the wrong samples
    speech_start = max(0, round((value - half_window) * signal.fs))
    # an end index past the last sample is simply truncated by slicing
    speech_end = round((value + half_window) * signal.fs)

    # pad the clip with silence on both sides
    speech = np.concatenate(
        [empty_signal, signal.y[speech_start:speech_end], empty_signal]
    )

    plot_signal(speech, signal.fs, "Speech")

    digit_path = os.path.join(TARGET_DIR, f"{i}.wav")
    wav.write(digit_path, signal.fs, speech)

FileNotFoundError: [Errno 2] No such file or directory: 'digitos.wav'