In [103]:
from typing import Callable, List, Optional, Union

import numpy as np
import soundfile as sf

# Analysis frame size and hop between frames, both in milliseconds
frame_length, frame_shift = 32, 16

In [104]:
# Each sf.read result is kept as a whole; later cells use index 0 as the samples and index 1 as the rate
speech1, phone = (
    sf.read(wav_path)
    for wav_path in ('resources/speech1.wav', 'resources/phone.wav')
)

In [105]:
# Shape of the speech1 sample array (index 0 of the sf.read result)
speech1[0].shape

(45466,)

In [106]:
# Shape of the phone sample array (index 0 of the sf.read result)
phone[0].shape

(32000,)

In [107]:
# Index 1 of each sf.read result is reported as that file's sampling rate
print(
    f'sampling rate of speech 1 is {speech1[1]}',
    f'sampling rate of phone is {phone[1]}',
    sep='\n',
)

sampling rate of speech 1 is 16000
sampling rate of phone is 16000


In [108]:
# Split the loaded (samples, rate) pair; speech1's rate is used as the notebook-wide sampling rate
speech1_data, sampling_rate = speech1
phone_data = phone[0]
# Half the sampling rate
nyquist_frequency = sampling_rate / 2

In [109]:
def my_windowing(v_signal: np.ndarray, sampling_rate: int, frame_length: int, frame_shift: int) -> List[np.ndarray]:
    """Split a 1-D signal into overlapping frames of equal length.

    Args:
        v_signal: One-dimensional signal.
        sampling_rate: Sampling rate of ``v_signal`` in Hz.
        frame_length: Length of one frame in milliseconds.
        frame_shift: Distance between the starts of consecutive frames in milliseconds.

    Returns:
        ``[m_frames, v_time_frame]``. ``m_frames`` is a float64 array of shape
        ``(num_frames, samples_per_frame)`` with one frame per row; ``v_time_frame``
        holds the centre of every frame in seconds. Both are empty (zero frames)
        when the signal is shorter than a single frame.

    Raises:
        ValueError: If ``sampling_rate`` or ``frame_shift`` is not positive.
    """
    if sampling_rate <= 0 or frame_shift <= 0:
        raise ValueError("sampling_rate and frame_shift must be positive")

    v_signal = np.asarray(v_signal, dtype=np.float64)

    # Multiply before dividing: x * fs / 1000 is exact whenever the result is a whole
    # number of samples, whereas x / (1000 / fs) can land just below it and truncate.
    frame_length_num_points = frame_length * sampling_rate / 1000
    frame_shift_num_points = frame_shift * sampling_rate / 1000
    samples_per_frame = int(frame_length_num_points)

    # A signal shorter than one frame yields zero frames instead of a negative count.
    num_frames = max(
        int(np.floor((len(v_signal) - frame_length_num_points) / frame_shift_num_points)) + 1,
        0,
    )

    # Only the start index is derived from the (possibly fractional) shift; every frame
    # then spans exactly samples_per_frame samples, so all rows have the same length.
    v_start = np.floor(np.arange(num_frames) * frame_shift_num_points).astype(int)
    m_frames = v_signal[v_start[:, np.newaxis] + np.arange(samples_per_frame)]

    # Centre of [start, start + samples_per_frame) converted from samples to seconds.
    v_time_frame = (2 * v_start + samples_per_frame) / (2 * sampling_rate)
    return [m_frames, v_time_frame]

In [140]:
def compute_stft(v_signal: np.ndarray, fs: int, frame_length: int, frame_shift: int, v_analysis_window: Union[np.ndarray, Callable[[int], np.ndarray]]) -> List[np.ndarray]:
    """Compute the one-sided short-time Fourier transform of a real signal.

    Args:
        v_signal: One-dimensional input signal.
        fs: Sampling rate of ``v_signal`` in Hz.
        frame_length: Frame length in milliseconds (forwarded to ``my_windowing``).
        frame_shift: Frame shift in milliseconds (forwarded to ``my_windowing``).
        v_analysis_window: Either a window vector with one value per frame sample,
            or a window function such as ``np.hanning`` that is called with the
            number of samples per frame.

    Returns:
        ``[m_stft, v_freq, v_time_frame]``: the spectrum of every frame (one frame
        per row, upper half of the bins removed by ``remove_upper_half_spectrum``),
        the frequency of each kept bin in Hz, and the frame times returned by
        ``my_windowing``.

    Raises:
        ValueError: If the window does not have exactly one value per frame sample.
    """
    m_frames, v_time_frame = my_windowing(v_signal, fs, frame_length, frame_shift)
    samples_per_frame = m_frames.shape[1]

    # Accept both a ready-made window vector and a window-generating function.
    if callable(v_analysis_window):
        v_window = np.asarray(v_analysis_window(samples_per_frame))
    else:
        v_window = np.asarray(v_analysis_window)
    if v_window.shape != (samples_per_frame,):
        raise ValueError(
            f"analysis window must have shape ({samples_per_frame},), got {v_window.shape}"
        )

    # The window broadcasts over the rows; one FFT call transforms every frame.
    m_stft_full = np.fft.fft(m_frames * v_window, axis=1)
    v_freq = np.fft.rfftfreq(samples_per_frame, 1 / fs)
    m_stft = remove_upper_half_spectrum(m_stft_full)
    return [m_stft, v_freq, v_time_frame]


In [123]:
def remove_upper_half_spectrum(m_stft: np.ndarray) -> np.ndarray:
    """Drop the upper half of the frequency bins of every row.

    Args:
        m_stft: 2-D spectrum matrix with the frequency bins along axis 1.

    Returns:
        The first ``N // 2 + 1`` columns of ``m_stft``, where ``N`` is its number
        of columns.
    """
    num_bins = m_stft.shape[1]
    num_kept = num_bins // 2 + 1
    return m_stft[:, :num_kept]

In [None]:
def compute_freq_axis(m_mstft_full, num_samples: int, fs: Optional[float] = None) -> np.ndarray:
    """Return the frequency of every DFT bin (column) of a spectrum matrix.

    Bin ``i`` of a DFT over ``num_samples`` samples lies at ``i * fs / num_samples``.

    NOTE(review): the previous version was an unfinished stub (undefined ``i_max``,
    no return value). Taking one bin per column of ``m_mstft_full`` is an
    assumption based on the row-per-frame layout used elsewhere in this notebook;
    confirm it matches the intended use.

    Args:
        m_mstft_full: 2-D spectrum matrix; one frequency is produced per column.
        num_samples: DFT length in samples.
        fs: Sampling rate. Defaults to the notebook-level ``sampling_rate``.

    Returns:
        1-D array with one frequency per column, in the units of ``fs``.

    Raises:
        ValueError: If ``num_samples`` is not positive.
    """
    if num_samples <= 0:
        raise ValueError("num_samples must be positive")
    if fs is None:
        # Fall back to the global the original stub read.
        fs = sampling_rate
    v_bin_index = np.arange(np.shape(m_mstft_full)[1])
    return v_bin_index * fs / num_samples

In [141]:
# STFT of speech1; np.hanning is handed over as the analysis window
m_stft, v_freq, v_time_frame = compute_stft(
    v_signal=speech1_data,
    fs=sampling_rate,
    frame_length=frame_length,
    frame_shift=frame_shift,
    v_analysis_window=np.hanning,
)

In [142]:
# Frequency axis returned by compute_stft (built there with np.fft.rfftfreq)
v_freq

array([   0.  ,   31.25,   62.5 ,   93.75,  125.  ,  156.25,  187.5 ,
        218.75,  250.  ,  281.25,  312.5 ,  343.75,  375.  ,  406.25,
        437.5 ,  468.75,  500.  ,  531.25,  562.5 ,  593.75,  625.  ,
        656.25,  687.5 ,  718.75,  750.  ,  781.25,  812.5 ,  843.75,
        875.  ,  906.25,  937.5 ,  968.75, 1000.  , 1031.25, 1062.5 ,
       1093.75, 1125.  , 1156.25, 1187.5 , 1218.75, 1250.  , 1281.25,
       1312.5 , 1343.75, 1375.  , 1406.25, 1437.5 , 1468.75, 1500.  ,
       1531.25, 1562.5 , 1593.75, 1625.  , 1656.25, 1687.5 , 1718.75,
       1750.  , 1781.25, 1812.5 , 1843.75, 1875.  , 1906.25, 1937.5 ,
       1968.75, 2000.  , 2031.25, 2062.5 , 2093.75, 2125.  , 2156.25,
       2187.5 , 2218.75, 2250.  , 2281.25, 2312.5 , 2343.75, 2375.  ,
       2406.25, 2437.5 , 2468.75, 2500.  , 2531.25, 2562.5 , 2593.75,
       2625.  , 2656.25, 2687.5 , 2718.75, 2750.  , 2781.25, 2812.5 ,
       2843.75, 2875.  , 2906.25, 2937.5 , 2968.75, 3000.  , 3031.25,
       3062.5 , 3093

The computed spectrum is complex-conjugate symmetric because the input signal is real-valued: for a real signal of length $N$, the discrete Fourier transform satisfies $X[N-k] = X^*[k]$.

Since the upper half of the spectrum is redundant, it can be removed, which saves memory and computation time. Moreover, for a given sampling frequency, only frequencies up to the Nyquist frequency (half the sampling frequency) can be represented, so these are the only frequencies relevant for speech signal processing.

Another advantage is that considering only the non-redundant part of the spectrum makes visualization easier.

In [115]:
#m_stft = np.zeros((m_frames.shape[0], (int(m_frames.shape[1]/2)+1)), dtype=np.complex128)
#m_stft.shape

In [116]:
#m_stft[0]