In [1]:
import numpy as np
import soundfile as sf

# Frame length in milliseconds (my_windowing converts it to samples using the sampling rate).
frame_length = 32
# Shift between the starts of consecutive frames, in milliseconds.
frame_shift = 16

In [2]:
# Each result is indexed below as [0] for the sample data and [1] for the sampling rate.
speech1 = sf.read('resources/speech1.wav')
phone = sf.read('resources/phone.wav')

In [15]:
# Shape of the speech1 sample array.
speech1[0].shape

(45466,)

In [16]:
# Shape of the phone sample array.
phone[0].shape

(32000,)

In [5]:
# Show the sampling rate stored at index 1 of each loaded recording.
print(f'sampling rate of speech 1 is {speech1[1]}')
print(f'sampling rate of phone is {phone[1]}')

sampling rate of speech 1 is 16000
sampling rate of phone is 16000


In [6]:
# Only one sampling rate is kept for both recordings, so make that assumption
# explicit instead of silently discarding the rate of the phone recording.
assert speech1[1] == phone[1], 'speech1 and phone must share one sampling rate'
sampling_rate = speech1[1]
# Half the sampling rate: the highest frequency the sampled signal can represent.
nyquist_frequency = sampling_rate/2
speech1_data = speech1[0]
phone_data = phone[0]

In [7]:
def my_windowing(v_signal: np.ndarray, sampling_rate: int, frame_length: int, frame_shift: int) -> [np.ndarray, np.ndarray]:
    """Split a one-dimensional signal into overlapping frames of equal length.

    Args:
        v_signal: One-dimensional array of signal samples.
        sampling_rate: Sampling rate of ``v_signal`` in Hz.
        frame_length: Length of each frame in milliseconds.
        frame_shift: Shift between the starts of consecutive frames in milliseconds.

    Returns:
        ``[m_frames, v_time_frame]``. ``m_frames`` is a float64 array of shape
        ``(num_frames, samples_per_frame)``; ``v_time_frame`` holds the centre
        of every frame in seconds. Trailing samples that do not fill a whole
        frame are dropped, and a signal shorter than one frame yields zero frames.

    Raises:
        ValueError: If the frame length or the frame shift is shorter than one sample.
    """
    # Convert the durations to whole sample counts exactly once. Truncating
    # per frame (as a float start/end pair) can make individual frames one
    # sample longer than the others when the duration is not a whole number
    # of samples, which breaks the fixed-width frame matrix.
    samples_per_frame = int(frame_length * sampling_rate / 1000)
    samples_per_shift = int(frame_shift * sampling_rate / 1000)
    if samples_per_frame < 1 or samples_per_shift < 1:
        raise ValueError('frame_length and frame_shift must each span at least one sample')

    v_signal = np.asarray(v_signal)
    # Floor division counts only complete frames; clamp so that a signal
    # shorter than one frame gives an empty result instead of a negative size.
    num_frames = max(0, (len(v_signal) - samples_per_frame) // samples_per_shift + 1)

    v_starts = np.arange(num_frames) * samples_per_shift
    # Row i holds the sample indices start_i .. start_i + samples_per_frame - 1.
    m_indices = v_starts[:, np.newaxis] + np.arange(samples_per_frame)
    m_frames = v_signal[m_indices].astype(np.float64)
    # Frame centre: (start + end) / 2 samples, converted to seconds.
    v_time_frame = (2 * v_starts + samples_per_frame) / (2 * sampling_rate)
    return [m_frames, v_time_frame]

In [55]:
def compute_freq_axis(m_mstft: np.ndarray, fs: float = None) -> np.ndarray:
    """Return the frequency in Hz assigned to each column of a one-sided spectrum.

    Args:
        m_mstft: One-sided spectrogram with one column per frequency bin.
        fs: Sampling rate in Hz. When omitted, the notebook-level
            ``sampling_rate`` variable is used.

    Returns:
        Evenly spaced frequencies from 0 to ``fs / 2`` with one entry per
        column of ``m_mstft``. This spacing assumes the last column is the
        Nyquist bin, which holds when the underlying frame size is even.
    """
    if fs is None:
        fs = sampling_rate
    # True division keeps the exact Nyquist frequency for odd sampling rates.
    return np.linspace(0, fs / 2, num=m_mstft.shape[1])

In [9]:
def remove_upper_half_spectrum(m_stft: np.ndarray) -> np.ndarray:
    """Keep only the first ``n // 2 + 1`` columns of a spectrum with ``n`` columns.

    Args:
        m_stft: Two-dimensional spectrum with one row per frame.

    Returns:
        The leading columns of ``m_stft``, from column 0 up to and including
        column ``n // 2``.
    """
    num_kept_bins = m_stft.shape[1] // 2 + 1
    return m_stft[:, :num_kept_bins]

In [61]:
def compute_stft(v_signal: np.ndarray, fs: int, frame_length: int, frame_shift: int, v_analysis_window: 'np.ndarray | callable') -> [np.ndarray, np.ndarray, np.ndarray]:
    """Compute the one-sided short-time Fourier transform of a signal.

    Args:
        v_signal: One-dimensional array of signal samples.
        fs: Sampling rate of ``v_signal`` in Hz.
        frame_length: Frame length in milliseconds.
        frame_shift: Shift between consecutive frames in milliseconds.
        v_analysis_window: Either a window function that is called with the
            frame size in samples (for example ``np.hanning``) or a ready-made
            window array with one value per frame sample.

    Returns:
        ``[m_stft, v_freq, v_time_frame]``: the complex spectrum with one row
        per frame and ``samples_per_frame // 2 + 1`` columns, the frequency of
        each column in Hz, and the time vector returned by ``my_windowing``.

    Raises:
        ValueError: If a window array is given whose length differs from the frame size.
    """
    m_frames, v_time_frame = my_windowing(v_signal, fs, frame_length, frame_shift)
    samples_per_frame = m_frames.shape[1]

    # Accept both a window generator and a precomputed window vector.
    if callable(v_analysis_window):
        v_window = v_analysis_window(samples_per_frame)
    else:
        v_window = np.asarray(v_analysis_window)
        if v_window.shape != (samples_per_frame,):
            raise ValueError('analysis window length must equal the frame size in samples')

    # Window every frame and transform all rows in one call.
    m_stft_full = np.fft.fft(m_frames * v_window, axis=1)
    m_stft = remove_upper_half_spectrum(m_stft_full)

    # Bin k of a length-N DFT sits at k * fs / N. Deriving the axis from the
    # fs argument keeps it correct for signals whose rate differs from any
    # notebook-level default.
    v_freq = np.arange(m_stft.shape[1]) * fs / samples_per_frame
    return [m_stft, v_freq, v_time_frame]


In [62]:
# STFT of the speech recording; np.hanning is passed as the analysis window generator.
m_stft, v_freq, v_time_frame = compute_stft(speech1_data, sampling_rate, frame_length, frame_shift, np.hanning)

In [63]:
# One-sided complex spectrum of the first frame.
m_stft[0]

array([ 3.15159594e-02+0.00000000e+00j, -1.72918832e-02+4.49958989e-04j,
        2.18811541e-03-8.10150384e-04j,  3.53872997e-04+6.49216715e-04j,
       -1.33583015e-03-2.59414873e-04j, -6.96425290e-04+5.99013280e-04j,
        1.42475514e-03-4.43600426e-04j, -6.83208479e-05-2.03867300e-04j,
       -7.60944489e-04-9.97047127e-05j,  6.53874644e-04+7.23824081e-04j,
       -2.63308687e-04-5.54928947e-04j,  4.43373300e-06-6.76756137e-04j,
        2.08669742e-04+8.98452626e-04j, -3.90999562e-04-1.71943415e-04j,
        4.50362646e-04-6.64912599e-05j, -3.01389701e-04+5.43142255e-04j,
       -3.98808777e-04-7.45394124e-04j,  5.35688191e-04+4.20543685e-04j,
        2.89302304e-04-2.88862047e-05j,  2.74539438e-05-3.08962850e-04j,
       -6.55832667e-04+2.91524501e-04j,  1.20304262e-04-2.02279350e-04j,
        3.61079258e-04+2.70801805e-04j, -5.91415020e-04-4.83771648e-04j,
        3.74110396e-04+4.12149460e-04j,  1.37310408e-04+1.52894608e-05j,
       -8.64456711e-05-1.87645642e-04j,  3.89741626

In [64]:
# Frequency of each spectrum column in Hz.
v_freq

array([   0.  ,   31.25,   62.5 ,   93.75,  125.  ,  156.25,  187.5 ,
        218.75,  250.  ,  281.25,  312.5 ,  343.75,  375.  ,  406.25,
        437.5 ,  468.75,  500.  ,  531.25,  562.5 ,  593.75,  625.  ,
        656.25,  687.5 ,  718.75,  750.  ,  781.25,  812.5 ,  843.75,
        875.  ,  906.25,  937.5 ,  968.75, 1000.  , 1031.25, 1062.5 ,
       1093.75, 1125.  , 1156.25, 1187.5 , 1218.75, 1250.  , 1281.25,
       1312.5 , 1343.75, 1375.  , 1406.25, 1437.5 , 1468.75, 1500.  ,
       1531.25, 1562.5 , 1593.75, 1625.  , 1656.25, 1687.5 , 1718.75,
       1750.  , 1781.25, 1812.5 , 1843.75, 1875.  , 1906.25, 1937.5 ,
       1968.75, 2000.  , 2031.25, 2062.5 , 2093.75, 2125.  , 2156.25,
       2187.5 , 2218.75, 2250.  , 2281.25, 2312.5 , 2343.75, 2375.  ,
       2406.25, 2437.5 , 2468.75, 2500.  , 2531.25, 2562.5 , 2593.75,
       2625.  , 2656.25, 2687.5 , 2718.75, 2750.  , 2781.25, 2812.5 ,
       2843.75, 2875.  , 2906.25, 2937.5 , 2968.75, 3000.  , 3031.25,
       3062.5 , 3093

The computed spectra are complex-conjugate symmetric because the input signal is real-valued: the discrete Fourier transform of a real signal of length $N$ always satisfies $X[N-k] = X^*[k]$.

Because the upper half of the spectrum is redundant, it can be removed, which saves memory and computation time. Moreover, for a given sampling frequency, only frequencies up to the Nyquist frequency (half the sampling frequency) are relevant for speech signal processing.

Another advantage is that considering only the non-redundant part of the spectrum makes visualization easier.