In [None]:
import os
import glob
import numpy as np
import scipy.io.wavfile as wav
from scipy.fftpack import fft
from scipy import interpolate
from scipy.ndimage import gaussian_filter
import librosa
import warnings
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from importlib import reload
import h5py

### Preprocessor

In [1]:
class Preprocessor:
    """Turn WAV recordings into frame-level features and regression labels.

    Public entry points are :meth:`load_audio`, :meth:`extract_feature` and
    :meth:`create_labels`; the underscore-prefixed methods are helpers.
    """

    def load_audio(self, audio_filename, signal_length=None, sample_rate=44100):
        """Read a WAV file as a mono signal with a fixed duration and rate.

        Parameters
        ----------
        audio_filename : str
            Path of the WAV file.
        signal_length : float or None
            Target duration in seconds. Longer signals are truncated, shorter
            ones are zero-padded on both sides. ``None`` keeps the length.
        sample_rate : int
            Sampling rate of the returned signal; the audio is resampled with
            librosa when the file was stored at a different rate.

        Returns
        -------
        y : numpy.ndarray
            1-D signal (first channel only for multi-channel files).
        fs : int
            Sampling rate of ``y``.
        """
        # Keep the warning suppression local to this call instead of
        # switching warnings off for the whole session.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fs, y = wav.read(audio_filename)
            y = np.asarray(y)

            # Keep only the first channel of multi-channel recordings.
            if len(y.shape) == 2:
                y = y[:, 0]

            # Scale integer PCM to [-1, 1); int16 is divided by 32768 as before.
            if np.issubdtype(y.dtype, np.integer):
                info = np.iinfo(y.dtype)
                if info.min == 0:
                    # Unsigned PCM is centred on mid-scale.
                    half_range = (info.max + 1) / 2.0
                    y = (y - half_range) / half_range
                else:
                    y = y / (info.max + 1.0)

            if signal_length is not None:
                target_len = int(signal_length * fs)
                if y.size > target_len:
                    # Cut the signal if it is too long
                    y = y[:target_len]
                elif y.size < target_len:
                    # Pad the signal with zeros if it is too short. The extra
                    # sample of an odd difference goes to the right so that the
                    # result has exactly ``target_len`` samples.
                    missing = target_len - y.size
                    pad_left = missing // 2
                    y = np.pad(y, (pad_left, missing - pad_left), mode='constant')

            if fs != sample_rate:
                # Keyword arguments work with old and new librosa releases
                # (positional rates are rejected by librosa >= 0.10).
                y = librosa.resample(y, orig_sr=fs, target_sr=sample_rate)
                fs = sample_rate

        return y, fs

    def extract_feature(self, signal, sample_rate, win_len, hop_len, mel_band_num, adj_feat_num, rm_filts_len):
        """Compute the per-frame feature matrix of a signal.

        Parameters
        ----------
        signal : numpy.ndarray
            1-D audio signal.
        sample_rate : int
            Sampling rate of ``signal``.
        win_len, hop_len : int
            Window and hop length in samples (Hamming window, FFT size equal
            to ``win_len``).
        mel_band_num : int
            Number of Mel bands of the log-Mel feature.
        adj_feat_num : int
            Number of neighbouring frames added on each side of the three 1-D
            features (short-time energy, top relevant frequency, high-frequency
            power).
        rm_filts_len : iterable of int
            Odd running-mean filter lengths applied to the 1-D features.

        Returns
        -------
        features_full : numpy.ndarray
            Shape ``(frames, 3 * (2 * adj_feat_num + 1) + mel_band_num)``.
        time_vec : numpy.ndarray
            Start time of every frame in seconds, measured on the padded signal.
        """
        window = np.hamming(win_len)
        fs = sample_rate

        # Pad the signal to take into account window at the edges
        y = np.pad(signal, int(win_len / 2), mode='constant')
        nb_frames = int((y.size - win_len) / hop_len) + 1

        fft_len = win_len
        # Keyword arguments are required by librosa >= 0.10 and accepted before.
        fft_mel_bands = librosa.filters.mel(sr=fs, n_fft=fft_len, n_mels=mel_band_num, fmin=0.0).T
        fft_en = np.zeros((nb_frames, int(1 + fft_len / 2)))

        lms = np.zeros((nb_frames, mel_band_num))
        st_energy = np.zeros((nb_frames,))
        time_vec = np.zeros((nb_frames,))

        for i in range(nb_frames):
            start = i * hop_len
            frame = y[start: start + win_len]
            spectrum = np.abs(fft(frame * window)[: 1 + int(fft_len / 2)])
            # Avoid exact zeros so that later logarithms stay finite.
            spectrum[spectrum == 0] = np.finfo(np.float32).eps
            fft_en[i, :] = spectrum
            lms[i, :] = np.dot(fft_en[i, :] ** 2, fft_mel_bands)
            # Short-time energy is taken from the un-windowed frame.
            st_energy[i] = np.sum(np.power(frame, 2))
            time_vec[i] = start / fs
        short_time_ft = fft_en.T

        ste_feat = self._energy_feature(st_energy.copy(), adj_feat_num, rm_filts_len)
        trf_feat = self._trf_feature(short_time_ft.copy(), adj_feat_num, rm_filts_len)
        hfp_feat = self._hfp_feature(short_time_ft.copy(), fs, adj_feat_num, rm_filts_len)
        lms_feat = self._lms_feature(lms)
        features_full = np.concatenate((ste_feat, trf_feat, hfp_feat, lms_feat), axis=0).T

        return features_full, time_vec

    def _postprocess_1d(self, feat, extra_points, rm_filts_len):
        """Smooth and normalise a 1-D feature, then stack its shifted copies.

        Returns an array of shape ``(2 * extra_points + 1, frames)``, or
        ``(1, frames)`` when ``extra_points`` is not positive.
        """
        feat = self._filter_and_normalize(feat, rm_filts_len)
        if extra_points > 0:
            return self._extend_feature(feat, extra_points)
        return feat.reshape(1, -1)

    def _energy_feature(self, energy, extra_points, rm_filts_len):
        """Short-time-energy feature built from the per-frame energy vector."""
        return self._postprocess_1d(energy, extra_points, rm_filts_len)

    def _extend_feature(self, feat, extra_points):
        """Stack ``2 * extra_points + 1`` time-shifted copies of a 1-D feature.

        The feature is extended by ``extra_points`` extrapolated values on each
        side; row ``r`` of the result holds, for every frame, the value found
        ``r - extra_points`` frames away, so the middle row is ``feat`` itself.
        """
        refer_points = 10  # Number of edge points used in extrapolation
        floor = np.finfo(np.float32).eps  # extrapolated values are clipped from below
        feat_len = feat.size
        add_left = self._extrapolate_points(feat[:refer_points], extra_points, 'L', floor)
        add_right = self._extrapolate_points(feat[-refer_points:], extra_points, 'R', floor)
        padded = np.concatenate((add_left, feat, add_right))

        return np.array([padded[i: i + feat_len] for i in range(2 * extra_points + 1)])

    def _extrapolate_points(self, x, point_no, direction='R', threshold=None):
        """Quadratically extrapolate ``point_no`` values beyond one end of ``x``.

        ``direction`` is ``'L'`` (values preceding ``x``) or ``'R'`` (values
        following it). Results below ``threshold`` are raised to ``threshold``
        when one is given.
        """
        assert direction == 'L' or direction == 'R', "Extrapolation direction not valid"
        extra_points = np.zeros((point_no, ))
        refer_points = x.size
        interp = interpolate.interp1d(range(refer_points), x, fill_value="extrapolate", kind="quadratic")
        # Positions are expressed on the index axis of ``x`` (0 .. refer_points-1).
        first_pos = -point_no if direction == 'L' else refer_points
        for k in range(point_no):
            extra_points[k] = interp(first_pos + k)

        if threshold is not None:
            extra_points[extra_points < threshold] = threshold
        return extra_points

    def _lms_feature(self, lms):
        """Log-Mel feature; returns the transposed log of ``lms``.

        ``lms`` is indexed ``[frame, band]`` on input; the result is indexed
        ``[band, frame]``. The input array is not modified.
        """
        eps = np.finfo(np.float32).eps
        lms = np.log(np.where(lms == 0, eps, lms).T)

        # NOTE(review): after the transpose the second axis is the frame axis,
        # so the slices below compare and overwrite the first *frames* of each
        # band, not the lowest Mel bands. Behaviour is kept as it was; confirm
        # which axis was intended.
        mean_diff = np.mean(np.mean(lms[:, 2:5], axis=1) - np.mean(lms[:, 0:2], axis=1))
        mean_diff_thresh = 3.0
        inter_points = 7
        if mean_diff > mean_diff_thresh:
            for k in range(lms.shape[0]):
                inter_val, _, _ = self._linear_interp(lms[k, 2:2 + inter_points], 2, direction='L')
                lms[k, 0:2] = inter_val
        return lms

    def _linear_interp(self, array, point_no, direction='L'):
        """Fit a line to ``array`` and evaluate it at ``point_no`` outside points.

        Returns the extrapolated values together with the slope and intercept
        of the fitted line (both plain Python floats).
        """
        assert direction == 'L' or direction == 'R', "Extrapolation direction not valid"

        arr_len = np.size(array)
        x = np.linspace(0, arr_len - 1, arr_len).reshape(-1, 1)
        lin_reg = LinearRegression().fit(x, array.reshape(-1, 1))
        # ``np.float`` no longer exists in current NumPy; pick the single
        # element explicitly and convert with the builtin.
        coef = float(np.ravel(lin_reg.coef_)[0])
        interc = float(np.ravel(lin_reg.intercept_)[0])

        if direction == 'L':
            x_extra = np.linspace(-point_no, -1, point_no)
        else:
            x_extra = np.linspace(arr_len, arr_len + point_no - 1, point_no)

        y_extra = coef * x_extra + interc
        return y_extra, coef, interc

    def _trf_feature(self, stft, extra_points, rm_filts_len):
        """Per-frame index of the highest frequency bin above a global threshold.

        ``stft`` is indexed ``[bin, frame]``. It is smoothed with a Gaussian
        filter, thresholded at three times its median, and the index of the
        last bin above the threshold is taken for every frame (0 if none).
        """
        stft = gaussian_filter(stft, sigma=3, truncate=3)
        thresh = 3 * np.median(stft)
        trf = np.zeros((stft.shape[1], ))
        for k in range(stft.shape[1]):
            above = np.flatnonzero(stft[:, k] > thresh)
            trf[k] = above[-1] if above.size > 0 else 0
        return self._postprocess_1d(trf, extra_points, rm_filts_len)

    def _running_mean(self, x, n):
        """Centred running mean of odd length ``n`` with edge correction.

        Near the borders the average is taken over the samples that actually
        overlap the signal instead of implicitly counting zeros. ``x`` should
        hold at least ``n`` samples (``np.convolve`` in ``'same'`` mode returns
        ``max(len(x), n)`` values).

        Raises
        ------
        ValueError
            If ``n`` is even.
        """
        if n % 2 == 0:
            raise ValueError("Filter length is not odd")
        aver = np.convolve(x, np.ones((n,)) / n, mode='same')
        n_half = int((n - 1) / 2)
        # With n == 1 there is no edge to correct (and ``aver[-0:]`` would
        # address the whole array), so the correction is skipped.
        if n_half > 0:
            corr_length = np.arange(n_half + 1, n) / n
            aver[:n_half] = aver[:n_half] / corr_length
            aver[-n_half:] = aver[-n_half:] / corr_length[::-1]
        return aver

    def _filter_and_normalize(self, x, filts_len):
        """Apply the running-mean filters in turn and rescale to ``[0, 1]``.

        The input array is left untouched. A signal that is constant after
        filtering yields all zeros instead of NaN.
        """
        x = np.asarray(x, dtype=float)
        for filt_len in filts_len:
            x = self._running_mean(x, filt_len)
        x = x - np.min(x)
        peak = np.max(x)
        if peak > 0:
            x = x / peak
        return x

    def _hfp_feature(self, stft, fs, extra_points, rm_filts_len):
        """High-frequency power: summed squared magnitude at and above 6 kHz.

        ``stft`` is indexed ``[bin, frame]`` with bins spread linearly between
        0 and ``fs / 2``. When ``fs / 2`` is below the limit no bin qualifies
        and the feature is all zeros.
        """
        freq_limit = 6000  # Lower limit for high-freq range
        freq = np.linspace(0, fs / 2, stft.shape[0])
        # First bin whose frequency is >= freq_limit (``freq`` is ascending).
        freq_limit_ind = int(np.searchsorted(freq, freq_limit, side='left'))
        hf_en = np.sum(np.power(stft[freq_limit_ind:, :], 2), axis=0)
        return self._postprocess_1d(hf_en, extra_points, rm_filts_len)

    def create_labels(self, audio_filename, time, time_dist_threshold):
        """Build the clipped time-distance labels of one recording.

        The annotation file shares the audio file's name with a ``.txt``
        extension and holds one time value per line; blank lines are ignored
        and negative values do not contribute to the labels.

        Parameters
        ----------
        audio_filename : str
            Path of the audio file (only used to locate the ``.txt`` file).
        time : numpy.ndarray
            Time instants at which the labels are evaluated.
        time_dist_threshold : float
            Upper bound of the label values.

        Returns
        -------
        labels : numpy.ndarray
            For every entry of ``time``, the distance to the nearest annotated
            instant, clipped to ``time_dist_threshold``.
        vehicle_count : int
            Number of annotation lines, or 0 when the file is empty or its
            first value is negative.
        """
        file_name, _ = os.path.splitext(audio_filename)
        with open(file_name + '.txt') as f:
            minima_positions = np.array([float(line) for line in f if line.strip()], dtype=float)
        minima_no = minima_positions.size

        vehicle_count = 0
        if minima_no > 0 and minima_positions[0] >= 0:
            vehicle_count = minima_no

        time = np.asarray(time)
        labels = np.full(time.size, time_dist_threshold, dtype=float)
        valid_positions = minima_positions[minima_positions >= 0]
        if valid_positions.size > 0:
            # Distance of every time instant to its closest annotated instant.
            nearest = np.min(np.abs(time.reshape(1, -1) - valid_positions.reshape(-1, 1)), axis=0)
            labels = np.fmin(labels, nearest)

        return labels, vehicle_count

In [None]:
# Shared feature/label extractor used by the dataset-building cells below.
preprocessor = Preprocessor()

### Feature extraction and label parameters

In [None]:
sample_rate = 44100 # Sampling rate in Hz; keep equal to the rate load_audio resamples to (its default is 44100)
window_len = 4096   # Window length in samples
hop_perc = 40 # Hop length relative to window size (percents)
hop_len = int((hop_perc / 100.0) * window_len) # Hop length in the STFT calculation
mel_bands_num = 64 # Number of Mel-bands to calculate Mel-band energy feature
signal_len = 20.0 # Signal length in seconds
# Number of time samples where features are calculated. Derived from the values above the same
# way extract_feature counts frames (signal padded by window_len / 2 on both sides), so the
# pre-allocated arrays stay in sync when a parameter changes. Evaluates to 539 for these settings.
time_samples_num = int((int(signal_len * sample_rate) + 2 * int(window_len / 2) - window_len) / hop_len) + 1
adj_feat_num = 10 # Number of adjacent features to concatenate (only for 1-D features)
feat_num = mel_bands_num + 3 * (2 * adj_feat_num + 1) # Number of features (mel_bands_num + 1-D time features)
run_mean_filts_len = (11, 5) # Lengths of running mean filters for postprocessing of 1-D features
time_dist_threshold = 0.75 # Time distance threshold

### Create Dataset

In [None]:
 # Inline build of the cross-validation dataset. The create_dataset function
 # defined further down performs the same steps for an arbitrary folder.
 audio_folder = "../VC-PRG-1_5/"
 output_dataset_name = 'dataset_cross-validation.h5'

 audio_files = sorted(os.path.basename(file) for file in glob.glob(os.path.join(audio_folder, '*.wav')))
 if not audio_files:
     raise FileNotFoundError("No .wav files found in " + audio_folder)

 files_num = len(audio_files)
 features = np.zeros((files_num, time_samples_num, feat_num))
 labels = np.zeros((files_num, time_samples_num))
 places = np.zeros((files_num, ))
 veh_count = np.zeros((files_num, ))

 for file_index in tqdm(range(files_num)):
     audio_file_name = os.path.join(audio_folder, audio_files[file_index])
     y, fs = preprocessor.load_audio(audio_file_name, signal_length=signal_len)

     features[file_index, :, :], time = preprocessor.extract_feature(y, fs, window_len, hop_len, mel_bands_num, adj_feat_num, run_mean_filts_len)
     labels[file_index, :], veh_count[file_index] = preprocessor.create_labels(audio_file_name, time, time_dist_threshold)
     # Characters 6-7 of the file name are parsed as the numeric place id.
     places[file_index] = int(audio_files[file_index][6:8])

 output_folder = 'datasets/'
 # h5py does not create missing directories.
 os.makedirs(output_folder, exist_ok=True)
 # The context manager closes the file even if writing a dataset fails.
 with h5py.File(os.path.join(output_folder, output_dataset_name), 'w') as hf:
     hf.create_dataset('features', data=features, compression="gzip")
     hf.create_dataset('labels', data=labels, compression="gzip")
     hf.create_dataset('places', data=places, compression="gzip")
     hf.create_dataset('vehicle_count', data=veh_count, compression="gzip")

In [80]:
 def create_dataset(audio_folder, output_dataset_name, output_folder):
     """Extract features and labels for every WAV file of a folder into one HDF5 file.

     Relies on the notebook-level ``preprocessor`` instance and on the feature
     and label parameters (``time_samples_num``, ``feat_num``, ``signal_len``,
     ``window_len``, ``hop_len``, ``mel_bands_num``, ``adj_feat_num``,
     ``run_mean_filts_len``, ``time_dist_threshold``).

     Parameters
     ----------
     audio_folder : str
         Folder holding the ``.wav`` files and their ``.txt`` annotations.
     output_dataset_name : str
         File name of the HDF5 dataset to write.
     output_folder : str
         Folder the dataset is written to; created when missing.

     Raises
     ------
     FileNotFoundError
         If ``audio_folder`` contains no ``.wav`` file.

     The file holds the datasets ``features``, ``labels``, ``places`` and
     ``vehicle_count``, one entry per audio file in sorted file-name order.
     """
     audio_files = sorted(os.path.basename(file) for file in glob.glob(os.path.join(audio_folder, '*.wav')))
     if not audio_files:
         raise FileNotFoundError("No .wav files found in " + audio_folder)

     files_num = len(audio_files)
     features = np.zeros((files_num, time_samples_num, feat_num))
     labels = np.zeros((files_num, time_samples_num))
     places = np.zeros((files_num, ))
     veh_count = np.zeros((files_num, ))

     for file_index in tqdm(range(files_num)):
         audio_file_name = os.path.join(audio_folder, audio_files[file_index])
         y, fs = preprocessor.load_audio(audio_file_name, signal_length=signal_len)

         features[file_index, :, :], time = preprocessor.extract_feature(y, fs, window_len, hop_len, mel_bands_num, adj_feat_num, run_mean_filts_len)
         labels[file_index, :], veh_count[file_index] = preprocessor.create_labels(audio_file_name, time, time_dist_threshold)
         # Characters 6-7 of the file name are parsed as the numeric place id.
         places[file_index] = int(audio_files[file_index][6:8])

     # h5py does not create missing directories.
     os.makedirs(output_folder, exist_ok=True)
     # The context manager closes the file even if writing a dataset fails.
     with h5py.File(os.path.join(output_folder, output_dataset_name), 'w') as hf:
         hf.create_dataset('features', data=features, compression="gzip")
         hf.create_dataset('labels', data=labels, compression="gzip")
         hf.create_dataset('places', data=places, compression="gzip")
         hf.create_dataset('vehicle_count', data=veh_count, compression="gzip")

In [None]:
# Cross-validation dataset: source recordings, destination folder and file name.
audio_folder = "../VC-PRG-1_5/"
output_folder = 'datasets/'
output_dataset_name = 'dataset_cross-validation.h5'
create_dataset(
    audio_folder=audio_folder,
    output_dataset_name=output_dataset_name,
    output_folder=output_folder,
)

In [81]:
# Unseen-test dataset: source recordings, destination folder and file name.
audio_folder = "../VC-PRG-6/"
output_folder = 'datasets/'
output_dataset_name = 'dataset_test_unseen.h5'
create_dataset(
    audio_folder=audio_folder,
    output_dataset_name=output_dataset_name,
    output_folder=output_folder,
)

100%|██████████| 172/172 [07:15<00:00,  2.53s/it]
