In [19]:
import collections
import contextlib
import sys
import wave
import webrtcvad
from sklearn import metrics

def read_wave(path):
    """Reads a .wav file.
    Takes the path, and returns (PCM audio data, sample rate).
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        pcm_data = wf.readframes(wf.getnframes())
        return pcm_data, sample_rate

class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration

def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.
    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.
    Yields Frames of the requested duration.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n

def vad_collector(key, sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames):
    """Filters out non-voiced audio frames.
    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.
    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.
    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.
    Arguments:
    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).
    """
    
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    boundaries = np.zeros((1000,2))
    i = 0
    j = 0
    
    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                #sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                boundaries[i][0] = ring_buffer[0][0].timestamp
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                #sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                boundaries[i][1] = (frame.timestamp + frame.duration)
                i = i+1
                triggered = False
                #yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    if triggered:
        #sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
        boundaries[i][1] = (frame.timestamp + frame.duration)
        i = i+1
#     sys.stdout.write('\n')
    boundaries = boundaries[:i,:]
    filename = key[:-4] + '_slices'
    np.save(filename, boundaries)
    
    predictions = np.zeros(vad_dict[key]['nonsilent_slices'][-1][1])
    for n in range(boundaries.shape[0]):
        start = int(boundaries[n][0]*1000)
        end = int(boundaries[n][1]*1000)
        predictions[start:end] = 1  
    
    truth = np.zeros(vad_dict[key]['nonsilent_slices'][-1][1])
    for n in range(len(vad_dict[key]['nonsilent_slices'])):
        start = vad_dict[key]['nonsilent_slices'][n][0]
        end = vad_dict[key]['nonsilent_slices'][n][1]
        truth[start:end] = 1                  
    
    changes = np.where(np.diff(np.sign(predictions)))[0]
    
    for i in range(len(changes)-1):
        if changes[i] + 60 > changes[i+1]:
            print(changes[i],changes[i+1])
            a = changes[i]
            b = changes[i+1]
            predictions[a:b] = predictions[a]
    
    precision = metrics.precision_score(truth, predictions)
    recall = metrics.recall_score(truth, predictions)
    f1 = metrics.f1_score(truth, predictions)
 
    return precision, recall, f1

In [20]:
import numpy as np
import glob, os
import pickle

path = '/project/graziul/ra/anishk/VAD/Data'

pkl_path = '/project/graziul/ra/ajays/whitelisted_vad_dict.pkl'
pkl_file = open(pkl_path,'rb')
vad_dict = pickle.load(pkl_file)

precision_list = []
recall_list = []
f1_list = []

keys = ['/project/graziul/data/Zone1/2018_08_11/201808112236-743532-27730.mp3',
        '/project/graziul/data/Zone1/2018_08_12/201808120005-475816-27730.mp3']
#for key in vad_dict.keys():
for key in keys:
    print(key)
    title = os.path.basename(key)[:-4] 
    file = '/project/graziul/ra/anishk/VAD/Data/' + title + '.wav'
    audio, sample_rate = read_wave(file)
    vad = webrtcvad.Vad(1)
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    precision, recall, f1 = vad_collector(key, sample_rate, 30, 300, vad, frames)
    f1_list.append(f1)
    recall_list.append(recall)
    precision_list.append(precision)

/project/graziul/data/Zone1/2018_08_11/201808112236-743532-27730.mp3
/project/graziul/data/Zone1/2018_08_12/201808120005-475816-27730.mp3
829828 829858
1611688 1611718


In [6]:
print(np.mean(np.array(recall_list)))
print(np.mean(np.array(precision_list)))
print(np.mean(np.array(f1_list)))

0.9857481975662491
0.5510419190508227
0.7035610850253097


In [13]:
vad_dict

{'/project/graziul/data/Zone1/2018_08_10/201808100021-669974-27730.mp3': {'nonsilent_slices': [(7528,
    8482),
   (9967, 14075),
   (25076, 26428),
   (30112, 31040),
   (31941, 35228),
   (36368, 37057),
   (43233, 44559),
   (47660, 48879),
   (49542, 54896),
   (58316, 59641),
   (60993, 61470),
   (146532, 147539),
   (149474, 150694),
   (151568, 154298),
   (379504, 380697),
   (383639, 384461),
   (385680, 388728),
   (392015, 393314),
   (448900, 450596),
   (452107, 452584),
   (676517, 677736),
   (686536, 688525),
   (691361, 692527),
   (693720, 700586),
   (759856, 761552),
   (762665, 764097),
   (765396, 771015),
   (772500, 773692),
   (802055, 804441),
   (807489, 807966),
   (890059, 890960),
   (893187, 894274),
   (897322, 898754),
   (900582, 901563),
   (987261, 989011),
   (992085, 992881),
   (993888, 1001575),
   (1002370, 1002847),
   (1197437, 1198683),
   (1201254, 1202208),
   (1203613, 1208252),
   (1215038, 1216045),
   (1649120, 1650763),
   (1654845, 

In [None]:
'/project/graziul/data/Zone1/2018_08_11/201808112236-743532-27730.mp3'
'/project/graziul/data/Zone1/2018_08_12/201808120005-475816-27730.mp3'