In [1]:
from asr_dataset.police import BpcETL, AmbiguityStrategy
from asr_dataset.constants import Cluster, DATASET_DIRS
import librosa
import numpy as np
import pandas as pd
import os

In [2]:
import logging
logging.getLogger('asr').setLevel(logging.DEBUG)

In [3]:
cluster = Cluster.AI

In [4]:
etl = BpcETL(cluster,
    filter_inaudible=False, 
    filter_numeric=False, 
    filter_uncertain=False,
    ambiguity=AmbiguityStrategy.ALL)

In [17]:
data = etl.extract()
whitelist = pd.read_csv(DATASET_DIRS[cluster]['police_mp3s'] + "/whitelisted_vad_files.csv", names=['files'])
in_whitelist = data['original_audio'].apply(lambda x: os.path.basename(x)).isin(whitelist['files'])
print(f'Discarding {len(data) - in_whitelist.sum()} utts insuitable for VAD')
data = data.loc[in_whitelist]

Discarding 35880 utts insuitable for VAD


In [18]:
hop_len = 20 # ms
hop_sec = hop_len / 1000
sr = 16000 # hz
hop_sam = sr * hop_sec

In [19]:
data = data.assign(end = data['offset'] + data['duration'])
data = data.assign(start_frame = librosa.time_to_frames(data['offset'], sr=16000, hop_length=hop_sam),
                    end_frame = librosa.time_to_frames(data['end'], sr=16000, hop_length=hop_sam))        

In [20]:
def compute_fleiss(data):
    # Make binary speech arrays
    frame_speech = {}

    aud_frames = {}
    for aud in data['original_audio'].unique():
        if os.path.exists(aud):
            n_sec = librosa.get_duration(filename=aud, sr=sr)
        else:
            n_sec = 30 * 60
        aud_frames[aud] = librosa.time_to_frames(n_sec ,sr=sr, hop_length=hop_sam)

    for tup in data.itertuples():
        aud = tup.original_audio
        ts = tup.transcriber
        speech = frame_speech.get((aud, ts), np.zeros(aud_frames[aud]))
        speech[tup.start_frame : tup.end_frame] += 1
        frame_speech[(aud, ts)] = speech
    
    # Compute total speech and fleiss agreement score
    n_annotators = {}
    for aud, ts in frame_speech.keys():
        n_annotators[aud] = n_annotators.get(aud, 0) + 1

    aud_speech = {}
    for aud, ts in frame_speech.keys():
        speech = aud_speech.get(aud, np.zeros(aud_frames[aud]))
        speech += frame_speech[(aud, ts)] > 0  #compare to 0 to avoid double-counting duplicate records
        aud_speech[aud] = speech

    fleiss_agreement = {}
    pct_agreement = {}
    for aud in aud_speech.keys():
        speech = aud_speech[aud]
        na = n_annotators[aud]
        non_speech = na - speech
        assert((speech <= na).all())
        assert((speech >= 0).all())
        assert((non_speech >= 0).all())
        norm = 1 if na == 1 else (1. / (na * (na - 1)))
        fleiss = norm * (speech * (speech - 1) + non_speech * (non_speech - 1))
        pct = (speech - 1) * (speech > 0) + (non_speech - 1) * (non_speech > 0)
        fleiss_agreement[aud] = fleiss
        pct_agreement[aud] = pct

    sum_agree = 0
    len_agree = 0
    sum_speech = 0
    sum_pct = 0
    for aud, arr in fleiss_agreement.items():
        sum_agree += arr.sum()
        sum_speech += aud_speech[aud].sum()
        sum_pct += pct_agreement[aud].sum()
        len_agree += len(arr)
    avg_agree = sum_agree / len_agree
    p_speech = sum_speech / len_agree
    p_silence = 1 - sum_speech / len_agree
    print(f'p_speech {p_speech:.3f}, p_nonspeech {p_silence:.3f}')

    var_prob = p_speech * p_speech + p_silence * p_silence

    kappa = (avg_agree - var_prob) / (1 - var_prob)
    pct_agg = sum_pct / len_agree
    std_err = np.sqrt(pct_agg * (1 - pct_agg) / (len_agree * (1 - var_prob)*(1 - var_prob)))
    ci_low, ci_high = kappa - 1.96 * std_err, kappa + 1.96 * std_err
    print(f'Pct Aggreement {pct_agg :.3f}')
    print(f'Kappa {kappa:.3f} +/- {1.96 * std_err : .6f}')

    return pct_agg, kappa, std_err

In [21]:
compute_fleiss(data)

p_speech 0.335, p_nonspeech 0.665
Pct Aggreement 0.749
Kappa 0.436 +/-  0.000622


(0.7485551337665967, 0.43587702131028944, 0.0003172609721521282)

Fleiss Kappa (rough not universally accepted not context-independent) Rule of Thumb:

Kappa | Interpretation
--- | ---
< 0 | Poor
.01 - .20 | Slight
.21 - .40 | Fair
.41 - .60 | Moderate
.61 - .80 | Substantial
.81 - 1 | Almost Perfect

In [22]:
zone1 = data.loc[data['original_audio'].str.contains('Zone1').fillna(False)]
compute_fleiss(zone1)

p_speech 0.235, p_nonspeech 0.765
Pct Aggreement 0.770
Kappa 0.361 +/-  0.000940


(0.7704108484145789, 0.36124575914277196, 0.0004796458439563995)

In [23]:
zone4 = data.loc[data['original_audio'].str.contains('Zone4').fillna(False)]
compute_fleiss(zone4)

p_speech 0.335, p_nonspeech 0.665
Pct Aggreement 0.828
Kappa 0.615 +/-  0.005482


(0.8284432775407504, 0.6151647916358401, 0.002797083703064903)

In [24]:
zone8 = data.loc[data['original_audio'].str.contains('Zone8').fillna(False)]
compute_fleiss(zone8)

p_speech 0.513, p_nonspeech 0.487
Pct Aggreement 0.708
Kappa 0.415 +/-  0.000972


(0.7077904195526118, 0.4152131618263185, 0.0004957947569327375)

In [33]:
print(f"len zone 1, 4, 8: {len(zone1)}, {len(zone4)}, {len(zone8)}")

len zone 1, 4, 8: 10620, 133, 15164
