In [1]:
from asr_dataset.police import BpcETL, AmbiguityStrategy
from asr_dataset.constants import Cluster
import librosa
import numpy as np
import pandas as pd

In [2]:
import logging
logging.getLogger('asr').setLevel(logging.DEBUG)

In [3]:
etl = BpcETL(Cluster.AI, 
    filter_inaudible=False, 
    filter_numeric=False, 
    filter_uncertain=False,
    ambiguity=AmbiguityStrategy.ALL)

In [4]:
data = etl.extract()

In [5]:
hop_len = 20 # ms
hop_sec = hop_len / 1000
sr = 16000 # hz
hop_sam = sr * hop_sec

In [6]:
data = data.assign(end = data['offset'] + data['duration'])
data = data.assign(start_frame = librosa.time_to_frames(data['offset'], sr=16000, hop_length=hop_sam),
                    end_frame = librosa.time_to_frames(data['end'], sr=16000, hop_length=hop_sam))        

In [7]:
# Make binary speech arrays
frame_speech = {}
n_frames = librosa.time_to_frames(30 * 60, sr=sr, hop_length=hop_sam)
for tup in data.itertuples():
    aud = tup.original_audio
    ts = tup.transcriber
    speech = frame_speech.get((aud, ts), np.zeros(n_frames))
    speech[tup.start_frame : tup.end_frame] += 1
    frame_speech[(aud, ts)] = speech

In [8]:
# Compute total speech and fleiss agreement score
n_annotators = {}
for aud, ts in frame_speech.keys():
    n_annotators[aud] = n_annotators.get(aud, 0) + 1

aud_speech = {}
for aud, ts in frame_speech.keys():
    speech = aud_speech.get(aud, np.zeros(n_frames))
    speech += frame_speech[(aud, ts)] > 0  #compare to 0 to avoid double-counting duplicate records
    aud_speech[aud] = speech

fleiss_agreement = {}
pct_agreement = {}
for aud in aud_speech.keys():
    speech = aud_speech[aud]
    na = n_annotators[aud]
    non_speech = na - speech
    assert((speech <= na).all())
    assert((speech >= 0).all())
    assert((non_speech >= 0).all())
    norm = 1 if na == 1 else (1. / (na * (na - 1)))
    fleiss = norm * (speech * (speech - 1) + non_speech * (non_speech - 1))
    pct = (speech - 1) * (speech > 0) + (non_speech - 1) * (non_speech > 0)
    fleiss_agreement[aud] = fleiss
    pct_agreement[aud] = pct

In [9]:
sum_agree = 0
len_agree = 0
sum_speech = 0
sum_pct = 0
for aud, arr in fleiss_agreement.items():
    sum_agree += arr.sum()
    sum_speech += aud_speech[aud].sum()
    sum_pct += pct_agreement[aud].sum()
    len_agree += len(arr)
avg_agree = sum_agree / len_agree
p_speech = sum_speech / len_agree
p_silence = 1 - sum_speech / len_agree
print(f'p_speech {p_speech:.3f}, p_nonspeech {p_silence:.3f}')

p_speech 0.286, p_nonspeech 0.714


In [10]:
var_prob = p_speech * p_speech + p_silence * p_silence

In [11]:
kappa = (avg_agree - var_prob) / (1 - var_prob)
print(f'Kappa {kappa:.3f}')
print(f'Pct Aggreement {sum_pct / len_agree :.3f}')

Kappa 0.203
Pct Aggreement 0.770


Fleiss Kappa (rough not universally accepted not context-independent) Rule of Thumb:

Kappa | Interpretation
--- | ---
< 0 | Poor
.01 - .20 | Slight
.21 - .40 | Fair
.41 - .60 | Moderate
.61 - .80 | Substantial
.81 - 1 | Almost Perfect