In [1]:
import logging
# Send every log record (DEBUG and above) emitted anywhere in the process to
# the notebook's stderr stream via the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
# Re-running this cell must not stack a second console handler on the root
# logger, otherwise every message is printed once per execution of the cell.
# An exact type check is used because FileHandler (and other handlers)
# subclass StreamHandler and should not count as "console already attached".
if not any(type(handler) is logging.StreamHandler for handler in root_logger.handlers):
    root_logger.addHandler(logging.StreamHandler())

In [2]:
from asr_dataset.police import BpcETL, AmbiguityStrategy
from asr_dataset.constants import DataSizeUnit, Cluster

# Select the cluster configuration by name.
cluster = Cluster['RCC']

# Build the ETL with all three filter flags switched off and the ALL ambiguity
# strategy, then run it.
# NOTE(review): presumably this keeps inaudible / uncertain / numeric
# transcripts in the output so they can be labelled below — confirm against
# BpcETL.
etl = BpcETL(
    cluster,
    filter_inaudible=False,
    filter_uncertain=False,
    filter_numeric=False,
    ambiguity=AmbiguityStrategy.ALL,
)
data = etl.etl()

police-extracted dataset stats:
	Row count = 62762
	Min duration = 0.01 (sec)
	Max duration = 24.93 (sec)
	Mean duration = 2.27 (sec)
	Stdev duration = 2.43 (sec)
	Total duration = 1 days 15:38:58.708010
Discarding 3407 missing audios.
Discarding 287 too_short mp3s.
Discarding 49 transcripts with no text.
police-transformed dataset stats:
	Row count = 59019
	Min duration = 0.04 (sec)
	Max duration = 24.93 (sec)
	Mean duration = 2.27 (sec)
	Stdev duration = 2.42 (sec)
	Total duration = 1 days 13:13:30.104800
Writing utterance audio clips.
Writing audio took 0:00:13.222853.
Discarding 0 missing audios.
Discarding 0 too_short mp3s.
police-loaded dataset stats:
	Row count = 59019
	Min duration = 0.04 (sec)
	Max duration = 24.93 (sec)
	Mean duration = 2.27 (sec)
	Stdev duration = 2.42 (sec)
	Total duration = 1 days 13:13:30.104800


In [3]:
# Add three mutually exclusive boolean flag columns describing each transcript:
#   inaudible - text matches one of the ETL's BAD_WORDS (case-insensitive)
#   uncertain - not inaudible, and text contains a non-empty [bracketed] span
#   clean     - neither of the above
# The lambdas receive the frame as built so far, so `uncertain` can read
# `inaudible` and `clean` can read both.
data = data.assign(
    # NOTE(review): BAD_WORDS are joined unescaped into a single regex; if any
    # entry contains regex metacharacters it will not match literally —
    # confirm against etl.BAD_WORDS.
    inaudible=data['text'].str.contains('|'.join(etl.BAD_WORDS), regex=True, case=False),
    # Raw string: '\[' and '\]' are invalid escapes in a normal literal and
    # raise a SyntaxWarning on Python 3.12+. The pattern itself is unchanged.
    uncertain=lambda x: ~x['inaudible'] & x['text'].str.contains(r'\[.+\]', regex=True),
    clean=lambda x: ~x['inaudible'] & ~x['uncertain'],
)

In [4]:
# Number of rows carrying each flag, rendered as one summary string.
" and ".join(f"{data[flag].sum()} {flag}" for flag in ('inaudible', 'uncertain', 'clean'))

'7907 inaudible and 10099 uncertain and 41013 clean'

In [58]:
# Class name -> integer class id.
labeler = {
    'clean': 0,
    'uncertain': 1,
    'inaudible': 2,
}
# Integer class id -> class name, derived from `labeler` so the two mappings
# cannot drift apart.
unlabeler = {class_id: class_name for class_name, class_id in labeler.items()}

In [6]:
import numpy as np
import pandas as pd

In [7]:
# Collapse the three boolean flag columns into one integer class id per row.
#
# np.select gives priority to the first matching condition, so the flags are
# listed in reverse of the original assignment order (the last write won).
# Rows matching no flag fall back to the 'clean' id (0), as before.
#
# Assigning the resulting ndarray is positional, so it does not depend on
# data's index. The previous pd.Series(np.zeros(len(data))) was aligned on a
# fresh 0..n-1 index, which leaves NaN wherever data's index has gaps and
# forces a float dtype on what are integer class ids.
flag_priority = ('uncertain', 'clean', 'inaudible')
data['label'] = np.select(
    [data[flag].to_numpy(dtype=bool) for flag in flag_priority],
    [labeler[flag] for flag in flag_priority],
    default=labeler['clean'],
)

In [75]:
import torchtext
from torchtext.data import get_tokenizer
from torchtext.data.utils import ngrams_iterator
from collections import Counter
tokenizer = get_tokenizer("basic_english")

# One frequency counter per integer class id: one set for word tokens, one set
# for character-level n-grams.
word_counters = {class_id: Counter() for class_id in labeler.values()}
char_bigram_counters = {class_id: Counter() for class_id in labeler.values()}

for row in data.itertuples():
    # Remove the BAD_WORDS markers (literal, case-sensitive replacement) and
    # the square brackets, so the counters reflect only the remaining text and
    # not the annotations used to assign the label.
    stripped = row.text
    for marker in etl.BAD_WORDS:
        stripped = stripped.replace(marker, '')
    stripped = stripped.replace('[', '').replace(']', '')

    word_counters[row.label].update(tokenizer(stripped))
    # The string is passed directly, so it is iterated character by character.
    # NOTE(review): torchtext's ngrams_iterator appears to yield the individual
    # items as well as their n-grams, so these counters likely hold single
    # characters in addition to bigrams — confirm if pure bigrams are intended.
    char_bigram_counters[row.label].update(ngrams_iterator(stripped, 2))


In [76]:
def aligned_counts(c1, c2):
    """Zero-pad two count mappings onto their shared vocabulary.

    Args:
        c1: Mapping of key -> count (e.g. a ``Counter``).
        c2: Mapping of key -> count.

    Returns:
        A pair of ``Counter`` objects that both contain every key found in
        either input, in the same key order, with 0 for keys an input lacked.
        ``list(a.values())`` and ``list(b.values())`` are therefore
        position-aligned vectors.
    """
    # Build the union once and reuse it for both outputs, so alignment holds
    # by construction rather than relying on two separately built sets
    # iterating in the same order.
    vocab = list(c1.keys() | c2.keys())
    padded1 = Counter({key: c1.get(key, 0) for key in vocab})
    padded2 = Counter({key: c2.get(key, 0) for key in vocab})
    return padded1, padded2

In [77]:
from scipy.spatial import distance

In [78]:
def js_dist(c1, c2):
    """Jensen-Shannon distance between two count mappings.

    The counts are first padded onto a shared vocabulary with
    ``aligned_counts``; the raw count vectors are then handed to
    ``scipy.spatial.distance.jensenshannon``, which normalises them itself.
    """
    left, right = aligned_counts(c1, c2)
    left_vec = list(left.values())
    right_vec = list(right.values())
    return distance.jensenshannon(left_vec, right_vec)

def corr_dist(c1, c2):
    """Correlation distance between two count mappings.

    The counts are padded onto a shared vocabulary with ``aligned_counts`` and
    passed to ``scipy.spatial.distance.correlation``. The result is a
    distance (one minus the Pearson correlation), so 0 means perfectly
    correlated count vectors.
    """
    left, right = aligned_counts(c1, c2)
    left_vec = list(left.values())
    right_vec = list(right.values())
    return distance.correlation(left_vec, right_vec)

def jacc_dist(c1, c2):
    """Jaccard distance between the key sets of two count mappings.

    After padding both inputs onto a shared vocabulary, each count is reduced
    to a presence flag (count > 0) and the two boolean vectors are compared
    with ``scipy.spatial.distance.jaccard``.
    """
    left, right = aligned_counts(c1, c2)
    present_left = [count > 0 for count in left.values()]
    present_right = [count > 0 for count in right.values()]
    return distance.jaccard(present_left, present_right)

def jacc_sim(c1, c2):
    """Jaccard similarity between the key sets of two ``Counter`` objects.

    ``Counter``'s ``&`` and ``|`` keep only keys with a positive resulting
    count, so the lengths below are the sizes of the intersection and union of
    the keys that occur at least once.

    Args:
        c1: A ``Counter``.
        c2: A ``Counter``.

    Returns:
        ``|intersection| / |union|`` as a float in [0, 1]. Two empty counters
        are treated as identical and give 1.0, matching
        ``scipy.spatial.distance.jaccard``, which reports distance 0 for two
        all-zero vectors; this previously raised ``ZeroDivisionError``.
    """
    union_size = len(c1 | c2)
    if union_size == 0:
        return 1.0
    return len(c1 & c2) / union_size

In [79]:
# Use Laplace smoothing because the word sets don't fully intersect.
def cross_entropy(c1, c2, alpha=1.0):
    """Cross-entropy H(p, q) = -sum_k p(k) * log q(k) between two count mappings.

    ``p`` is the relative frequency of each key in ``c1``. ``q`` is the
    additively smoothed relative frequency of the key in ``c2``, taken over
    the union vocabulary of both inputs::

        q(k) = (c2[k] + alpha) / (sum(c2) + alpha * |vocab|)

    Without smoothing, any key present in ``c1`` but absent from ``c2`` gives
    ``log(0)`` and the result is infinite.

    Args:
        c1: Mapping of key -> count for the reference distribution ``p``.
        c2: Mapping of key -> count for the model distribution ``q``.
        alpha: Additive smoothing constant. 1.0 is Laplace (add-one)
            smoothing; 0 disables smoothing.

    Returns:
        The cross-entropy in nats as a float. 0.0 when ``c1`` has no mass;
        ``inf`` when ``alpha`` is 0 and ``c1`` contains a key that ``c2``
        lacks.
    """
    total1 = sum(c1.values())
    if total1 == 0:
        # Nothing to average over; also avoids dividing by zero below.
        return 0.0

    vocab_size = len(c1.keys() | c2.keys())
    denom2 = sum(c2.values()) + alpha * vocab_size

    ce = 0.0
    for key, count in c1.items():
        if count == 0:
            # A zero-probability term contributes nothing to the sum.
            continue
        smoothed = c2.get(key, 0) + alpha
        if smoothed == 0:
            # Only reachable with alpha == 0: q(key) is exactly zero.
            return float('inf')
        ce -= (count / total1) * np.log(smoothed / denom2)
    return float(ce)

In [80]:
import torch
# Both arguments are log-probabilities (log_target=True). Per the PyTorch
# documentation, 'batchmean' divides the summed loss by the size of the first
# dimension, which for the 1-D tensors used here is the vocabulary size.
klfunc = torch.nn.KLDivLoss(reduction='batchmean', log_target=True)
def kl_div(c1, c2):
    """KL-divergence loss between softmax-normalised count vectors.

    The counts are padded onto a shared vocabulary with ``aligned_counts``,
    turned into log-probabilities with a softmax over the raw counts, and
    passed to ``klfunc`` as ``(input=c1, target=c2)``.

    The log-softmax is computed with ``torch.log_softmax`` rather than an
    explicit ``log(exp(v) / sum(exp(v)))``: the latter overflows to ``inf``
    (and then NaN) once any count exceeds roughly 709, which is routine for
    word counts. The two forms are mathematically identical.

    Args:
        c1: Mapping of key -> count used as the loss input.
        c2: Mapping of key -> count used as the loss target.

    Returns:
        A 0-dimensional float64 ``torch.Tensor`` holding the loss.
    """
    a1, a2 = aligned_counts(c1, c2)
    counts1 = torch.tensor(list(a1.values()), dtype=torch.float64)
    counts2 = torch.tensor(list(a2.values()), dtype=torch.float64)
    t1 = torch.log_softmax(counts1, dim=0)
    t2 = torch.log_softmax(counts2, dim=0)
    return klfunc(t1, t2)

In [81]:
def compare_corpora(counters):
    """Print distance metrics for every unordered pair of labelled counters.

    Args:
        counters: Mapping of label id -> count mapping. Label ids must be
            keys of the module-level ``unlabeler`` and mutually comparable.

    Each pair is reported once, for the ordering where the first label id is
    smaller than the second. Three values are printed per pair: the
    correlation distance (printed under the heading "Correlation"), the
    Jaccard distance and the Jensen-Shannon distance.
    """
    # Heading -> metric function, in the order the lines are printed.
    metrics = (
        ('Correlation', corr_dist),
        ('Jaccard Dist', jacc_dist),
        ('Jensen-Shannon', js_dist),
    )
    for first_label, first_counts in counters.items():
        for second_label, second_counts in counters.items():
            first_name = unlabeler[first_label]
            second_name = unlabeler[second_label]
            # Skip self-comparisons and the mirrored duplicate of each pair.
            if first_label >= second_label:
                continue
            print(f'{first_name} vs {second_name}...')
            for heading, metric in metrics:
                loss = metric(first_counts, second_counts)
                print(f'\t {heading} = {loss:.3f}')

In [82]:
# Pairwise distances between the per-label word-token counters.
compare_corpora(word_counters)

clean vs uncertain...
	 Correlation = 0.081
	 Jaccard Dist = 0.627
	 Jensen-Shannon = 0.203
clean vs inaudible...
	 Correlation = 0.136
	 Jaccard Dist = 0.684
	 Jensen-Shannon = 0.241
uncertain vs inaudible...
	 Correlation = 0.013
	 Jaccard Dist = 0.658
	 Jensen-Shannon = 0.205


In [83]:
# Same comparison on the per-label counters filled from ngrams_iterator over
# the characters of the cleaned text.
compare_corpora(char_bigram_counters)

clean vs uncertain...
	 Correlation = 0.005
	 Jaccard Dist = 0.279
	 Jensen-Shannon = 0.075
clean vs inaudible...
	 Correlation = 0.014
	 Jaccard Dist = 0.327
	 Jensen-Shannon = 0.099
uncertain vs inaudible...
	 Correlation = 0.006
	 Jaccard Dist = 0.248
	 Jensen-Shannon = 0.070
