In [None]:
from asr_dataset.police import BpcETL, AmbiguityStrategy
from asr_dataset.constants import Cluster
import pandas as pd
import numpy as np
import librosa

In [None]:
import logging

# Turn on DEBUG-level output for the 'asr' logger hierarchy so that messages
# emitted under that logger name are not filtered out by level.
asr_logger = logging.getLogger('asr')
asr_logger.setLevel(logging.DEBUG)

In [None]:
# Build the ETL object for the AI cluster with all three `filter_*` options
# switched off and the ambiguity strategy set to ALL.
# NOTE(review): presumably this keeps every transcript variant so that
# disagreements between transcribers remain visible -- confirm against BpcETL.
etl = BpcETL(
    Cluster.AI,
    ambiguity=AmbiguityStrategy.ALL,
    filter_uncertain=False,
    filter_numeric=False,
    filter_inaudible=False,
)

In [None]:
# Run the extract step. Later cells treat `data` as a pandas DataFrame and read
# its 'offset', 'duration', 'original_audio', 'transcriber', 'audio' and 'text'
# columns.
data = etl.extract()

In [None]:
# Give every utterance an explicit end time, then self-join on the source
# recording so each row pairs two utterances (x, y) cut from the same audio.
data = data.assign(end=data['offset'] + data['duration'])
candidates = pd.merge(data, data, on='original_audio')

# Drop "reverse duplicates" -- (x, y) and (y, x) describe the same pair -- by
# keeping the orientation in which x starts no later than y. With x on the
# left, the overlap test reduces to "x ends at or after y begins".
# Note: when both offsets are equal the `<=` comparison holds in both
# orientations, so such pairs still appear twice.
x_starts_first = candidates['offset_x'].le(candidates['offset_y'])
x_reaches_y = candidates['end_x'].ge(candidates['offset_y'])

# Only pairs written by two different transcribers are of interest; this also
# discards each row's pairing with itself.
different_scriber = candidates['transcriber_x'].ne(candidates['transcriber_y'])

candidates = candidates.loc[x_starts_first & x_reaches_y & different_scriber]

# Count distinct utterance audio files that take part in at least one pair.
involved_audio = pd.concat([candidates['audio_x'], candidates['audio_y']])
n_candidates = involved_audio.nunique()
print(f'Found {n_candidates} that overlap somewhat.')

In [None]:
# Minimum share of the shorter utterance that must be covered by the overlap
# for a pair to be kept.
OVERLAP_THRESHOLD = .5   # arbitrary

# x starts no later than y (guaranteed by the filter that built `candidates`),
# so the shared span starts at y's offset and is capped either by x's end
# (`intersect`) or by y's own length when y lies entirely inside x.
intersect = candidates['end_x'] - candidates['offset_y']

# Element-wise minimum via numpy instead of a row-by-row `apply(min, axis=1)`:
# same values, without a Python-level call per row.
overlap = np.minimum(candidates['duration_y'], intersect)
shorter = np.minimum(candidates['duration_x'], candidates['duration_y'])

candidates = candidates.loc[(overlap / shorter) > OVERLAP_THRESHOLD]
n_candidates = pd.concat([candidates['audio_x'], candidates['audio_y']]).nunique()

# Report the threshold actually applied rather than a hard-coded "50%".
print(f'Found {n_candidates} that overlap > {OVERLAP_THRESHOLD:.0%}.')


In [None]:
# Spot-check: print 20 randomly drawn transcript pairs side by side. Each draw
# is an independent single-row sample, so the same pair may show up more than
# once.
for draw in range(20):
    left_text, right_text = candidates[['text_x', 'text_y']].sample().iloc[0]
    print(f"{left_text} vs {right_text}")

In [None]:
# Cherry Pick 1: Negation
# Select pairs whose x transcript contains both 'FEMALE IN YOUR CAR' and "DON"
# while the y transcript contains "DO HAVE".
# `na=False` makes missing transcripts count as non-matches directly, instead
# of patching NaNs afterwards with `.fillna(False)` (which the text_y test
# previously did not do at all). `regex=False` matches the literal substring.
x_mentions_female = candidates['text_x'].str.contains('FEMALE IN YOUR CAR', na=False, regex=False)
x_has_don = candidates['text_x'].str.contains("DON", na=False, regex=False)
y_has_do_have = candidates['text_y'].str.contains("DO HAVE", na=False, regex=False)
cherry = candidates.loc[x_mentions_female & x_has_don & y_has_do_have]

# Print each distinct (audio, text) pairing once for manual inspection.
for tup in cherry[['audio_x', 'audio_y', 'text_x', 'text_y']].drop_duplicates().itertuples():
    print(f"{tup.audio_x}")
    print(f"{tup.text_x}")
    print(f"{tup.text_y}")
    print(f"{tup.audio_y}")
    print("---")

# Hand-picked example paths.
# NOTE(review): both paths are identical -- plausible if two transcribers
# labelled the exact same segment, but confirm this is not a copy-paste slip.
cherry_audio_x = '/net/projects/uri/data/utterances/Zone1/2018_08_05/201808050001-100238-27730/853366_857758.flac'
cherry_audio_y = '/net/projects/uri/data/utterances/Zone1/2018_08_05/201808050001-100238-27730/853366_857758.flac'

In [None]:
# Cherry Pick 2: Shell Casing
# Select pairs whose x transcript contains 'SHELL CASING BY THE'.
# `na=False` makes missing transcripts count as non-matches directly, instead
# of patching NaNs afterwards with `.fillna(False)`; `regex=False` matches the
# literal substring.
x_mentions_casing = candidates['text_x'].str.contains('SHELL CASING BY THE', na=False, regex=False)
cherry = candidates.loc[x_mentions_casing]

# Print each distinct (audio, text) pairing once for manual inspection.
for tup in cherry[['audio_x', 'audio_y', 'text_x', 'text_y']].drop_duplicates().itertuples():
    print(f"{tup.audio_x}")
    print(f"{tup.text_x}")
    print(f"{tup.text_y}")
    print(f"{tup.audio_y}")
    print("---")

# Hand-picked example paths for this case.
cherry_audio_x = '/net/projects/uri/data/utterances/Zone8/2018_08_10/201808101202-916683-27158/268396_270685.flac'
cherry_audio_y = '/net/projects/uri/data/utterances/Zone8/2018_08_10/201808101202-916683-27158/268522_270528.flac'

In [None]:
# Cherry Pick 3: Traffic Stop vs Shots
# Select pairs whose x transcript contains 'TRAFFIC STOP LAWRENCE'.
# `na=False` makes missing transcripts count as non-matches directly, instead
# of patching NaNs afterwards with `.fillna(False)`; `regex=False` matches the
# literal substring.
x_mentions_stop = candidates['text_x'].str.contains('TRAFFIC STOP LAWRENCE', na=False, regex=False)
cherry = candidates.loc[x_mentions_stop]

# Print each distinct (audio, text) pairing once for manual inspection.
for tup in cherry[['audio_x', 'audio_y', 'text_x', 'text_y']].drop_duplicates().itertuples():
    print(f"{tup.audio_x}")
    print(f"{tup.text_x}")
    print(f"{tup.text_y}")
    print(f"{tup.audio_y}")
    print("---")

# Hand-picked example paths for this case.
cherry_audio_x = '/net/projects/uri/data/utterances/Zone1/2018_08_05/201808050359-300564-27730/433621_435561.flac'
cherry_audio_y = '/net/projects/uri/data/utterances/Zone1/2018_08_05/201808050359-300564-27730/433755_435281.flac'