In [1]:
import pandas as pd
import numpy as np
import pydub
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import pickle
from multiprocessing import Pool
import os
from datetime import datetime
from collections import namedtuple
from itertools import combinations
import random

Range = namedtuple('Range',['start','end'])
os.chdir('/project/graziul/')

#### For each file: Use intersection of transcriber utterance timings as threshold to create union of transcriber utterance timings, then use union to clip audio - add silence between, save file, receive test audio corpus with manual VAD feature extraction

In [2]:
df_transcripts = pd.read_csv('transcripts/deprecated/transcripts.csv')
df_transcripts[['start','end','transcription']].head()

Unnamed: 0,start,end,transcription
0,00.02.21.252,00.02.31.279,RADIOSHOP TESTING ONE TWO THREE FOUR FIVE FIVE...
1,00.02.38.109,00.02.39.417,ONE TWO ONE TWO
2,00.02.48.327,00.02.49.235,UNIT COMING IN
3,00.02.55.330,00.02.57.437,ZONE ONE IS ON CITY [WIDE] FIVE
4,00.03.04.003,00.03.09.017,OKAY THANKS UH THIS IS THE RADIO SHOP TESTING ...


In [3]:
df_transcripts[['filename','transcriber']] = df_transcripts['file'].str.split('(\d*-\d*-\d*)',expand=True)[[1,2]]
def remove_filetype(x):
    y = x.replace('.xlsx','')
    y = y.replace('.txt','')
    return y
df_transcripts['transcriber'] = df_transcripts['transcriber'].apply(lambda x: remove_filetype(x))
df_transcripts['transcriber'] = df_transcripts['transcriber'].str.lower().str.strip()
df_transcripts = df_transcripts[df_transcripts['transcriber']!='xxx']
#df_transcripts[['filename','transcriber']]
#df_transcripts['transcriber'].value_counts()

In [4]:
df_transcripts['start_dt'] = pd.to_datetime(df_transcripts['start_dt'])
df_transcripts['end_dt'] = pd.to_datetime(df_transcripts['end_dt'])

In [5]:
def remove_irrelevant(x):
    x = str(x)
    x = x.replace("\[Uncertain\]","")
    x = x.replace("INAUDIBLE","")    
    x = x.replace("<X>","")
    x = ' '.join(x.split())
    x = ''.join([i for i in x if i.isalnum() or i==' '])
    x = x.strip()
    x = x.upper()
    return x

In [6]:
df_transcripts['start_ts'] = (df_transcripts['start_dt']-np.datetime64('1900-01-01T00:00:00.000000000')).dt.total_seconds()
df_transcripts['end_ts'] = (df_transcripts['end_dt']-np.datetime64('1900-01-01T00:00:00.000000000')).dt.total_seconds()

In [7]:
df_transcripts['Range'] = df_transcripts[['start_ts','end_ts']].apply(tuple,axis=1)
df_transcripts[['Range','transcription']].head()

Unnamed: 0,Range,transcription
0,"(141.252, 151.279)",RADIOSHOP TESTING ONE TWO THREE FOUR FIVE FIVE...
1,"(158.109, 159.417)",ONE TWO ONE TWO
2,"(168.327, 169.235)",UNIT COMING IN
3,"(175.33, 177.437)",ZONE ONE IS ON CITY [WIDE] FIVE
4,"(184.00300000000001, 189.01700000000002)",OKAY THANKS UH THIS IS THE RADIO SHOP TESTING ...


In [8]:
def get_prop_overlap(se1, se2):
    s1, e1 = se1
    s2, e2 = se2
    overlap = max(0,min(e1,e2)-max(s1,s2))
    prop_overlap = overlap/max(e1-s1,e2-s2)
    return prop_overlap

In [9]:
def get_se_voice(filename, df_transcripts):
    df = df_transcripts[df_transcripts['filename']==f].copy()
    # Extract information so we can reconstruct dir_path
    zone = df['zone'].unique()[0]
    year = df['year'].astype(str).unique()[0]
    month = df['month'].astype(str).apply(lambda x: x.zfill(2)).unique()[0]
    day = df['day'].astype(str).apply(lambda x: x.zfill(2)).unique()[0]
    date = year+'_'+month+'_'+day
    # Create a list to hold the ranges we derive
    se_voice_list = []
    # Create a list of "exhausted" tuples (i.e. do not try to compare to others)
    exhausted = []
    # Get transcribers
    transcribers = df['transcriber'].unique().tolist()
    # If only 1 transcription...
    # If 2+ transcriptions...
    if len(transcribers)>=2:
        # Pick 2 transcribers at random to avoid "issues"
        df_foruse = df[df['transcriber'].isin(random.sample(transcribers,2))].copy()
        # Get the ranges
        ranges = df_foruse['Range'].values
        # Check all permutations
        checked_se_tuples = []
        # Get target range
        for se1 in ranges:
            # Collect candidates
            candidate_se_list = []
            # Get start/end for target range
            s1, e1 = se1
            # Check overlap with remaining ranges
            for se2 in ranges:
                se_tuple = set((se1, se2))
                # Make sure it's not the target range and we have not checked for overlap of these ranges
                if se1!=se2: #) and len([i for i in checked_se_tuples if se_tuple in i])==0:
                    #checked_se_tuples.append(se_tuple)
                    prop_overlap = get_prop_overlap(se1, se2)
                    # If overlap more than 75%
                    if prop_overlap > 0.75 and [se2, se1] not in checked_se_tuples:
                        checked_se_tuples.append([se1, se2])
                        exhausted.append(se2)
                        exhausted.append(se1)
                        # Get start/end for other range
                        s2, e2 = se2
                        # Get the minimum start time of both periods 
                        start_voice = min(s1,s2)
                        # Get the maximum end time of both periods
                        end_voice = max(e1,e2)
                        # Get transcriptions associated with each range
                        t1 = df_foruse.loc[df['Range']==se1,'transcription'].apply(remove_irrelevant).values[0]
                        t2 = df_foruse.loc[df['Range']==se2,'transcription'].apply(remove_irrelevant).values[0]
                        # Add to candidate list
                        candidate_se_list.append([zone, date, f+'.mp3', start_voice, end_voice, t1==t2, t1, t2])
            if len(candidate_se_list)>0:
                se_voice_list.append(candidate_se_list[0])
    return se_voice_list

In [10]:
'''
        # If only one overlapping range is found then add it to the list of voice activity for the file
        if len(candidate_se_list)==1:
            se_voice_list.append(candidate_se_list[0])
        # If more than one overlapping range...
        if len(candidate_se_list)>1:
            # Add the first overlapping range (NOTE: THIS IS NOT IDEAL)
            se_voice_list.append(candidate_se_list[0])
'''


'\n        # If only one overlapping range is found then add it to the list of voice activity for the file\n        if len(candidate_se_list)==1:\n            se_voice_list.append(candidate_se_list[0])\n        # If more than one overlapping range...\n        if len(candidate_se_list)>1:\n            # Add the first overlapping range (NOTE: THIS IS NOT IDEAL)\n            se_voice_list.append(candidate_se_list[0])\n'

In [11]:
%%time
voice_list = []
for f in df_transcripts['filename'].unique():
    voice_list += get_se_voice(f,df_transcripts)

CPU times: user 14 s, sys: 22.1 ms, total: 14.1 s
Wall time: 14.1 s


In [23]:
df_va = pd.DataFrame(voice_list,columns=['zone','date','filename','start_voice','end_voice','transcripts_agree','transcription1','transcription2'])
df_va = df_va.drop_duplicates()
df_va.sort_values(['zone','date','start_voice'],inplace=True)
df_va.to_csv('transcripts/deprecated/df_va.csv')
df_va[['start_voice','end_voice','transcripts_agree']].head()

Unnamed: 0,start_voice,end_voice,transcripts_agree
2541,40.044,45.552,True
2542,71.0,72.786,True
2543,75.826,81.023,True
2544,96.567,100.19,False
2545,101.429,103.266,True


### This code for file-by-file removal of silence using manually provided VAD data

In [14]:
# Set some defaults 
silence_buffer = 2500 # ms
num_processors = 28

In [15]:
def get_all_voice_activity(df_va):
    p = Pool(processes = num_processors)
    output = p.starmap(extract_voice_activity,[(file,df_va) for file in df_va['filename'].unique()])
    p.close()

In [17]:
def extract_voice_activity(file, df_va, pad=200):
    # Get data associated with file
    df_temp = df_va[df_va['filename']==file].copy()
    # Get data needed to construct path to file
    zone = df_temp['zone'].unique()[0]
    date = df_temp['date'].unique()[0]
    # Load file
    try:
        mp3_file_in = 'data/'+zone+'/'+date+'/'+file
        pydub_audiosegment = AudioSegment.from_mp3(mp3_file_in)
    except:
        date_split = date.split('_')
        day = int(date_split[2])+1
        date = date_split[0]+'_'+date_split[1]+'_'+str(day).zfill(2)
        mp3_file_in = 'data/'+zone+'/'+date+'/'+file
        pydub_audiosegment = AudioSegment.from_mp3(mp3_file_in)
    # Get frame_rate
    fr = pydub_audiosegment.frame_rate
    # Create silence
    silence = AudioSegment.silent(duration = silence_buffer, frame_rate = fr)
    # Get (start, end) tuples to extract voice activity
    se = df_temp[['start_voice','end_voice']].sort_values('start_voice').values
    # Build up list of audio segments with voice activity  
    va_slices = []
    for s,e in se:
        start = max(0,s*1000-pad)
        end = min(e*1000+pad,len(pydub_audiosegment))
        va_slices.append(pydub_audiosegment[start:end])
    combined = silence
    for va_slice in va_slices:
        combined += va_slice + silence
    wav_file_out = 'data/'+zone+'/'+date+'/'+file.split('.')[0]+'va.wav'
    combined = combined.set_frame_rate(fr)
    combined.export(wav_file_out,format='wav')

In [18]:
%%time
get_all_voice_activity(df_va)

CPU times: user 237 ms, sys: 167 ms, total: 403 ms
Wall time: 9.29 s


### This code for creating test files using manually provided VAD data

In [22]:
df_va = pd.read_csv('transcripts/deprecated/df_va.csv')
df_va.drop(columns=df_va.columns.values[0],inplace=True)
df_va[['start_voice','end_voice','transcripts_agree']].head()

Unnamed: 0,start_voice,end_voice,transcripts_agree
0,40.044,45.552,True
1,71.0,72.786,True
2,75.826,81.023,True
3,96.567,100.19,False
4,101.429,103.266,True


In [None]:
# Now use these timings to compile sample data for use by potential vendors

In [24]:
# File #1: Both transcribers agree ("clean" = easy)
df_va1 = df_va[df_va['transcripts_agree']].copy()
df_va1 = df_va1.drop_duplicates()
df_va1 = df_va1[~df_va1['transcription1'].isna()]
len(df_va1)

1923

In [25]:
df_va1.sort_values(['date','start_voice'],inplace=True)
# Exhausted case indices
exhausted_cases = []
total_length = 0
# Create silence
silence = AudioSegment.silent(duration = silence_buffer, frame_rate = 22050)
combined = silence
files = df_va1['filename'].unique().tolist()
files.sort()
pad = 200
for file in files:
    if total_length < 10:
        # Get data associated with file
        df_temp = df_va1[df_va1['filename']==file].copy()
        # Get data needed to construct path to file
        zone = df_temp['zone'].unique()[0]
        date = df_temp['date'].unique()[0]
        # Load file
        try:
            mp3_file_in = 'data/'+zone+'/'+date+'/'+file
            pydub_audiosegment = AudioSegment.from_mp3(mp3_file_in)
        except:
            date_split = date.split('_')
            day = int(date_split[2])+1
            date = date_split[0]+'_'+date_split[1]+'_'+str(day).zfill(2)
            mp3_file_in = 'data/'+zone+'/'+date+'/'+file
            pydub_audiosegment = AudioSegment.from_mp3(mp3_file_in)
        # Get (start, end) tuples to extract voice activity
        se = df_temp[['start_voice','end_voice']].values
        # Build up audio segments with voice activity  
        for s,e in se:
            start = max(0,s*1000-pad)
            end = min(e*1000+pad,len(pydub_audiosegment))
            va_segment = pydub_audiosegment[start:end]
            if len(detect_nonsilent(va_segment))>0:
                #print(len(detect_nonsilent(va_segment)),file)
                if total_length < 10: # and va_segment not in combined:
                    print("File #1 now "+str(round(total_length,1))+" in length",file)
                    combined += va_segment + silence
                    total_length = len(combined)/(1000*60)
                    exhausted_cases.append(df_temp[(df_temp['start_voice']==s) & (df_temp['end_voice']==e)].index)
        #del df_temp, se
wav_file_out = 'transcripts/vendors/test1.wav'
combined = combined.set_frame_rate(22050)
combined.export(wav_file_out,format='wav')

File #1 now 0 in length 201808042331-339616-27730.mp3
File #1 now 0.2 in length 201808042331-339616-27730.mp3
File #1 now 0.3 in length 201808042331-339616-27730.mp3
File #1 now 0.4 in length 201808042331-339616-27730.mp3
File #1 now 0.5 in length 201808042331-339616-27730.mp3
File #1 now 0.5 in length 201808042331-339616-27730.mp3
File #1 now 0.7 in length 201808042331-339616-27730.mp3
File #1 now 0.7 in length 201808042331-339616-27730.mp3
File #1 now 0.8 in length 201808042331-339616-27730.mp3
File #1 now 0.9 in length 201808042331-339616-27730.mp3
File #1 now 0.9 in length 201808042331-339616-27730.mp3
File #1 now 1.0 in length 201808042331-339616-27730.mp3
File #1 now 1.1 in length 201808042331-339616-27730.mp3
File #1 now 1.2 in length 201808042331-339616-27730.mp3
File #1 now 1.3 in length 201808042331-339616-27730.mp3
File #1 now 1.3 in length 201808042331-339616-27730.mp3
File #1 now 1.5 in length 201808042331-339616-27730.mp3
File #1 now 1.6 in length 201808042331-339616-2773

<_io.BufferedRandom name='transcripts/vendors/test1.wav'>

In [26]:
exhausted_from_test1 = pd.Index([i for s in exhausted_cases for i in s])
len(exhausted_cases), len(df_va), len(df_va[~df_va.index.isin(exhausted_from_test1)])

(127, 6213, 6086)

In [27]:
df_va[df_va.index.isin(exhausted_from_test1)].to_csv('transcripts/vendors/test1.csv')

In [28]:
# File #2: Mix of agree/disagree ("not clean" = medium)
df_va2 = df_va[(~df_va.index.isin(exhausted_from_test1)) & (df_va.index>max(exhausted_from_test1))].copy()
print("Proportion of samples where transcripts disagree:", str(len(df_va2[~df_va2['transcripts_agree']])/len(df_va2)))
print("Length of df_va2: ",str(len(df_va2)))

Proportion of samples where transcripts disagree: 0.723629935179729
Length of df_va2:  3394


In [29]:
df_va2.sort_values(['date','start_voice'],inplace=True)
df_va2[['start_voice','end_voice','transcripts_agree']].head()

Unnamed: 0,start_voice,end_voice,transcripts_agree
3580,11.313,14.092,True
3581,14.818,15.653,False
3582,17.324,33.94,False
2824,17.495,18.882,False
2825,20.272,21.545,True


In [30]:
df_va2.sort_values(['date','start_voice'],inplace=True)
# Exhausted case indices
exhausted_cases = []
total_length = 0
# Create silence
silence = AudioSegment.silent(duration = silence_buffer, frame_rate = 22050)
combined = silence
files = df_va2['filename'].unique().tolist()
files.sort()
for file in files:
    if total_length < 10:
        # Get data associated with file
        df_temp = df_va2[df_va2['filename']==file].copy()
        # Get data needed to construct path to file
        zone = df_temp['zone'].unique()[0]
        date = df_temp['date'].unique()[0]
        # Load file
        try:
            mp3_file_in = 'data/'+zone+'/'+date+'/'+file
            pydub_audiosegment = AudioSegment.from_mp3(mp3_file_in)
        except:
            date_split = date.split('_')
            day = int(date_split[2])+1
            date = date_split[0]+'_'+date_split[1]+'_'+str(day).zfill(2)
            mp3_file_in = 'data/'+zone+'/'+date+'/'+file
            pydub_audiosegment = AudioSegment.from_mp3(mp3_file_in)
        # Get (start, end) tuples to extract voice activity
        se = df_temp[['start_voice','end_voice']].values
        # Build up audio segments with voice activity  
        for s,e in se:
            start = max(0,s*1000-pad)
            end = min(e*1000+pad,len(pydub_audiosegment))
            va_segment = pydub_audiosegment[start:end]
            if len(detect_nonsilent(va_segment))>0:
                #print(len(detect_nonsilent(va_segment)),file)
                if total_length < 10: # and va_segment not in combined:
                    print("File #2 now "+str(round(total_length,1))+" in length",file)
                    combined += va_segment + silence
                    total_length = len(combined)/(1000*60)
                    exhausted_cases.append(df_temp[(df_temp['start_voice']==s) & (df_temp['end_voice']==e)].index)
        #del df_temp, se
wav_file_out = 'transcripts/vendors/test2.wav'
combined = combined.set_frame_rate(22050)
combined.export(wav_file_out,format='wav')

File #2 now 0 in length 201808050100-298054-27730.mp3
File #2 now 0.1 in length 201808050100-298054-27730.mp3
File #2 now 0.2 in length 201808050200-579833-27730.mp3
File #2 now 0.3 in length 201808050200-579833-27730.mp3
File #2 now 0.4 in length 201808050200-579833-27730.mp3
File #2 now 0.7 in length 201808050200-579833-27730.mp3
File #2 now 0.8 in length 201808050200-579833-27730.mp3
File #2 now 0.9 in length 201808050200-579833-27730.mp3
File #2 now 1.0 in length 201808050200-579833-27730.mp3
File #2 now 1.1 in length 201808050200-579833-27730.mp3
File #2 now 1.2 in length 201808050200-579833-27730.mp3
File #2 now 1.3 in length 201808050200-579833-27730.mp3
File #2 now 1.3 in length 201808050200-579833-27730.mp3
File #2 now 1.6 in length 201808050200-579833-27730.mp3
File #2 now 1.6 in length 201808050200-579833-27730.mp3
File #2 now 1.8 in length 201808050200-579833-27730.mp3
File #2 now 1.9 in length 201808050200-579833-27730.mp3
File #2 now 2.0 in length 201808050200-579833-2773

<_io.BufferedRandom name='transcripts/vendors/test2.wav'>

In [31]:
exhausted_from_test2 = pd.Index([i for s in exhausted_cases for i in s])
len(exhausted_from_test1), len(exhausted_from_test2), len(df_va), len(df_va2), len(df_va[~df_va.index.isin(exhausted_from_test1.union(exhausted_from_test2))])

(127, 101, 6213, 3394, 5985)

In [32]:
df_va[df_va.index.isin(exhausted_from_test2)].to_csv('transcripts/vendors/test2.csv')

In [33]:
# File #3: Transcribers disagree ("not clean" = hard)'
df_va3 = df_va[(~df_va['transcripts_agree']) & (df_va.index>max(exhausted_from_test2)-500)].copy()
print("Proportion of samples where transcripts disagree:", str(len(df_va3[~df_va3['transcripts_agree']])/len(df_va3)))
print("Length of df_va3: ",str(len(df_va3)))

Proportion of samples where transcripts disagree: 1.0
Length of df_va3:  2252


In [35]:
df_va3.sort_values(['date','start_voice'],inplace=True)
df_va3[['start_voice','end_voice','transcripts_agree']].head()

Unnamed: 0,start_voice,end_voice,transcripts_agree
3581,14.818,15.653,False
3582,17.324,33.94,False
3583,35.919,38.048,False
3587,47.255,51.791,False
3650,83.251,88.501,False


In [36]:
df_va3.sort_values(['date','start_voice'],inplace=True)
# Exhausted case indices
exhausted_cases = []
total_length = 0
# Create silence
silence = AudioSegment.silent(duration = silence_buffer, frame_rate = 22050)
combined = silence
files = df_va3['filename'].unique().tolist()
files.sort()
for file in files:
    if total_length < 10:
        # Get data associated with file
        df_temp = df_va3[df_va3['filename']==file].copy()
        # Get data needed to construct path to file
        zone = df_temp['zone'].unique()[0]
        date = df_temp['date'].unique()[0]
        # Load file
        try:
            mp3_file_in = 'data/'+zone+'/'+date+'/'+file
            pydub_audiosegment = AudioSegment.from_mp3(mp3_file_in)
        except:
            date_split = date.split('_')
            day = int(date_split[2])+1
            date = date_split[0]+'_'+date_split[1]+'_'+str(day).zfill(2)
            mp3_file_in = 'data/'+zone+'/'+date+'/'+file
            pydub_audiosegment = AudioSegment.from_mp3(mp3_file_in)
        # Get (start, end) tuples to extract voice activity
        se = df_temp[['start_voice','end_voice']].values
        # Build up audio segments with voice activity  
        for s,e in se:
            start = max(0,s*1000-pad)
            end = min(e*1000+pad,len(pydub_audiosegment))
            va_segment = pydub_audiosegment[start:end]
            if len(detect_nonsilent(va_segment))>0:
                #print(len(detect_nonsilent(va_segment)),file)
                if total_length < 10: # and va_segment not in combined:
                    print("File #3 now "+str(round(total_length,1))+" in length",file)
                    combined += va_segment + silence
                    total_length = len(combined)/(1000*60)
                    exhausted_cases.append(df_temp[(df_temp['start_voice']==s) & (df_temp['end_voice']==e)].index)
        #del df_temp, se
wav_file_out = 'transcripts/vendors/test3.wav'
combined = combined.set_frame_rate(22050)
combined.export(wav_file_out,format='wav')

File #3 now 0 in length 201808050200-579833-27730.mp3
File #3 now 0.1 in length 201808050200-579833-27730.mp3
File #3 now 0.4 in length 201808050200-579833-27730.mp3
File #3 now 0.5 in length 201808050200-579833-27730.mp3
File #3 now 0.6 in length 201808050200-579833-27730.mp3
File #3 now 0.7 in length 201808050200-579833-27730.mp3
File #3 now 0.9 in length 201808050200-579833-27730.mp3
File #3 now 1.1 in length 201808050200-579833-27730.mp3
File #3 now 1.2 in length 201808050200-579833-27730.mp3
File #3 now 1.4 in length 201808050200-579833-27730.mp3
File #3 now 1.5 in length 201808050200-579833-27730.mp3
File #3 now 1.6 in length 201808050200-579833-27730.mp3
File #3 now 1.6 in length 201808050200-579833-27730.mp3
File #3 now 1.7 in length 201808050200-579833-27730.mp3
File #3 now 1.8 in length 201808050200-579833-27730.mp3
File #3 now 1.9 in length 201808050200-579833-27730.mp3
File #3 now 2.1 in length 201808050200-579833-27730.mp3
File #3 now 2.1 in length 201808050200-579833-2773

<_io.BufferedRandom name='transcripts/vendors/test3.wav'>

In [37]:
exhausted_from_test3 = pd.Index([i for s in exhausted_cases for i in s])

In [38]:
df_va[df_va.index.isin(exhausted_from_test3)].to_csv('transcripts/vendors/test3.csv')