In [4]:
# module load python
# module load ffmpeg (if Midway2)

import pandas as pd
import pydub
import re
import os 

# Relative paths used in this notebook (transcripts/, data/) resolve from here.
project_root = '/project/graziul/'
os.chdir(project_root)

In [5]:
# Load the transcript table; the path is relative to the current working directory.
df = pd.read_csv('transcripts/transcripts2021_09_03.csv')
# Preview only the timing and text columns of the first rows.
preview_columns = ['start', 'end', 'transcription']
df[preview_columns].head()

Unnamed: 0,start,end,transcription
0,00.02.21.252,00.02.31.279,RADIOSHOP TESTING ONE TWO THREE FOUR FIVE FIVE...
1,00.02.38.109,00.02.39.417,ONE TWO ONE TWO
2,00.02.48.327,00.02.49.235,UNIT COMING IN
3,00.02.55.330,00.02.57.437,ZONE ONE IS ON CITY [WIDE] FIVE
4,00.03.04.003,00.03.09.017,OKAY THANKS UH THIS IS THE RADIO SHOP TESTING ...


In [7]:
def get_milliseconds(pandas_row, time_var):
    """Return the time-of-day of a timestamp as whole milliseconds.

    Parameters
    ----------
    pandas_row : pandas.DataFrame or pandas.Series
        Either a one-row DataFrame (as produced by ``df.loc[df.index == idx]``)
        or a single row given as a Series.
    time_var : str
        Name of the column holding the timestamp; the value is passed through
        ``pd.to_datetime``.

    Returns
    -------
    int
        ``hour*3600000 + minute*60000 + second*1000 + microsecond//1000``.
        The date part of the timestamp is ignored and sub-millisecond
        precision is truncated.

    Raises
    ------
    IndexError
        If ``pandas_row`` is a DataFrame with no rows.
    """
    parsed = pd.to_datetime(pandas_row[time_var])
    # A one-row DataFrame yields a Series; a Series row yields a scalar.
    stamp = parsed.iloc[0] if isinstance(parsed, pd.Series) else parsed
    # Integer arithmetic only: going through float seconds and int() can lose
    # a millisecond to rounding (e.g. 1.001 s * 1000 -> 1000.9999...).
    # Hours are included so offsets of one hour or more do not wrap to zero.
    return int(
        stamp.hour * 3600000
        + stamp.minute * 60000
        + stamp.second * 1000
        + stamp.microsecond // 1000
    )

In [8]:
def get_audio_transcript(df, idx, data_dir='data/', sample_rate=22050, pad_ms=200):
    """Return the audio snippet and sentence-cased transcription for one row.

    Parameters
    ----------
    df : pandas.DataFrame
        Transcript table. The columns read here are ``transcription``,
        ``file``, ``zone``, ``year``, ``month``, ``day``, ``start_dt`` and
        ``end_dt``.
    idx : index label
        Index value of the row to extract. If several rows share the label,
        the first one is used.
    data_dir : str
        Directory under which the audio is looked up as
        ``<zone>/<year>_<month>_<day>/<name>.mp3``.
    sample_rate : int
        Currently unused; kept so existing calls keep working.
    pad_ms : int
        Extra milliseconds of audio kept before the start and after the end
        of the utterance (clamped to the bounds of the recording).

    Returns
    -------
    tuple
        ``(snippet, transcription)`` where ``snippet`` is a
        ``pydub.AudioSegment`` and ``transcription`` is the lower-cased text
        with its first word capitalized and whitespace collapsed. A missing
        or blank transcription yields an empty string.

    Raises
    ------
    IndexError
        If no row of ``df`` has index ``idx``.
    """
    # Select row by index value; kept as a one-row DataFrame because
    # get_milliseconds is called with it below.
    row = df.loc[df.index==idx].copy()
    if row.empty:
        raise IndexError('no row with index %r in df' % (idx,))
    # Get transcription
    transcription = row['transcription'].values[0]
    # Load audio using row info
    file = row['file'].values[0]
    zone = row['zone'].values[0]
    date = '_'.join([row['year'].astype(str).values[0],
                     row['month'].astype(str).str.zfill(2).values[0],
                     row['day'].astype(str).str.zfill(2).values[0]])
    # Take the part of the name before the first '.', and within each
    # '-'-separated piece keep only digits, '^' and '.' characters.
    # NOTE(review): the second '^' in the character class is a literal caret,
    # which looks unintended; left unchanged to keep file lookups identical.
    mp3_file = '-'.join([re.sub("[^0-9^.]", "", i) for i in file.split('.')[0].split('-')])+'.mp3'
    # os.path.join also works when data_dir has no trailing separator.
    mp3_path = os.path.join(data_dir, zone, date, mp3_file)
    audio = pydub.AudioSegment.from_mp3(mp3_path)
    # Get snippet of audio based on timing, padded and clamped to the audio
    start_ms = max(0,get_milliseconds(row, 'start_dt')-pad_ms)
    end_ms = min(len(audio),get_milliseconds(row, 'end_dt')+pad_ms)
    snippet = audio[start_ms:end_ms]
    # Sentence-case the text. Missing (NaN) or blank transcriptions become ''
    # instead of raising, and a one-word text gets no trailing space.
    text = '' if pd.isna(transcription) else str(transcription)
    words = text.lower().split()
    if words:
        words[0] = words[0].capitalize()
    transcription_proper = ' '.join(words)
    return snippet, transcription_proper

In [10]:
# Each entry becomes one projective: a bare index is a single utterance, a
# list of indices is several utterances concatenated into one audio clip.
list_of_idx = [22, 33, [43, 52], [248, 249, 268], [443, 444]]
transcripts = []
proj_n = 1
# Extract projectives based on index
for idx in list_of_idx:
    # One WAV file per projective, numbered in order of appearance
    wav_path = 'data/ProjectiveNum'+str(proj_n)+'.wav'
    if isinstance(idx, list):
        multiple_snippets = pydub.AudioSegment.empty()
        multiple_transcripts = ['Projective #'+str(proj_n)+':','']
        for i in idx:
            # Get audio snippet and its transcription
            snippet, transcript = get_audio_transcript(df, i)
            # 500 ms of silence goes before every snippet, including the first
            multiple_snippets += pydub.AudioSegment.silent(500)
            multiple_snippets += snippet
            # Add the transcription
            multiple_transcripts.append(transcript)
        # Save all snippets as a single clip
        multiple_snippets.export(wav_path, format='wav')
        # Add transcriptions, followed by a separator entry
        transcripts += multiple_transcripts
        transcripts += ['\n']
    else:
        # Get audio snippet and its transcription
        snippet, transcript = get_audio_transcript(df, idx)
        # Save the snippet
        snippet.export(wav_path, format='wav')
        # Add the transcription to list of transcriptions
        label = 'Projective #'+str(proj_n)+': \n\n'
        proj = label + transcript + '\n'
        transcripts.append(proj)
    # Iterate projective number (once per entry, whichever branch ran)
    proj_n += 1
# Write the list of transcriptions to file; the with-block closes the file
with open('data/projectives2021_09_07.txt','w') as f:
    for t in transcripts:
        f.write(t+'\n')
# Print the list of transcriptions to validate process
transcripts

["Projective #1: \n\nSuspicous person seventy one hundred at mason call says three males in hoodies riding bikes checking car doors one's in gray, either a gray or a white hoodie the other two are in dark hoodiesheaded north bound\n",
 'Projective #2: \n\nFourty ten lawrence at the walgreens gary says he got male black about 35 black t shirt holding items in a bag\n',
 'Projective #3:',
 '',
 "Suspicous person bernice and menard bernice and menard male black red shirt carying a white shopping bag been walking around in circles for over thirty minutes talking to themselves and yelling not sure what's wrong with them.",
 "Guy is saying now the male black in the red t-shirt carying the bag is probably drunk talking to himself but he's getting on the neighbor's porch at fifty eight oh two on byron now",
 '\n',
 'Projective #4:',
 '',
 "Yeah i've got a request for a supervisor for you at fourty four hundred menard regarding twenty four's job ms. portillo says she wants a supervisor because 