In [19]:
# pip install --upgrade google-cloud-speech
import os
import io
from google.cloud import speech_v1p1beta1 as speech
from google.cloud.speech_v1p1beta1 import enums
from pydub import AudioSegment
import pandas as pd
import time

In [20]:
# Check audio file properites
sound_file = AudioSegment.from_file('./Datasets/sample_audio/QUICKTEST/31120-20200501-0034sample21.wav', format="wav")

print(f'Sample Width: {sound_file.sample_width}')
print(f'Channel Count: {sound_file.channels}')
print(f'Duration: {len(sound_file) / 1000}s')
print(f'Sample Rate: {sound_file.frame_rate}')
print(f'Sample Rate: {sound_file.dBFS}')

Sample Width: 2
Channel Count: 1
Duration: 13.33s
Sample Rate: 22050
Sample Rate: -19.370745835569505


In [24]:
def transcribe(input_path,output_path,output_filename,street_file):
    # define key path (security feature)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/Users/alex/ga-dsi-11/police-scanner-speech-to-text-c09b11750e4e.json'
    
    # Instantiate speech to text model, DataFrame, and time
    client = speech.SpeechClient()
    df = pd.DataFrame()
    total_time_start = time.time()
    
    # List of streets
    street_df = pd.read_csv(street_file)
    street_list = []
    for j in street_df['0']:
        street_list.append(j) 
        
    
    # loop through files in our input path
    for i in os.listdir(input_path):
        if i.endswith('.wav'):
            loop_time = time.time()
            
            # open our audio file
            with io.open(f'{input_path}{i}', 'rb') as audio_file:
                content = audio_file.read()
                audio = speech.types.RecognitionAudio(content=content)
            print(f"File {i} opened")
            
            # model parameters
            config = speech.types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz = 22050,
                language_code = 'en-US',
                audio_channel_count = 1,
                enable_separate_recognition_per_channel = True,
                use_enhanced = True,
                model = 'phone_call',
                speech_contexts = [{
                                    'boost': 20.0}])
            
            # This models equivalent of fit/predict
            response = client.recognize(config, audio)
            
            # build dictionary -> dataframe
#             print(response.results)
            for result in response.results:
                d = {}
                d['transcript'] = result.alternatives[0].transcript
                d['confidence_interval'] = result.alternatives[0].confidence
                d['file_name'] = i
                df = df.append(d, ignore_index=True)
                print(f"File {i} results added to dataframe: {time.time() - loop_time}s")


    
    # export df of transcriptions
    df.to_csv(f'{output_path}{output_filename}', index=False)
    
    print(f"Total time elapsed: {time.time() - total_time_start}s")
    
    #return dataframe incase we would like to use it in local file
    return df

In [25]:
transcription_df = transcribe('./Datasets/sample_audio/QUICKTEST/',   # Indivitual INPUT PATH
                              './Datasets/sample_transcript/ALEX', # Indivitual INPUT PATH
                              'ALEX_sample_transcript_QUICKTEST.csv', # Indivitual transcription csv name
                              './Datasets/cambridge.csv'
                             )







File 31120-20200501-0034sample6.wav opened
File 31120-20200501-0034sample6.wav results added to dataframe: 4.639209032058716s
File 31120-20200501-0034sample7.wav opened
File 31120-20200501-0034sample7.wav results added to dataframe: 9.623837947845459s
File 31120-20200501-0034sample5.wav opened
File 31120-20200501-0034sample4.wav opened
File 31120-20200501-0034sample1.wav opened
File 31120-20200501-0034sample1.wav results added to dataframe: 17.229122161865234s
File 31120-20200501-0034sample3.wav opened
File 31120-20200501-0034sample3.wav results added to dataframe: 4.784204006195068s
File 31120-20200501-0034sample2.wav opened
File 31120-20200501-0034sample11.wav opened
File 31120-20200501-0034sample11.wav results added to dataframe: 9.840970039367676s
File 31120-20200501-0034sample10.wav opened
File 31120-20200501-0034sample10.wav results added to dataframe: 3.8044259548187256s
File 31120-20200501-0034sample12.wav opened
File 31120-20200501-0034sample13.wav opened
File 31120-20200501-0

In [26]:
transcription_df

Unnamed: 0,confidence_interval,file_name,transcript
0,0.635647,31120-20200501-0034sample6.wav,home
1,0.478475,31120-20200501-0034sample7.wav,Hong Kong
2,0.628009,31120-20200501-0034sample1.wav,took off going off of that individual
3,0.636534,31120-20200501-0034sample3.wav,home
4,0.664766,31120-20200501-0034sample11.wav,home home
5,0.635865,31120-20200501-0034sample10.wav,home
6,0.641121,31120-20200501-0034sample16.wav,home
7,0.650507,31120-20200501-0034sample15.wav,home
8,0.662562,31120-20200501-0034sample18.wav,home home home home
9,0.586742,31120-20200501-0034sample19.wav,home home
