In [1]:
# pip install --upgrade google-cloud-speech
import os
import io
from google.cloud import speech_v1p1beta1 as speech
from google.cloud.speech_v1p1beta1 import enums
from pydub import AudioSegment
import pandas as pd
import time

In [20]:
# Check audio file properites
# sound_file = AudioSegment.from_file('./Datasets/sample_audio/QUICKTEST/31120-20200501-0034sample21.wav', format="wav")

# print(f'Sample Width: {sound_file.sample_width}')
# print(f'Channel Count: {sound_file.channels}')
# print(f'Duration: {len(sound_file) / 1000}s')
# print(f'Sample Rate: {sound_file.frame_rate}')
# print(f'Sample Rate: {sound_file.dBFS}')

In [5]:
def transcribe(input_path,output_path,output_filename,street_file):
    # define key path (security feature)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/Users/alex/ga-dsi-11/police-scanner-speech-to-text-c09b11750e4e.json'
    
    # Instantiate speech to text model, DataFrame, and time
    client = speech.SpeechClient()
    df = pd.DataFrame()
    total_time_start = time.time()
    
    # List of streets
    street_df = pd.read_csv(street_file)
    street_list = []
    for j in street_df['0']:
        street_list.append(j) 
        
    
    # loop through files in our input path
    for i in os.listdir(input_path):
        if i.endswith('.wav'):
            loop_time = time.time()
            
            # open our audio file
            with io.open(f'{input_path}{i}', 'rb') as audio_file:
                content = audio_file.read()
                audio = speech.types.RecognitionAudio(content=content)
            print(f"File {i} opened")
            
            # model parameters
            config = speech.types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz = 22050,
                language_code = 'en-US',
                audio_channel_count = 1,
                enable_separate_recognition_per_channel = True,
                use_enhanced = True,
                model = 'phone_call',
                speech_contexts = [{'phrases': street_file,
                                    'boost': 20.0}])
            
            # This models equivalent of fit/predict
            response = client.recognize(config, audio)
            
            # build dictionary -> dataframe
            print(response.results)
            for result in response.results:
                d = {}
                d['transcript'] = result.alternatives[0].transcript
                d['confidence'] = result.alternatives[0].confidence
                d['file_name'] = i
                df = df.append(d, ignore_index=True)
                print(f"File {i} results added to dataframe: {time.time() - loop_time}s")


    
    # export df of transcriptions
    df.to_csv(f'{output_path}{output_filename}', index=False)
    
    print(f"Total time elapsed: {time.time() - total_time_start}s")
    
    #return dataframe incase we would like to use it in local file
    return df

In [8]:
transcription_df = transcribe('./Datasets/sample_audio/test/sample/',   # Indivitual INPUT PATH
                              './Datasets/sample_transcript/', # Indivitual OUTPUT PATH
                              'ALEX_sample_transcript_TESTQUICKTEST.csv', # Indivitual transcription csv name
                              './Datasets/Metro_West_Streets.csv'
                             )

transcription_df


File Alexsample18-25818-20200501-0413.wav opened
[alternatives {
  transcript: "s"
  confidence: 0.6598044633865356
}
language_code: "en-us"
]
File Alexsample18-25818-20200501-0413.wav results added to dataframe: 12.442830801010132s
File Alexsample17-25818-20200501-0413.wav opened
[alternatives {
  transcript: "m r s a Highlander"
  confidence: 0.6445850729942322
}
language_code: "en-us"
]
File Alexsample17-25818-20200501-0413.wav results added to dataframe: 6.560755968093872s
File Alexsample13-25818-20200501-0413.wav opened
[alternatives {
  transcript: "set up and just o Hospital a o d a v a m / housing"
  confidence: 0.750121533870697
}
language_code: "en-us"
]
File Alexsample13-25818-20200501-0413.wav results added to dataframe: 19.648750066757202s
File Alexsample10-25818-20200501-0413.wav opened
[alternatives {
  transcript: "tensions r v a o m"
  confidence: 0.6642780900001526
}
language_code: "en-us"
]
File Alexsample10-25818-20200501-0413.wav results added to dataframe: 5.42780

Unnamed: 0,confidence_interval,file_name,transcript
0,0.659804,Alexsample18-25818-20200501-0413.wav,s
1,0.644585,Alexsample17-25818-20200501-0413.wav,m r s a Highlander
2,0.750122,Alexsample13-25818-20200501-0413.wav,set up and just o Hospital a o d a v a m / hou...
3,0.664278,Alexsample10-25818-20200501-0413.wav,tensions r v a o m
4,0.63429,Alexsample14-25818-20200501-0413.wav,a transporter with a c r e
5,0.746279,Alexsample12-25818-20200501-0413.wav,7113 O Street E 350 for 14
6,0.622404,Alexsample16-25818-20200501-0413.wav,c m s e n r a
7,0.651599,Alexsample16-25818-20200501-0413.wav,never notified c m s r. E s
8,0.627712,Alexsample16-25818-20200501-0413.wav,no problem a r o a high school
9,0.83008,Alexsample15-25818-20200501-0413.wav,Carlos a c m o to be outside in a motor vehicle


'car locations required to be outside in the motor vehicle'