In [1]:
# pip install --upgrade google-cloud-speech
import os
import io
from google.cloud import speech_v1p1beta1 as speech
from google.cloud.speech_v1p1beta1 import enums
from pydub import AudioSegment
import pandas as pd
import time

In [3]:
# Check audio file properites
sound_file = AudioSegment.from_file('./Datasets/sample_audio/0501-1240sample3.wav', format="wav")

print(f'Sample Width: {sound_file.sample_width}')
print(f'Channel Count: {sound_file.channels}')
print(f'Duration: {len(sound_file) / 1000}s')
print(f'Sample Rate: {sound_file.frame_rate}')
print(f'Sample Rate: {sound_file.dBFS}')

Sample Width: 2
Channel Count: 1
Duration: 27.63s
Sample Rate: 22050
Sample Rate: -27.58092973651456


In [5]:
def transcribe(input_path,output_path,output_filename):
    # define key path (security feature)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/Users/alex/ga-dsi-11/police-scanner-speech-to-text-c09b11750e4e.json'
    
    # Instantiate speech to text model, DataFrame, and time
    client = speech.SpeechClient()
    df = pd.DataFrame()
    total_time_start = time.time()
    
    # loop through files in our input path
    for i in os.listdir(input_path):
        if i.endswith('.wav'):
            loop_time = time.time()
            
            # open our audio file
            with io.open(f'{input_path}{i}', 'rb') as audio_file:
                content = audio_file.read()
                audio = speech.types.RecognitionAudio(content=content)
            print(f"File {i} opened")
            
            # model parameters
            config = speech.types.RecognitionConfig(
                encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz = sound_file.frame_rate,
                language_code = 'en-US',
                audio_channel_count = sound_file.channels,
                enable_separate_recognition_per_channel = True,
                use_enhanced = True,
                model = 'phone_call',
                speech_contexts = [{'boost': 20.0}])
            
            # This models equivalent of fit/predict
            response = client.recognize(config, audio)
            
            # build dictionary -> dataframe
#             print(response.results)
            for result in response.results:
                d = {}
                d['transcript'] = result.alternatives[0].transcript
                d['confidence_interval'] = result.alternatives[0].confidence
                d['file_name'] = i
                df = df.append(d, ignore_index=True)
                print(f"File {i} results added to dataframe: {time.time() - loop_time}s")


    
    # export df of transcriptions
    df.to_csv(f'{output_path}{output_filename}', index=False)
    
    print(f"Total time elapsed: {time.time() - total_time_start}s")
    
    #return dataframe incase we would like to use it in local folder
    return df

In [7]:
transcription_df = transcribe('./Datasets/sample_audio/', './Datasets/sample_transcript/', 'full_sample_transcript.csv')

File 0501-1240sample22.wav opened
[alternatives {
  transcript: "question mark"
  confidence: 0.7874553799629211
}
language_code: "en-us"
]
File 0501-1240sample22.wav results added to dataframe: 5.9480509757995605s
File 0501-1240sample36.wav opened
[]
File 0501-1240sample7.wav opened
[alternatives {
  transcript: "we having all these cars are similar to the medications they seemed a little days off"
  confidence: 0.6177639961242676
}
language_code: "en-us"
]
File 0501-1240sample7.wav results added to dataframe: 17.015205144882202s
File 0501-1240sample6.wav opened
[alternatives {
  transcript: "fls you on with the road for another clue"
  confidence: 0.618201494216919
}
language_code: "en-us"
]
File 0501-1240sample6.wav results added to dataframe: 7.523545026779175s
File 0501-1240sample37.wav opened
[alternatives {
  transcript: "play the area of seven to White Street for a mail that"
  confidence: 0.7448995113372803
}
language_code: "en-us"
, alternatives {
  transcript: " open up peop

[alternatives {
  transcript: "the island aftermath reality"
  confidence: 0.7066814303398132
}
language_code: "en-us"
, alternatives {
  transcript: " 12:30"
  confidence: 0.6494860649108887
}
language_code: "en-us"
]
File 0501-1240sample42.wav results added to dataframe: 8.615243911743164s
File 0501-1240sample42.wav results added to dataframe: 8.620067119598389s
File 0501-1240sample43.wav opened
[alternatives {
  transcript: "that\'s Community Springfield I know there\'s a couple of cars off in the area anyway even the tools for you maybe the one-on-one so there is two on 1 second"
  confidence: 0.667500913143158
}
language_code: "en-us"
]
File 0501-1240sample43.wav results added to dataframe: 7.571887969970703s
File 0501-1240sample17.wav opened
[alternatives {
  transcript: "nineteen eighty-two and the cardiac Florida 1190 Adams Street Dorchester 1119 Adams Rd, 12:20 to see mm 1190 Adams"
  confidence: 0.773810625076294
}
language_code: "en-us"
]
File 0501-1240sample17.wav results a