In [1]:
# pip install --upgrade google-cloud-speech
import os
import io
from google.cloud import speech_v1p1beta1 as speech
from google.cloud.speech_v1p1beta1 import enums
from pydub import AudioSegment
import pandas as pd
import time

In [2]:
# Check audio file properites
sound_file = AudioSegment.from_file('./Datasets/TEST_samples/sample1.wav', format="wav")

print(f'Sample Width: {sound_file.sample_width}')
print(f'Channel Count: {sound_file.channels}')
print(f'Duration: {len(sound_file) / 1000}s')
print(f'Sample Rate: {sound_file.frame_rate}')
print(f'Sample Rate: {sound_file.dBFS}')

Sample Width: 2
Channel Count: 1
Duration: 9.854s
Sample Rate: 22050
Sample Rate: -28.164799306236986


In [23]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/Users/alex/ga-dsi-11/police-scanner-speech-to-text-c09b11750e4e.json'
client = speech.SpeechClient()

df = pd.DataFrame()
total_time_start = time.time()
for i in os.listdir('./Datasets/TEST_samples/'):
    if i.endswith('.wav'):
        loop_time = time.time()
        with io.open(f'./Datasets/TEST_samples/{i}', 'rb') as audio_file:
            content = audio_file.read()
            audio = speech.types.RecognitionAudio(content=content)
        print(f"File {i} opened")
        config = speech.types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=22050,
            language_code='en-US',
            audio_channel_count=1,
            enable_separate_recognition_per_channel = True,
            use_enhanced = True,
            model = 'phone_call',
            speech_contexts= [{'boost': 20.0}])
        
        response = client.recognize(config, audio)
        for result in response.results:
            d = {}
            d['transcript'] = result.alternatives[0].transcript
            d['confidence_interval'] = result.alternatives[0].confidence
            d['file_name'] = i
            df = df.append(d, ignore_index=True)
            print(f"File {i} results added to dataframe: {time.time() - loop_time}s")
        

print(f"Total time elapsed: {time.time() - total_time_start}s")

File sample11.wav opened
File sample11.wav processed
File sample11.wav results added to dataframe 
Took 9.391078233718872s to process
File sample11.wav results added to dataframe 
Took 9.397046089172363s to process
File sample10.wav opened
File sample10.wav processed
File sample12.wav opened
File sample12.wav processed
File sample12.wav results added to dataframe 
Took 4.6144468784332275s to process
File sample13.wav opened
File sample13.wav processed
File sample13.wav results added to dataframe 
Took 2.8722410202026367s to process
File sample17.wav opened
File sample17.wav processed
File sample17.wav results added to dataframe 
Took 11.489796161651611s to process
File sample16.wav opened
File sample16.wav processed
File sample16.wav results added to dataframe 
Took 10.83010196685791s to process
File sample16.wav results added to dataframe 
Took 10.836151838302612s to process
File sample14.wav opened
File sample14.wav processed
File sample14.wav results added to dataframe 
Took 13.0012

In [24]:
df

Unnamed: 0,confidence_interval,file_name,transcript
0,0.811442,sample11.wav,a lot of static respiratory distress 1089 Dorc...
1,0.72731,sample11.wav,anyone returning
2,0.664993,sample12.wav,play water at 8:10 a.m. our colorful Longwood
3,0.53508,sample13.wav,Saturday 72nd Street
4,0.773811,sample17.wav,nineteen eighty-two and the cardiac Florida 11...
5,0.705214,sample16.wav,24 monitor comments please
6,0.781541,sample16.wav,I meant probably have one for you two stroke ...
7,0.912839,sample14.wav,okay
8,0.609282,sample14.wav,495 North Friday the Thirteenth battling the ...
9,0.659719,sample15.wav,495 North despite a 13 sat on the 1st and 2nd ...


In [None]:
# # Load in audio file
# segment = AudioSegment.from_wav('./Datasets/sample_audio2/sample12.wav')

# # loud
# response.results

# # Make it louder
# louder_segment = segment +10
# louder_segment.export(f"./Datasets/TEST_samples/sample12loud.wav", format = "wav")

# #
# compressed_audio = segment.compress_dynamic_range(threshold=segment.dBFS)
# compressed_audio.export(f"./Datasets/TEST_samples/sample12compressdynamicrange.wav", format = "wav")

# removeddcoffset = segment.remove_dc_offset()
# removeddcoffset.export(f"./Datasets/TEST_samples/sample12removeddcoffset.wav", format = "wav")

# normailzed = segment.normalize(headroom=0.5)
# normailzed

# # clean_segment = AudioSegment.from_wav('./Datasets/sample_audio/sample5clean.wav')
# # clean_segment

# code for single audio file

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/Users/alex/ga-dsi-11/police-scanner-speech-to-text-c09b11750e4e.json'
# client = speech.SpeechClient()

# with io.open('./Datasets/sample_audio2/sample12.wav', 'rb') as audio_file:
#     content = audio_file.read()
#     audio = speech.types.RecognitionAudio(content=content)

# config = speech.types.RecognitionConfig(
#     encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
#     sample_rate_hertz=22050,
#     language_code='en-US',
#     audio_channel_count=1,
#     enable_separate_recognition_per_channel = True,
#     use_enhanced = True,
#     model = 'phone_call',
#     speech_contexts= [{'boost': 20.0}]
# )

# response = client.recognize(config, audio)
# for result in response.results:
#     print(result.alternatives[0].transcript)
#     print(result.alternatives[0].confidence)

# df = pd.DataFrame()

# d = {}
# d['transcript'] = result.alternatives[0].transcript
# d['confidence_interval'] = result.alternatives[0].confidence
# d['file_name'] = "TESTESTTEST"

# df.append(d, ignore_index=True)

# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '/Users/alex/ga-dsi-11/police-scanner-speech-to-text-c09b11750e4e.json'
# client = speech.SpeechClient()


# with io.open('./Datasets/TEST_samples/sample1.wav', 'rb') as audio_file:
#     content = audio_file.read()
#     audio = speech.types.RecognitionAudio(content=content)

# config = speech.types.RecognitionConfig(
#     encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
#     sample_rate_hertz=22050,
#     language_code='en-US',
#     audio_channel_count=1,
#     enable_separate_recognition_per_channel = True,
#     use_enhanced = True,
#     model = 'phone_call',
#     speech_contexts= [{'boost': 20.0}]
# )

# response = client.recognize(config, audio)
# # for result in response.results:
# print(result.alternatives[0].transcript)
# print(result.alternatives[0].confidence)
# #     if len(result.alternatives[0].transcript) > 0:
# d = {}
# d['transcript'] = result.alternatives[0].transcript
# d['confidence_interval'] = result.alternatives[0].confidence
# d['file_name'] = "i"
# print(d)
# df = df.append(d, ignore_index=True)
# # for result in response.results:
# #     print(result.alternatives[0].transcript)
# #     print(result.alternatives[0].confidence)

# # df = pd.DataFrame()

# # d = {}
# # d['transcript'] = result.alternatives[0].transcript
# # d['confidence_interval'] = result.alternatives[0].confidence
# # d['file_name'] = "TESTESTTEST"

# # df.append(d, ignore_index=True)