In [1]:
import pandas as pd
import numpy
import matplotlib
from openai import OpenAI
import speech_recognition as sr
from datetime import datetime
from os import path, listdir, getcwd

In [2]:
r = sr.Recognizer()
times_by_ggl = {}
times_by_whisper = {}
transcription_by_ggl = {}
transcription_by_whisper = {}

# Get all audio files
audio_files = listdir(getcwd() + "/../audio_files/audio-stimuli/3/")

In [3]:
def time_taken_google_speech(audio):
    start = datetime.now()
    # recognize speech using Google Speech Recognition
    try:
        text = r.recognize_google(audio)
        # print("Google Speech Recognition thinks you said " + r.recognize_google(audio))
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))

    end = datetime.now()
    total_time = end - start
    return (total_time, text)

In [4]:
def time_taken_openAI_whisper(audio):
    start = datetime.now()
    try:
        text = r.recognize_whisper(audio)
        # print(r.recognize_whisper(audio))
    except sr.UnknownValueError:
        print("Did not understand")
    except sr.RequestError as e:
        print(e)
    end = datetime.now()
    total_time = end - start
    return (total_time, text)
    # return total_time

In [5]:
for audio_file in audio_files:
    print(audio_file)
    # AUDIO_FILE = path.join(path.dirname(path.realpath(__file__)), audio_file)
    AUDIO_FILE = getcwd() + "/../audio_files/audio-stimuli/3/" + audio_file

    # use the audio file as the audio source
    with sr.AudioFile(AUDIO_FILE) as source:
        audio = r.record(source)  # read the entire audio file
    print(f"Processing Audio file - {audio_file}")
    print()
    print("Time taken by Google Speech Recognition")
    tt_ggl, text_ggl = time_taken_google_speech(audio)
    tt_ggl = int(tt_ggl.total_seconds() * 1e9)
    print(tt_ggl)
    times_by_ggl[audio_file] = tt_ggl
    transcription_by_ggl[audio_file] = text_ggl

    print("Time taken by OpenAI Whisper")
    tt_whisper, text_whisper = time_taken_google_speech(audio)
    tt_whisper = int(tt_whisper.total_seconds() * 1e9)
    print(tt_whisper)
    times_by_whisper[audio_file] = tt_whisper
    transcription_by_whisper[audio_file] = text_whisper
    print()

3.4.wav
Processing Audio file - 3.4.wav

Time taken by Google Speech Recognition
580530000
Time taken by OpenAI Whisper
625565000

3.5.wav
Processing Audio file - 3.5.wav

Time taken by Google Speech Recognition
448273000
Time taken by OpenAI Whisper
432072000

3.7.wav
Processing Audio file - 3.7.wav

Time taken by Google Speech Recognition
353154000
Time taken by OpenAI Whisper
833780000

3.6.wav
Processing Audio file - 3.6.wav

Time taken by Google Speech Recognition
440638000
Time taken by OpenAI Whisper
635584000

3.2.wav
Processing Audio file - 3.2.wav

Time taken by Google Speech Recognition
531713000
Time taken by OpenAI Whisper
491243000

3.3.wav
Processing Audio file - 3.3.wav

Time taken by Google Speech Recognition
457198000
Time taken by OpenAI Whisper
391849000

3.1.wav
Processing Audio file - 3.1.wav

Time taken by Google Speech Recognition
470970000
Time taken by OpenAI Whisper
510756000

3.8.wav
Processing Audio file - 3.8.wav

Time taken by Google Speech Recognition
63

In [6]:
total_times_ggl = 0
total_times_whisper = 0

print("Total times taken by Google Speech Recognition")
for k in times_by_ggl:
    print(k, times_by_ggl[k])
    t = times_by_ggl[k]
    total_times_ggl += t
print()

print("Total times taken by OpenAI Whisper")
for k in times_by_whisper:
    print(k, times_by_whisper[k])
    t = times_by_whisper[k]
    total_times_whisper += t
print()


print(f'Total time taken by Google Speech Recognition to process {len(audio_files)} audio files is {str(total_times_ggl)[0: len(str(total_times_ggl)) - 9]}.{str(total_times_ggl)[len(str(total_times_ggl)) - 9: ]} seconds')
print(f'Total time taken by OpenAI Whisper to process {len(audio_files)} audio files is {str(total_times_whisper)[0: len(str(total_times_whisper)) - 9]}.{str(total_times_whisper)[len(str(total_times_whisper)) - 9: ]} seconds')

Total times taken by Google Speech Recognition
3.4.wav 580530000
3.5.wav 448273000
3.7.wav 353154000
3.6.wav 440638000
3.2.wav 531713000
3.3.wav 457198000
3.1.wav 470970000
3.8.wav 635656000
3.9.wav 452390000
3.10.wav 443943000

Total times taken by OpenAI Whisper
3.4.wav 625565000
3.5.wav 432072000
3.7.wav 833780000
3.6.wav 635584000
3.2.wav 491243000
3.3.wav 391849000
3.1.wav 510756000
3.8.wav 383654000
3.9.wav 434507000
3.10.wav 463274000

Total time taken by Google Speech Recognition to process 10 audio files is 4.814465000 seconds
Total time taken by OpenAI Whisper to process 10 audio files is 5.202284000 seconds


In [7]:
times_by_ggl_sorted = {k: v for (k, v) in sorted(times_by_ggl.items(), key = lambda x: x[0])}
transcription_by_ggl_sorted = {k: v for (k, v) in sorted(transcription_by_ggl.items(), key = lambda x: x[0])}

times_by_whisper_sorted = {k: v for (k, v) in sorted(times_by_whisper.items(), key = lambda x: x[0])}
transcription_by_whisper_sorted = {k: v for (k, v) in sorted(transcription_by_whisper.items(), key = lambda x: x[0])}

In [8]:
whisper_df = pd.DataFrame(list(zip(times_by_whisper_sorted.keys(), times_by_whisper_sorted.values(), transcription_by_whisper_sorted.values())))
whisper_df.columns = ['File Name', 'Time to Convert', 'Transcribed Text']
whisper_df

Unnamed: 0,File Name,Time to Convert,Transcribed Text
0,3.1.wav,510756000,close the door
1,3.10.wav,463274000,you're losing me
2,3.2.wav,491243000,don't do that
3,3.3.wav,391849000,how are
4,3.4.wav,625565000,how's it going
5,3.5.wav,432072000,I don't agree
6,3.6.wav,635584000,please get me
7,3.7.wav,833780000,please repeat that
8,3.8.wav,383654000,what's going on
9,3.9.wav,434507000,why don't you


In [9]:
ggl_df = pd.DataFrame(list(zip(times_by_ggl_sorted.keys(), times_by_ggl_sorted.values(), transcription_by_ggl_sorted.values())))
ggl_df.columns = ['File Name', 'Time to Convert', 'Transcribed Text']
ggl_df

Unnamed: 0,File Name,Time to Convert,Transcribed Text
0,3.1.wav,470970000,close the door
1,3.10.wav,443943000,you're losing me
2,3.2.wav,531713000,don't do that
3,3.3.wav,457198000,how are
4,3.4.wav,580530000,how's it going
5,3.5.wav,448273000,I don't agree
6,3.6.wav,440638000,please get me
7,3.7.wav,353154000,please repeat that
8,3.8.wav,635656000,what's going on
9,3.9.wav,452390000,why don't you
