Objective: Determine success rate of GCS in correctly identifying individual words.

Steps:
1. Select at random 100 audio files from Mozilla dataset
2. Write their original transcripts to dataframe
3. GCS transcribe each file
4. Write GCS transcripts to dataframe
5. Determine most common words in original transcripts and count top 5
6. Count same words in GCS transcripts
7. Hit rate is benchmark

### Import Mozilla data

In [4]:
import pandas as pd

# import transcripts to a dataframe
meta_train = pd.read_csv('cv_corpus_v1/cv-valid-train.csv')

### Select N random clips

In [124]:
import random

n = 100
randoms = random.sample(range(0, len(meta_train)), n)

selection = meta_train.iloc[randoms, [0,1]]
selection.sort_index(inplace=True)
selection = selection.reset_index()
selection['transcript_gc'] = np.nan

### Transcode to GC compatible format

In [125]:
from pydub import AudioSegment

for x in selection.filename:
    file_path = str('cv_corpus_v1/' + x)
    file_name = x[-17:-4]
    clip = AudioSegment.from_file(file_path)
    clip.export('clips_to_GCS/' + str(file_name) + '.wav', format = 'wav')

### Speech recognition using Google Cloud

In [126]:
import glob

list_to_GCS = sorted(glob.glob('clips_to_GCS/*.wav'))

In [127]:
import speech_recognition as sr

r = sr.Recognizer()

In [None]:
import time
start_time = time.time()

for i, e in enumerate(list_to_GCS):
    sample = sr.AudioFile(e)
    with sample as source:
         audio = r.record(source)
    transcript_gc = r.recognize_google_cloud(audio)
    selection.transcript_gc[i] = transcript_gc
    
end_time = time.time()

selection.head()

In [130]:
print("Elapsed time was %g seconds" % (end_time - start_time))

Elapsed time was 254.47 seconds


### Calculate difference ratio between original text and GCS transcript

In [216]:
from difflib import SequenceMatcher

selection['diff_ratio'] = np.nan

for row in range(0, len(selection)):
    a = selection.text[row]
    b = selection.transcript_gc[row]
    ratio = SequenceMatcher(None, a, b).ratio()
    selection.diff_ratio[row] = ratio
    
selection.diff_ratio.mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


0.8717051764077093

In [140]:
selection.to_csv('GCS_benchmark_1.csv')

### Single word differences

In [212]:
# List of words in original text
og_words = {
    x for x in ' '.join(
        selection.text.str.lower().tolist()
    ).split() if x.isalpha()
} 

og_words = list(og_words)

# compute words that weren't caught by GCS:
import re

og_count = np.asarray([ selection['text'].str.contains(r'\b'+woi+r'\b', flags=re.IGNORECASE).sum() for woi in og_words ])
gc_count = np.asarray([ selection['transcript_gc'].str.contains(r'\b'+woi+r'\b', flags=re.IGNORECASE).sum() for woi in og_words ])

In [215]:
sw_diffs = og_count - gc_count
sw_diffs_perc = sw_diffs / og_count
fail_perc = sw_diffs_perc[sw_diffs_perc >= 0].mean()

print(fail_perc)

0.2633317591531516
