## Processing longer audio files
Google Cloud Speech only processes audio shorter than one minute synchronously. Longer files require the `long_running_recognize` method, which is asynchronous and requires the audio to be hosted in Google Cloud Storage (GCS).

In [1]:
# function to use long-file transcription

def transcribe_gcs(gcs_uri, timeout=90):
    """Transcribe an audio file stored in Google Cloud Storage.

    Submits an asynchronous (long-running) recognition request and blocks
    until the operation finishes or ``timeout`` seconds have elapsed.

    Args:
        gcs_uri: URI of the audio file, e.g. ``gs://bucket/file.wav``.
        timeout: Seconds to wait for the operation result. Defaults to 90,
            the value that used to be hard-coded; raise it for long audio.

    Returns:
        A ``(transcript, confidence)`` tuple. ``transcript`` concatenates the
        top alternative of every result, each preceded by one space (so a
        non-empty transcript starts with a space). ``confidence`` is the mean
        confidence of those alternatives as a float, or ``0.0`` when nothing
        was recognised. Previously only the last result's confidence was
        returned, formatted as a string.

    Raises:
        Any exception raised by the Speech client is propagated, including
        the one ``operation.result`` raises when ``timeout`` expires.
    """
    from google.cloud import speech
    # ``enums`` is only needed if the encoding option below is re-enabled.
    from google.cloud.speech import enums
    from google.cloud.speech import types

    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        # Encoding and sample rate are intentionally left disabled, as in the
        # original; presumably the service infers them from the file header
        # -- TODO confirm for the audio formats in use.
        #encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        #sample_rate_hertz=16000,
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=timeout)

    transcript_parts = []
    segment_confidences = []
    for result in response.results:
        # Guard against a result that carries no alternatives at all.
        if not result.alternatives:
            continue
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        transcript_parts.append(' ' + format(best.transcript))
        segment_confidences.append(float(best.confidence))

    transcription_str = ''.join(transcript_parts)
    # Average over all segments instead of keeping only the last one.
    if segment_confidences:
        confidence_metric = sum(segment_confidences) / len(segment_confidences)
    else:
        confidence_metric = 0.0

    return transcription_str, confidence_metric

In [2]:
from google.cloud import storage

# Client for the Cloud Storage API.
storage_client = storage.Client()

# Materialise the bucket listing into a list and show it.
buckets = [bucket for bucket in storage_client.list_buckets()]
print(buckets)

[<Bucket: audio_a>, <Bucket: youtube_wavs>]


In [2]:
def list_blobs(bucket_name):
    """Return the listing of every blob stored in ``bucket_name``.

    A new storage client is created on each call; the value returned is
    whatever ``Bucket.list_blobs()`` yields, passed through unchanged.
    """
    from google.cloud import storage

    return storage.Client().get_bucket(bucket_name).list_blobs()

In [3]:
blob_list = list_blobs('youtube_wavs')

# Materialise the listing so the blobs can be iterated more than once.
blobs = list(blob_list)

# Take each object's name from the ``name`` attribute rather than slicing the
# blob's repr, which only worked for names of exactly 23 characters and a
# specific repr format.
gc_files = [blob.name for blob in blobs]

In [None]:
from tqdm import tqdm

# Position in gc_files at which to start transcribing. Presumably the number
# of files already handled before an earlier run stalled -- confirm before
# re-running, or set to 0 for a full run.
START_INDEX = 990

# NOTE: ytIDs, transcripts and confidences are NOT created here; they must
# already exist in the kernel (the reload cell defines them). Uncomment the
# lines below to start from empty lists instead.
#ytIDs = []
#transcripts = []
#confidences = []

# Slice to the end of the list: the previous end bound of len(gc_files)-1 was
# exclusive and therefore silently skipped the final file.
for filename in tqdm(gc_files[START_INDEX:]):
    wav_file = 'gs://youtube_wavs/' + filename
    t, c = transcribe_gcs(wav_file)
    # The first 11 characters of the file name are used as the video ID.
    ytID = filename[0:11]

    ytIDs.append(ytID)
    transcripts.append(t)
    confidences.append(c)

Save the collected results to a CSV file via a pandas DataFrame

In [21]:
import pandas as pd

# Column name -> values accumulated by the transcription loop.
trans_temp = dict(ytID=ytIDs, transcript=transcripts, confidence=confidences)

# Build the frame and write it to disk as CSV.
transcript_df = pd.DataFrame(data=trans_temp)
transcript_df.to_csv('transcript_df.csv')

In [14]:
# Last element of ytIDs, i.e. the most recently appended video ID.
curr_id = ytIDs[-1]
curr_id

'vI5EzZ74fRw'

Reload the saved results in case the transcription run stalls

In [4]:
import pandas as pd

# Read back the file this notebook actually writes ('transcript_df.csv');
# the previous name 'trans_temp.csv' is never produced by any cell here.
restart = pd.read_csv('transcript_df.csv')
ytIDs = restart['ytID'].tolist()
# An empty transcript is stored as an empty CSV field, which read_csv loads
# as NaN; restore the empty string so every entry is text again.
transcripts = restart['transcript'].fillna('').tolist()
confidences = restart['confidence'].tolist()

# Test zone

## Transcribe a single audio file hosted on GC

In [None]:
# Object in the youtube_wavs bucket to transcribe as a one-off check.
# (Variable renamed from the misspelled ``filenaame``.)
filename = '8ns4cfMABqQ-cropped.wav'
audio_file = 'gs://youtube_wavs/' + filename
t, c = transcribe_gcs(audio_file)

In [41]:
# c is the second value unpacked from a transcribe_gcs call (the confidence).
print(c)

0.9208365082740784


Write the transcript to a file

In [None]:
# Write the transcript text to disk. The context manager closes the file even
# if the write raises, and the explicit encoding keeps the output independent
# of the platform default.
with open("transcript2.txt", "w", encoding="utf-8") as transcript:
    transcript.write(t)