In [6]:
import csv
import os
import pdb
import subprocess
import wave
from pydub import AudioSegment

from google.cloud import speech 
from google.cloud import storage 
from google.oauth2 import service_account

In [7]:
# Service-account key file used to build both clients below; none of the
# Google Cloud calls in this notebook can be made without it.
credential_file = '../gcloud_credentials.json'
credentials = service_account.Credentials.from_service_account_file(credential_file)
# Speech-to-Text client (used by google_transcribe).
client = speech.SpeechClient(credentials = credentials)
# Cloud Storage client (used by upload_blob and delete_blob).
storage_client = storage.Client.from_service_account_json(credential_file)
# Bucket in which audio files are staged while they are being transcribed.
bucketname = "movie-character-audio"
        
def get_frame_rate(audio_file_name):
    """Open a WAV file read-only and report its sampling frequency.

    Args:
        audio_file_name: Path of the WAV file to inspect.

    Returns:
        The frame rate stored in the WAV header.
    """
    wav_reader = wave.open(audio_file_name, "rb")
    try:
        return wav_reader.getframerate()
    finally:
        # Mirror the context-manager behaviour: always release the file.
        wav_reader.close()
    
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Copy a local file into a Cloud Storage bucket.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to send.
        destination_blob_name: Name the object gets inside the bucket.
    """
    target_bucket = storage_client.get_bucket(bucket_name)
    target_bucket.blob(destination_blob_name).upload_from_filename(source_file_name)
    
def delete_blob(bucket_name, blob_name):
    """Remove one object from a Cloud Storage bucket.

    Args:
        bucket_name: Name of the bucket holding the object.
        blob_name: Name of the object to remove.
    """
    target_bucket = storage_client.get_bucket(bucket_name)
    target_bucket.blob(blob_name).delete()
    
def google_transcribe(audio_file, time_init, words_from_the_subtitles):
    """Transcribe a WAV file and return word-level start times.

    The file is staged in the ``bucketname`` bucket (under its own file name),
    transcribed with the long-running Speech-to-Text API, and removed from the
    bucket again whether or not the transcription succeeds.

    Args:
        audio_file: Path of the local WAV file (LINEAR16). It is also used as
            the blob name inside the bucket.
        time_init: Time, in milliseconds, at which the sentence starts in the
            studied audio segment; every returned time is shifted by it.
        words_from_the_subtitles: Words passed to the recogniser as phrase
            hints.

    Returns:
        A list of ``[time_in_seconds, word]`` pairs, with times rounded to one
        decimal. Empty when nothing was recognised.
    """
    sample_rate = get_frame_rate(audio_file)
    blob_name = audio_file

    upload_blob(bucketname, audio_file, blob_name)
    try:
        audio = speech.RecognitionAudio(uri='gs://' + bucketname + '/' + blob_name)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=sample_rate,
            language_code='en-US',
            speech_contexts=[{"phrases": words_from_the_subtitles}],
            enable_word_time_offsets=True,
            enable_automatic_punctuation=True)

        # Detects speech in the audio file
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=10000)
    finally:
        # Never leave the staged audio behind, even when recognition fails.
        delete_blob(bucketname, blob_name)

    transcript = []
    # The response may hold several results (one per consecutive portion of
    # the audio); read them all instead of only the first one so that no
    # recognised word is dropped.
    for result in response.results:
        if not result.alternatives:
            continue
        for word_info in result.alternatives[0].words:
            # start_time is used as a timedelta here (the original code read
            # its .seconds and .microseconds attributes).
            offset_seconds = word_info.start_time.total_seconds()
            time_adjusted = round(time_init / 1000 + offset_seconds, 1)
            transcript.append([time_adjusted, word_info.word])
    return transcript

In [8]:
def _strip_delimited(text, opener, closer):
    """Remove every ``opener ... closer`` span (delimiters included) from text.

    An opener with no closer after it is dropped on its own, so the loop
    always terminates and never raises.
    """
    start = text.find(opener)
    while start != -1:
        end = text.find(closer, start + 1)
        if end == -1:
            # Unbalanced opener: remove just that character and keep the rest.
            text = text[:start] + text[start + 1:]
        else:
            text = text[:start] + text[end + 1:]
        start = text.find(opener, start)
    return text


def remove_charac_and_split(words):
    """Clean a subtitle sentence and split it into words.

    Steps, in order:
      1. replace every ``'- '`` by a single space;
      2. if the sentence contains a colon, keep only what follows the last
         ``': '`` (e.g. drops a leading ``"Name: "`` prefix);
      3. remove ``(...)`` spans;
      4. remove ``<...>`` spans;
      5. strip and split on whitespace.

    Unbalanced ``(`` or ``<`` are tolerated: the stray opener is removed and
    the surrounding text is kept. Closers that appear before any opener are
    left untouched.

    Args:
        words: The raw sentence (a string).

    Returns:
        The list of remaining whitespace-separated tokens (possibly empty).
    """
    sentence_clean = words.replace('- ', ' ')
    if ':' in sentence_clean:
        sentence_clean = sentence_clean.split(': ')[-1]
    sentence_clean = _strip_delimited(sentence_clean, "(", ")")
    sentence_clean = _strip_delimited(sentence_clean, "<", ">")
    return sentence_clean.strip().split()

In [9]:
def _to_milliseconds(raw_time):
    """Convert a start-time cell to milliseconds.

    Accepts a plain number (taken to already be in milliseconds) or a
    subtitle-style clock such as ``H:MM:SS,mmm`` / ``H:MM:SS.mmm``.
    NOTE(review): the clock format is an assumption about the CSV contents;
    confirm against the actual StartTime column.

    Raises:
        ValueError: if the value matches neither form.
    """
    text = str(raw_time).strip()
    try:
        return float(text)
    except ValueError:
        pass
    clock, _, fraction = text.replace('.', ',').partition(',')
    total_seconds = 0
    for field in clock.split(':'):
        total_seconds = total_seconds * 60 + int(field)
    millis = int(fraction.ljust(3, '0')[:3]) if fraction else 0
    return total_seconds * 1000 + millis


def _align_word_timestamps(subtitle_words, api_transcript, start_seconds):
    """Give every subtitle word a timestamp (seconds, one decimal).

    Words that occur exactly once in both the subtitles and the API transcript
    take the API time; the others are interpolated between those anchors, or
    spaced 0.2 s apart after the last anchor.
    """
    # Keeping only words found by the API which are in the subtitles and
    # present once (on both sides).
    api_words = [word for _, word in api_transcript]
    anchors = [
        (time, word) for time, word in api_transcript
        if subtitle_words.count(word) == 1 and api_words.count(word) == 1
    ]

    # Getting the exact timestamp when possible; the first word defaults to
    # the sentence start.
    timestamps = [round(start_seconds, 1)] + [0] * (len(subtitle_words) - 1)
    known_positions = {0}
    for time, word in anchors:
        position = subtitle_words.index(word)
        timestamps[position] = round(time, 1)
        known_positions.add(position)
    known = sorted(known_positions)

    # Interpolate the rest of the timestamps. `anchor` is the index, in
    # `known`, of the closest known position at or before j.
    anchor = -1
    for j in range(len(subtitle_words)):
        if j in known_positions:
            anchor += 1
        elif j < known[-1]:
            left, right = known[anchor], known[anchor + 1]
            slope = (timestamps[right] - timestamps[left]) / (right - left)
            timestamps[j] = round(timestamps[left] + (j - left) * slope, 1)
        else:
            timestamps[j] = round(timestamps[j - 1] + 0.2, 1)

    # If the result is not coherent (not non-decreasing), fall back to evenly
    # spaced timestamps.
    # NOTE(review): the spacing is 2 s per word here versus 0.2 s above --
    # kept as in the original, confirm it is intended.
    if sorted(timestamps) != timestamps:
        timestamps = [round(start_seconds + 2 * k, 1) for k in range(len(subtitle_words))]
    return timestamps


def get_timestamp(meld_csv_file, meld_clips_folder, out_path):
    """Append word-level timestamps for every utterance of a CSV to out_path.

    For each data row, the clip ``dia<Dialogue_ID>_utt<Utterance_ID>.mp4`` in
    ``meld_clips_folder`` is converted to a mono 44.1 kHz WAV with ffmpeg and
    sent to ``google_transcribe``; the returned word times are aligned with
    the words of the ``Utterance`` column. One line
    ``<timestamp>: <Speaker>: <word>`` is appended to ``out_path`` per word,
    followed by a blank line after each utterance. Utterances that contain no
    word after cleaning are skipped.

    Each utterance uses its own ``StartTime`` (converted to milliseconds);
    previously the start time of the last CSV row was applied to every
    utterance, and as a string, which cannot be divided by 1000.

    Relies on ``remove_charac_and_split``, ``google_transcribe`` and
    ``remove_punctuation`` being defined elsewhere in the notebook.

    Args:
        meld_csv_file: Path of the CSV file. The header must contain
            StartTime, Utterance, Speaker, Dialogue_ID and Utterance_ID.
        meld_clips_folder: Folder containing the ``.mp4`` clips.
        out_path: Text file the result is appended to.

    Raises:
        KeyError: if a required column is missing from the header.
        ValueError: if a StartTime value cannot be parsed.
        subprocess.CalledProcessError: if ffmpeg fails on a clip.
    """
    # Parse data in meld_csv_file: collect start time, sentence, character
    # and path to the clip for every data row.
    sentences = []
    with open(meld_csv_file, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader, None)
        if header is None:
            return
        header2idx = {name: idx for idx, name in enumerate(header)}
        for row in reader:
            if not row:
                continue
            # Remove unicode characters
            cells = [elem.encode('ascii', 'ignore').decode() for elem in row]
            clip_file = 'dia{}_utt{}.mp4'.format(
                cells[header2idx['Dialogue_ID']], cells[header2idx['Utterance_ID']])
            sentences.append((
                cells[header2idx['StartTime']],
                cells[header2idx['Utterance']],
                cells[header2idx['Speaker']],
                os.path.join(meld_clips_folder, clip_file),
            ))

    # For each sentence, get the word-level timestamps using Google
    # Speech-to-Text API
    temp_file_name = 'temp_audio_processing.wav'
    for raw_start, utterance, character, clip_path in sentences:
        words = remove_charac_and_split(utterance)
        if not words:
            continue
        start_ms = _to_milliseconds(raw_start)

        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed through untouched. -y overwrites a wav
        # left behind by an interrupted run instead of prompting.
        command = ['ffmpeg', '-y', '-i', clip_path, '-ab', '160k', '-ac', '1',
                   '-ar', '44100', '-vn', temp_file_name]
        try:
            subprocess.run(command, check=True)
            transcript = google_transcribe(temp_file_name, start_ms, words)
        finally:
            if os.path.exists(temp_file_name):
                os.remove(temp_file_name)

        # Removing punctuation
        words_from_subtitles = [remove_punctuation(word) for word in words]
        transcript_from_api = [[element[0], remove_punctuation(element[1])] for element in transcript]

        timestamp = _align_word_timestamps(
            words_from_subtitles, transcript_from_api, start_ms / 1000)

        # Writing the words and the timestamp in the desired file
        to_write = ''
        for position, word in enumerate(words):
            to_write += str(timestamp[position]) + ': ' + character + ': ' + word + '\n'
        with open(out_path, "a") as out_file:
            out_file.write(to_write + '\n')

In [10]:
# Inputs: the CSV file listing the utterances and the folder holding the
# matching dia<Dialogue_ID>_utt<Utterance_ID>.mp4 clips.
meld_csv_file = 'MELD_friends/MELD.Raw/train_sent_emo.csv'
meld_clips_folder = 'MELD_friends/MELD.Raw/train_clips/'
# Output text file; get_timestamp opens it in append mode, so re-running this
# cell adds to whatever the file already contains.
transcript_with_timestamp_to_create_filename = "movies_transcripts_with_characters/friends/train.txt"
get_timestamp(meld_csv_file, meld_clips_folder, transcript_with_timestamp_to_create_filename)

Forbidden: 403 POST https://storage.googleapis.com/upload/storage/v1/b/movie-character-audio/o?uploadType=multipart: {
  "error": {
    "code": 403,
    "message": "The project to be billed is associated with a closed billing account.",
    "errors": [
      {
        "message": "The project to be billed is associated with a closed billing account.",
        "domain": "global",
        "reason": "accountDisabled",
        "locationType": "header",
        "location": "Authorization"
      }
    ]
  }
}
: ('Request failed with status code', 403, 'Expected one of', <HTTPStatus.OK: 200>)