##### `This notebook reads the audio files (named "s##e##.wav") in the folder "audios" and extracts the spoken text from them. The get_time_nodes() function reads an RTTM file and returns the times at which the speaker changes; the audio is split at these times. The transcript of each audio file is stored in the folder "timeData". The extract_whole_text() function runs speech recognition on the whole input audio. Note that wav2vec cannot process long audio files in one pass, which is why the audio is split first. Create the following folders before running the notebook:` 

- audios: all audio files for WILTY
- rttm: all RTTM files (one per audio file, with the same base name)
- timeData: the folder where output files are stored

Last updated on August 3rd, 2022

In [3]:
import librosa
import numpy as np
import scipy as sp
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import moviepy.editor as mp

from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC
from datasets import load_dataset
import librosa
import datasets
import torch
from IPython.display import Audio
import moviepy.editor as mp
import os
import pandas as pd
import sox

# Checkpoint name shared by the model, the tokenizer and the feature extractor
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base-960h"

# Load the three pretrained pieces used by the transcription functions below:
# the feature extractor (configured for 16 kHz input), the tokenizer, and the CTC model
feature_extractor = AutoFeatureExtractor.from_pretrained(
    WAV2VEC2_CHECKPOINT,
    sampling_rate=16000,
)
tokenizer = AutoTokenizer.from_pretrained(WAV2VEC2_CHECKPOINT)
model = AutoModelForCTC.from_pretrained(WAV2VEC2_CHECKPOINT)

ModuleNotFoundError: No module named 'librosa'

In [None]:
def get_time_nodes(filename):
    '''
    Find the moments at which the active speaker changes, according to an RTTM file.

    parameter:
    filename: str. the name of the audio without the suffix; the annotation is read
              from "..rttm/" + filename + ".rttm"

    Each non-blank line is split on whitespace; field 3 is used as the segment onset
    (seconds) and field 7 as the speaker label. The speaker number is the run of digits
    at the end of the label (e.g. "SPEAKER_10" -> 10).

    returns:
    timeNodes: list of float, onset of every segment whose speaker differs from the
               speaker of the segment before it
    speakerID: list of int, same length as timeNodes, the speaker that was active just
               before the corresponding time node

    Speaker 0 is assumed to be active before the first segment, so a file that opens
    with speaker 0 produces no node for its first segment.

    raises:
    ValueError if an onset is not a number or a speaker label does not end in digits
    IndexError if a non-blank line has fewer than 8 fields
    '''
    # NOTE(review): the docstring history and the notebook header call this folder "rttm";
    # "..rttm/" looks like a typo for "rttm/" or "../rttm/" -- confirm before changing.
    with open("..rttm/"+filename+".rttm", "r") as f:
        # split() copes with repeated blanks/tabs, and skipping blank lines keeps the last
        # segment whether or not the file ends with a newline
        rows = [line.split() for line in f if line.strip()]

    # time and speaker data from the file
    onset = [float(row[3]) for row in rows]
    speaker = []
    for row in rows:
        label = row[7]
        # keep every trailing digit, not just the last character, so that
        # speakers 10, 11, ... are not confused with speakers 0, 1, ...
        digits = label[len(label.rstrip("0123456789")):]
        speaker.append(int(digits))

    # initialize variables
    current = 0
    speakerID = []
    timeNodes = []

    for start, who in zip(onset, speaker):
        if who != current:
            timeNodes.append(start)
            # `current` is the previous segment's speaker; indexing speaker[i-1] instead
            # would wrap around to the last segment when the very first segment differs
            speakerID.append(current)
            current = who

    return timeNodes,speakerID

In [None]:
def extract_text_snippet(nameOfAudio,listOfTime):
    '''
    Transcribe an audio file piece by piece and save word-level timestamps.

    parameter:
    nameOfAudio: str. the name of audio to be processed without the suffix
    listOfTime: a sequence of times in seconds; every pair of consecutive entries
                delimits one snippet

    reads "audios/<nameOfAudio>.wav", resamples it to 16 kHz into audio.mp3 with sox,
    cuts each snippet into temp.mp3, and runs the wav2vec2 model on temp.mp3.
    Word offsets are shifted by the snippet start so the times refer to the whole audio.
    Pairs whose end is not after their start, and snippets that decode to no samples,
    are skipped.

    output: a CSV with columns word, start_time, end_time stored as
            "timeData/<nameOfAudio>.csv"

    raises:
    subprocess.CalledProcessError if sox exits with an error
    '''
    import subprocess  # local import: only needed for the sox call below

    # resample; an argument list avoids shell quoting problems with unusual file names,
    # and check=True stops here instead of silently transcribing a stale audio.mp3
    subprocess.run(
        ['sox', 'audios/'+nameOfAudio+'.wav', '-r', '16000', 'audio.mp3'],
        check=True,
    )

    # seconds covered by one logit frame (downsampling ratio / sampling rate);
    # it does not depend on the snippet, so compute it once
    time_offset = model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate

    dataList=[]
    for timeOff, timeEnd in zip(listOfTime[:-1], listOfTime[1:]):
        if timeEnd <= timeOff:
            # nothing to cut for an empty or reversed interval
            continue
        ffmpeg_extract_subclip("audio.mp3", timeOff, timeEnd, targetname="temp.mp3")

        # Load the audio with the librosa library
        input_audio, sr = librosa.load("temp.mp3", sr=16000)
        if len(input_audio) == 0:
            continue

        # forward sample through model to get greedily predicted transcription ids;
        # no_grad because this is inference only and gradients are never used
        input_values = feature_extractor(input_audio, sampling_rate=sr, return_tensors="pt").input_values
        with torch.no_grad():
            logits = model(input_values).logits[0]
        pred_ids = torch.argmax(logits, dim=-1)

        # retrieve word stamps (analogous commands for `output_char_offsets`)
        outputs = tokenizer.decode(pred_ids, output_word_offsets=True)

        for d in outputs.word_offsets:
            dataList.append([
                d["word"],
                round(d["start_offset"] * time_offset+timeOff, 2),
                round(d["end_offset"] * time_offset+timeOff, 2),
            ])
    df=pd.DataFrame(dataList, columns=('word', 'start_time', 'end_time'))
    df.to_csv("timeData/"+nameOfAudio+".csv")


In [None]:
def extract_whole_text(filename):
    '''
    Transcribe one audio file, split at the times where the speaker changes.

    parameter:
    filename: the name of the audio without the suffix

    Takes the speaker-change times from get_time_nodes(), makes sure the list starts
    at 0 so the audio before the first change is included, and passes the boundaries
    to extract_text_snippet(), which writes the result.

    NOTE(review): extract_text_snippet() transcribes consecutive boundary pairs only,
    so the audio after the last speaker change is not covered -- confirm whether an
    end-of-audio boundary should be appended here.
    '''
    timeNodes,_=get_time_nodes(filename)
    # prepend the start of the recording, unless the first node already is 0:
    # a duplicate 0 would create a zero-length first snippet
    if not timeNodes or timeNodes[0] != 0:
        timeNodes.insert(0,0)
    extract_text_snippet(filename,timeNodes)

In [4]:
# Transcribe one episode. The argument is spliced into the "audios/", "..rttm/" and
# "timeData/" paths by the functions above, so it must resolve under each of them.
# NOTE(review): the docstrings describe this argument as a bare audio name (e.g. "s01e03"),
# but here it carries a directory prefix -- confirm the folder layout matches.
extract_whole_text("utility/videoAnnotate/s01e03")
# Debug helper, kept disabled: print(get_time_nodes("s01e03"))