##### `This notebook generates a list of dictionaries, each containing the word, start time, and end time for one word spoken in the indicated small clip of the original video, and stores the list as "script.txt". To run this notebook, place a video file (renamed to "clip.mp4") in the same folder as this notebook and specify the start and end times of the target clip in the second code block.`

- intermediate files: audio_1.mp3, audio_16.mp3
- output file: script.txt

Last updated on August 3rd, 2022

In [None]:
import os
import subprocess

import datasets
import librosa
import moviepy.editor as mp
import sox
import torch
from datasets import load_dataset
from IPython.display import Audio
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC

# Load the pretrained CTC model together with its tokenizer and feature
# extractor. All three are fetched from the same checkpoint id, so it is
# named once here instead of being repeated in every call.
CHECKPOINT = "facebook/wav2vec2-base-960h"

model = AutoModelForCTC.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
feature_extractor = AutoFeatureExtractor.from_pretrained(
    CHECKPOINT,
    sampling_rate=16000,
)



`process the audio`

In [None]:
# Extract the audio of the target sub-clip from the video in mp3 format,
# then resample it to 16 kHz for the speech model.
clip = mp.VideoFileClip("clip.mp4")
start=285
end=295
# end = clip.duration

# Save the paths for later
clip_paths = []

try:
    # try a small demo: cut the [start, end] window (in seconds) out of the video
    sub_clip = clip.subclip(start, end)
    if sub_clip.audio is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("clip.mp4 has no audio track to extract")
    sub_clip.audio.write_audiofile("audio_1.mp3")
    clip_paths.append("audio_1.mp3")
finally:
    # Release the file readers held by the clip once the audio is on disk
    # (or if extraction failed), so they are not leaked in the kernel.
    clip.close()

# Downsample to 16 kHz with sox. check=True raises CalledProcessError when sox
# exits non-zero (and FileNotFoundError is raised when sox is not installed),
# so a failed conversion cannot go unnoticed and leave the next cell reading
# a stale audio_16.mp3 from an earlier run.
subprocess.run(["sox", "audio_1.mp3", "-r", "16000", "audio_16.mp3"], check=True)


`Actual output`

In [None]:

# Load the audio with the librosa library, requesting a 16 kHz sampling rate
input_audio, sr = librosa.load("audio_16.mp3", sr=16000)

# Convert the waveform into model inputs. Passing the sampling rate lets the
# feature extractor check it against the rate it was configured with instead
# of silently accepting mismatched audio.
input_values = feature_extractor(
    input_audio, sampling_rate=sr, return_tensors="pt"
).input_values

# forward sample through model to get greedily predicted transcription ids;
# this is inference only, so gradient tracking is disabled to save memory
with torch.no_grad():
    logits = model(input_values).logits[0]
pred_ids = torch.argmax(logits, dim=-1)

# retrieve word stamps (analogous commands for `output_char_offsets`)
outputs = tokenizer.decode(pred_ids, output_word_offsets=True)
# compute `time_offset` in seconds: the model's inputs-to-logits ratio divided
# by the sampling rate, i.e. the factor that converts an offset to seconds
time_offset = model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate

# One entry per decoded word; times are in seconds, rounded to 2 decimals, and
# measured from the beginning of audio_16.mp3.
word_offsets = [
    {
        "word": d["word"],
        "start_time": round(d["start_offset"] * time_offset, 2),
        "end_time": round(d["end_offset"] * time_offset, 2),
    }
    for d in outputs.word_offsets
]

# word_offsets is a list of dictionaries; display it as the cell output
word_offsets

In [None]:
# #find the time segmentation slot
# timeSegList=[]
# #find the place where to stop
# for i in range(len(word_offsets)):
#     if word_offsets[i]['word']=="WAS" or word_offsets[i]['word']=="IS":
#         if word_offsets[i+1]['word']=='IT' and word_offsets[i+2]['word']=='A':
#             timeSegList.append(word_offsets[i]['word'])

# Persist the transcript: each word-offset dictionary is written on its own
# line of script.txt, in the same order as in word_offsets.
with open('script.txt', 'w') as script_file:
    script_file.writelines("%s\n" % entry for entry in word_offsets)
            