## 1. Extract audio track

In [None]:
# List the working directory contents.
! ls -l
# Source video; later cells read it for playback, audio extraction and the final mux.
video_file_name = '做了一颗气疯对手的羽毛球 - 001 - 做了一颗气疯对手的羽毛球(ZH).mp4'
# Print the container/stream information ffprobe reports for the source video.
! ffprobe "$video_file_name"

In [18]:
from IPython.display import HTML
from base64 import b64encode

# Read the clip inside a context manager so the file handle is closed right
# away instead of being left to the garbage collector.
with open(video_file_name, 'rb') as video_file:
  mp4 = video_file.read()

# Embed the whole file as a base64 data URL so the player works without a
# separate file server.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=852 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

Output hidden; open in https://colab.research.google.com to view.

In [19]:
import os

audio_file_name = "_audio.wav"
if not os.path.exists(audio_file_name):
  !ffmpeg -i "$video_file_name" -acodec pcm_s16le -ac 1 -ar 16000 -vn "$audio_file_name"
else:
  !ls -l "$audio_file_name"
!ffprobe "$audio_file_name"

from IPython.display import HTML
from base64 import b64encode

# Read the extracted track inside a context manager so the handle is closed.
# The variable keeps its historical name `mp4`, but the bytes are WAV data.
with open(audio_file_name, 'rb') as audio_file:
  mp4 = audio_file.read()

# The <source> type must agree with the data URL's MIME type (audio/wav);
# <audio> has no width attribute, so none is set.
data_url = "data:audio/wav;base64," + b64encode(mp4).decode()
HTML("""
<audio controls>
      <source src="%s" type="audio/wav">
</audio>
""" % data_url)

Output hidden; open in https://colab.research.google.com to view.

## 2. Speech translation

In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

# Use the first CUDA device with half precision when available, otherwise
# fall back to CPU with full precision.
if torch.cuda.is_available():
  device, torch_dtype = "cuda:0", torch.float16
else:
  device, torch_dtype = "cpu", torch.float32

#model_id = "openai/whisper-large-v3-turbo"
asr_model_id = "openai/whisper-base"

# Load the checkpoint and move it to the selected device in one expression.
asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    asr_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor bundles the tokenizer and feature extractor the pipeline needs.
asr_processor = AutoProcessor.from_pretrained(asr_model_id)

asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=asr_model,
    tokenizer=asr_processor.tokenizer,
    feature_extractor=asr_processor.feature_extractor,
    chunk_length_s=30,
    torch_dtype=torch_dtype,
    device=device,
)

Exception ignored in: <function _xla_gc_callback at 0x7d7a7d4214e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/jax/_src/lib/__init__.py", line 96, in _xla_gc_callback
    def _xla_gc_callback(*args):
    
KeyboardInterrupt: 


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

In [None]:
import soundfile as sf
import time
import librosa

# Rate produced by the ffmpeg extraction step. The ASR pipeline takes a raw
# array as-is, so the array must already be at the model's expected rate.
TARGET_SAMPLE_RATE = 16_000

audio_data_orig, samplerate = sf.read(audio_file_name)

# sf.read returns (frames, channels) for multi-channel files. Downmix to mono
# so the length computation and the resampler both operate on the time axis.
if audio_data_orig.ndim > 1:
  audio_data_orig = audio_data_orig.mean(axis=1)

if samplerate != TARGET_SAMPLE_RATE:
  audio_data_orig = librosa.resample(audio_data_orig, orig_sr=samplerate, target_sr=TARGET_SAMPLE_RATE)

audio_length_orig = audio_data_orig.size / TARGET_SAMPLE_RATE
print(f"audio length: {audio_length_orig:.2f} seconds")

# truncate to 10s
#if audio_data_orig.size > 160_000:
#  audio_data_orig = audio_data_orig[:160_000]

start_time = time.perf_counter()
# task="translate" makes Whisper emit English text. The later steps need that:
# the TTS model is English-only and the subtitle stream is tagged `eng`.
# The source language is auto-detected; add "language": "<source language>"
# to generate_kwargs to pin it if detection proves unreliable.
dubs = asr_pipe(
  audio_data_orig,
  generate_kwargs={"task": "translate"},
  return_timestamps=True,
)
end_time = time.perf_counter()
print(f"translation time: {end_time - start_time:0.4f} seconds")

In [None]:
# ndarray.size is the number of elements (audio samples), not a byte count.
print(f"audio_data_orig.size:{audio_data_orig.size} samples")
# One line per recognised segment: its timestamp value, then its text.
for segment in dubs['chunks']:
  print(f"{segment['timestamp']}: {segment['text']}")

In [None]:
# @title Visualize
import matplotlib.pyplot as plt
import librosa.display
import librosa
import numpy as np

# Load the extracted track. librosa.load resamples to its own default rate
# unless sr= is given, so every plot below uses the returned `sampling_rate`
# rather than assuming the file's native rate.
array, sampling_rate = librosa.load("_audio.wav")

# waveform
fig, ax = plt.subplots()
fig.set_figwidth(24)
librosa.display.waveshow(array, sr=sampling_rate, ax=ax)
ax.set(title="Waveform")

# spectrogram
D = librosa.stft(array)
S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

fig, ax = plt.subplots()
fig.set_figwidth(24)
# Pass sr explicitly so the time/frequency axes do not depend on specshow's
# default happening to match the rate the audio was loaded at.
img = librosa.display.specshow(S_db, sr=sampling_rate, x_axis="time", y_axis="hz", ax=ax)
ax.set(title="Spectrogram (STFT magnitude, dB)")
fig.colorbar(img, ax=ax)

# mel spectrogram
S = librosa.feature.melspectrogram(y=array, sr=sampling_rate, n_mels=128, fmax=8000)
S_dB = librosa.power_to_db(S, ref=np.max)

fig, ax = plt.subplots()
fig.set_figwidth(24)
img = librosa.display.specshow(S_dB, x_axis="time", y_axis="mel", sr=sampling_rate, fmax=8000, ax=ax)
ax.set(title="Mel spectrogram (dB)")
fig.colorbar(img, ax=ax);

In [None]:
# @title audio separation: install libraries

# Install the two source-separation tools used by the following cells.
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
!pip install spleeter
!pip install demucs

In [None]:
# @title audio separation with spleeter
# Separate _audio.wav with the spleeter:2stems configuration, writing under
# output/ (the next cell plays output/_audio/vocals.wav and accompaniment.wav).
!spleeter separate  -p spleeter:2stems -o output _audio.wav

In [None]:
!ls -lR output

from IPython.display import Audio, display
display(Audio("_audio.wav"))
display(Audio("output/_audio/vocals.wav"))
display(Audio("output/_audio/accompaniment.wav"))

In [None]:
# @title audio separation with Demucs
!demucs _audio.wav
!ls -lR separated/htdemucs

from IPython.display import Audio, display
display(Audio("_audio.wav"))
display(Audio("separated/htdemucs/bass.mp3"))
display(Audio("separated/htdemucs/drums.mp3"))
display(Audio("separated/htdemucs/other.mp3"))
display(Audio("separated/htdemucs/vocals.mp3"))

In [None]:
# @title speaker diarization - load pyannote
%%capture
!pip install pyannote.audio

In [None]:
# @title speaker diarization

import torch
from pyannote.audio import Pipeline
import librosa
import numpy as np

# Load the extracted track with librosa's default loading settings.
array, sampling_rate = librosa.load("_audio.wav")

diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=True,
)

# Add a leading axis to the waveform before handing it to the pipeline
# (presumably the channel axis pyannote expects; confirm against its docs).
input_tensor = torch.from_numpy(array).unsqueeze(0).float()
print(input_tensor.shape)

diarization_input = {"waveform": input_tensor, "sample_rate": sampling_rate}
outputs = diarization_pipeline(diarization_input)

print(outputs)

In [None]:
# Show the diarization result in several forms: its plain string form, the
# values returned by to_lab() and to_rttm() (presumably LAB / RTTM text;
# confirm against pyannote), and finally the notebook's rich display.
print(outputs)
print(outputs.to_lab())
print(outputs.to_rttm())
outputs

## 3. Text-to-speech (TTS)

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from IPython.display import Audio
import numpy as np
from tqdm import tqdm

# Load the SpeechT5 text processor, the text-to-speech model and the HiFi-GAN
# vocoder; both models are moved to `device`.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
speech_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# Reference only: loading a speaker embedding from the x-vector dataset.
# NOTE(review): the hard-coded tensor below is presumably this same x-vector; confirm.
#embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
speaker_embeddings = torch.tensor([-0.07474489510059357,0.002643823390826583,0.03688539192080498,0.03512439504265785,-0.005208780989050865,-0.03423682972788811,-0.08709239214658737,0.027717752382159233,0.04135262593626976,0.015155801549553871,-0.07620799541473389,-0.0964214950799942,0.05223611742258072,0.04240100085735321,0.04186772555112839,0.054444827139377594,0.01739475317299366,0.03321673348546028,0.008912730030715466,0.019664224237203598,0.030064281076192856,0.010342113673686981,-0.011965553276240826,-0.03283921256661415,-0.0634637176990509,-0.008157500997185707,-0.060943834483623505,-0.011470903642475605,0.03994050994515419,0.03919297829270363,-0.0035242666490375996,0.06483065336942673,0.0351378433406353,-0.0016681051347404718,0.05319290608167648,-0.04745408892631531,0.007469510659575462,0.051649171859025955,0.0020652529783546925,-0.05772737041115761,0.00554873701184988,-0.0036845675203949213,0.041346244513988495,0.04835839569568634,0.0238394383341074,-0.11541786789894104,-0.01815902814269066,0.01155827846378088,-0.0698034018278122,0.044879667460918427,0.010146931745111942,0.027808697894215584,0.03384612500667572,0.043776459991931915,-0.1081085056066513,-0.05728144571185112,-0.009199238382279873,0.013370316475629807,0.022656310349702835,0.02106654644012451,0.0017284282948821783,-0.007368892431259155,-0.015889307484030724,-0.0198047012090683,0.028659097850322723,0.031234370544552803,0.030591562390327454,-0.041826363652944565,-0.07365202903747559,-0.05863726884126663,0.005236026830971241,0.010184486396610737,0.02379629574716091,0.00679893558844924,0.026819724589586258,0.03838685527443886,0.03257431462407112,-0.0033332007005810738,-0.0864490419626236,-0.08450169116258621,-0.07030824571847916,-0.06029101461172104,-0.052337560802698135,-0.05924786999821663,-0.03188478201627731,-0.07612700015306473,-0.06602782011032104,0.03249499574303627,0.032045286148786545,-0.03449323773384094,0.028878649696707726,-0.05996255576610565,0.0203450508415699,-0.07937837392091751,0.049
66126009821892,-0.03308264538645744,0.049292538315057755,0.027551742270588875,-0.07814554125070572,-0.06124405935406685,0.04698098823428154,-0.05457557365298271,-0.10711847245693207,0.02075541764497757,0.04674331471323967,0.02410520613193512,0.06988665461540222,0.030459139496088028,0.029673665761947632,0.037770453840494156,-0.08807750046253204,-0.01126333512365818,0.08092061430215836,0.007537479046732187,0.03392012417316437,0.06526593118906021,-0.05841786786913872,0.020404169335961342,-0.049709614366292953,0.03572451323270798,0.03479814529418945,-0.05884891748428345,0.012071526609361172,0.05365372821688652,-0.05958378687500954,0.04602493718266487,-0.07443774491548538,0.041104238480329514,0.034925080835819244,0.04881758242845535,-0.016469750553369522,0.029331296682357788,0.02859329804778099,0.054612793028354645,0.013668832369148731,-0.07295277714729309,-0.06085388734936714,0.038344696164131165,-0.06646919250488281,-0.015249960124492645,0.02219722606241703,0.002317242557182908,-0.04584980010986328,0.057733774185180664,-0.08519744127988815,0.024049952626228333,0.018364157527685165,-0.02072151191532612,0.003982945811003447,-0.10561719536781311,0.02979273907840252,-0.04987327381968498,-0.07849834859371185,0.007934740744531155,0.036807265132665634,0.04111422598361969,0.049148209393024445,-0.09208708256483078,-0.07261878997087479,0.03867354243993759,0.033997636288404465,0.03281630575656891,0.024939432740211487,0.009947285987436771,-0.038853589445352554,0.004217676818370819,0.012936485931277275,0.016868174076080322,0.03315247222781181,0.03941621631383896,0.012447413057088852,-0.07002250105142593,0.01717609539628029,-0.07449460029602051,-0.02719980850815773,0.010918683372437954,-0.04549262300133705,0.01623145304620266,0.053597480058670044,-0.0853971317410469,0.027622628957033157,-0.05715799331665039,0.012696417048573494,0.006171656306833029,-0.07740526646375656,-0.01162845827639103,0.039500147104263306,0.025573061779141426,-0.06979802995920181,-0.06028183549642563,0.04068611
189723015,0.022157464176416397,-0.06614097207784653,0.01626768335700035,0.026119401678442955,0.012975613586604595,0.031535692512989044,0.018736770376563072,0.04483550414443016,-0.023903517052531242,0.04630449786782265,0.037567541003227234,-0.060992028564214706,0.01335993129760027,0.015740595757961273,0.012750066816806793,0.03330856189131737,0.02662089094519615,0.036886632442474365,0.022088468074798584,0.0032368008978664875,-0.06527955830097198,-0.06216329336166382,0.042685672640800476,-0.05620260909199715,0.042113807052373886,0.02376597374677658,-0.05867401883006096,0.033393148332834244,0.02914613112807274,-0.05902326852083206,-0.0031576615292578936,-0.06894227117300034,-0.05789955332875252,-0.05461506545543671,0.04082470014691353,0.057532377541065216,0.0765174925327301,0.06278862804174423,0.03027660772204399,-0.024636613205075264,0.04821140319108963,0.04738625884056091,-0.019783632829785347,0.027890894562005997,-0.009368840605020523,0.05040566250681877,-0.001797153614461422,0.003578711301088333,0.05357080325484276,-0.0695725753903389,0.020486855879426003,-0.08710305392742157,0.0044944025576114655,-0.06761045008897781,0.029314877465367317,0.04172307252883911,0.0318637378513813,0.03311328962445259,0.03522223234176636,0.050369441509246826,0.01281869038939476,0.0074380021542310715,-0.06020970270037651,0.014677142724394798,0.04063631594181061,0.03268294408917427,0.036881864070892334,-0.06639500707387924,0.06948862969875336,0.0071528274565935135,-0.05936615914106369,0.05941660702228546,0.026838917285203934,-0.0006050336523912847,0.04615551605820656,0.03221263363957405,0.0429956391453743,0.028975723311305046,0.009937259368598461,0.02933293953537941,0.0007681047427468002,-0.027055863291025162,0.01296965405344963,-0.0793551504611969,0.0235446747392416,0.026026282459497452,0.041221484541893005,-0.06441888958215714,-0.04811068996787071,-0.009411598555743694,0.023835420608520508,0.04085332527756691,-0.07913480699062347,0.029194671660661697,0.052332695573568344,0.00626697344705
46246,0.033208709210157394,-0.10416773706674576,0.003781320294365287,0.01906805858016014,0.011800936423242092,0.04475213587284088,-0.05519372969865799,0.03362629935145378,0.0022386410273611546,0.027802377939224243,-0.025562606751918793,0.029766608029603958,0.024714438244700432,-0.03898638114333153,0.047194499522447586,0.022479314357042313,-0.07421331852674484,0.011782785877585411,0.03859677165746689,0.014212680980563164,0.01999730058014393,0.03465714305639267,0.04840122163295746,0.03194015100598335,0.021001605316996574,-0.005360795650631189,0.03339202702045441,-0.08817162364721298,-0.05788463354110718,-0.018991932272911072,0.009926128201186657,-0.06693514436483383,0.04493489861488342,-0.04398319870233536,0.026823291555047035,-0.035034388303756714,0.008483992889523506,0.034049130976200104,-0.07437850534915924,0.03819500282406807,0.0490955226123333,-0.04417441412806511,-0.09288392961025238,-0.046222228556871414,0.0036631543189287186,0.020891420543193817,0.04090827330946922,-0.06625478714704514,0.04991475120186806,0.043689578771591187,0.004815760534256697,-0.024766966700553894,0.03029077686369419,0.015694953501224518,-0.04965530335903168,0.014839718118309975,0.01525207795202732,-0.06685241311788559,0.02862832322716713,0.05061541125178337,0.028090057894587517,-0.06197202205657959,-0.06672331690788269,-0.05355874076485634,0.009311419911682606,-0.05638501048088074,0.09885109215974808,0.024070121347904205,-0.04510633647441864,-0.005064956843852997,0.03750596567988396,-0.04295717552304268,0.032802287489175797,-0.0972873792052269,0.024752894416451454,-0.002238929970189929,0.040538884699344635,0.04788079857826233,0.017457451671361923,-0.0631578266620636,0.0029476494528353214,0.009611893445253372,0.02568051777780056,0.005539656616747379,0.035837605595588684,-0.058227624744176865,0.026197098195552826,-0.015046726912260056,-0.0016220827819779515,0.04214366525411606,0.02237509749829769,0.04098764806985855,0.02997656911611557,-0.07284840941429138,-0.11325317621231079,0.04695183783
7696075,0.016804732382297516,0.020462367683649063,0.016670765355229378,0.03420377895236015,-0.0555630698800087,0.028231602162122726,0.06478877365589142,0.019855745136737823,0.025523794814944267,-0.022565029561519623,0.05106881633400917,-0.004287736490368843,-0.01257051806896925,0.03778013214468956,-0.0713963508605957,-0.0011696778237819672,-0.009815702214837074,0.026815012097358704,-0.04624961316585541,-0.03201886638998985,0.008973309770226479,0.0054025505669415,0.009504747577011585,0.005194731988012791,-0.05913089960813522,-0.04718157649040222,-0.08097729086875916,-0.04915035143494606,0.023611657321453094,0.0014368495903909206,-0.010382466949522495,0.0382838137447834,-0.05365371331572533,-0.0035727813374251127,-0.08121512830257416,-0.13388198614120483,-0.020450986921787262,-0.0645972490310669,0.002566778566688299,0.024269837886095047,-0.01041062455624342,-0.06228507682681084,0.037596896290779114,0.05975359305739403,0.03501012176275253,0.015445739962160587,-0.04279089719057083,-0.04109042510390282,-0.011383198201656342,-0.02107555791735649,0.03127964586019516,0.02557465061545372,0.016607951372861862,0.052315130829811096,0.01952190510928631,0.027512354776263237,-0.07657983899116516,0.024858396500349045,0.029490211978554726,0.031971123069524765,0.0019712012726813555,-0.032920047640800476,0.0077959345653653145,0.02981044352054596,0.004797802306711674,-0.010275439359247684,-0.009850324131548405,0.04801104962825775,0.03558178246021271,0.02711130864918232,0.026120105758309364,0.012872223742306232,0.029125098139047623,0.01975320465862751,-0.07201159745454788,-0.05201341584324837,0.019590124487876892,-0.010659851133823395,0.007280176505446434,0.059019748121500015,0.06040883809328079,-0.07943958789110184,0.04665379598736763,0.032098542898893356,-0.04006946459412575,0.03952088952064514,0.04412809759378433,-0.0015381444245576859,0.008693463169038296,0.019868547096848488,0.005048331338912249,-0.043332330882549286,-0.06829051673412323,0.006053863558918238,-0.004997326992452145,0
.04765036702156067,0.06544014066457748,-0.06168901547789574,-0.06050042062997818,0.0057837339118123055,0.03510945662856102,0.03501885384321213,0.04173685610294342,-0.053481798619031906,0.04718702659010887,-0.0569387711584568,-0.01094904076308012,-0.03860919922590256,0.04401363804936409,-0.03985997289419174,0.018636897206306458,-0.004632298368960619,0.004343101754784584,-0.055825475603342056,-0.014864529483020306,-0.07084617018699646,-0.06337606906890869,0.0076637351885437965,0.06382157653570175,-0.06868094205856323,0.055222149938344955,0.03996462747454643,-0.01440516673028469,-0.031189268454909325,0.02739126794040203,0.028849242255091667,-0.02752627432346344,0.024715613573789597,-0.07416892051696777]).unsqueeze(0)

def synthesize(text):
  """Turn `text` into speech with the module-level SpeechT5 objects.

  Uses the globals `processor`, `speech_model`, `speaker_embeddings`,
  `vocoder` and `device`.

  Returns a dict with 'sampling_rate' (always 16000) and 'audio' (the
  generated speech, moved to the CPU and converted to a NumPy array).
  """
  token_ids = processor(text=text, return_tensors="pt")["input_ids"].to(device)
  voice = speaker_embeddings.to(device)
  waveform = speech_model.generate_speech(token_ids, voice, vocoder=vocoder)
  return {'sampling_rate': 16000, 'audio': waveform.cpu().numpy()}

# test
#output = synthesize(dubs['chunks'][0]['text'])
#Audio(data=output['audio'], rate=output['sampling_rate'])

# Sample rate of the dubbed track; matches the 'sampling_rate' that
# synthesize() reports.
DUB_SAMPLE_RATE = 16_000
# Debug caps on how much of the clip gets dubbed. Set either one to None to
# lift that limit and dub the whole recording.
MAX_DUB_SECONDS = 60
MAX_DUB_SEGMENTS = 24

# Silent buffer as long as the original audio; each synthesized segment is
# written at the position given by its start timestamp.
audio_data_output = np.zeros(int(audio_length_orig * DUB_SAMPLE_RATE), dtype=np.float64)
for i, chunk in enumerate(tqdm(dubs['chunks'], desc="Processing audio segments...")):
  start_seconds = chunk['timestamp'][0]
  if start_seconds is None:
    # No usable start time, so the segment cannot be placed.
    continue
  offset = int(start_seconds * DUB_SAMPLE_RATE)

  # Check the caps before synthesizing so no work is wasted on a segment that
  # would be dropped anyway.
  past_time_cap = MAX_DUB_SECONDS is not None and offset > MAX_DUB_SECONDS * DUB_SAMPLE_RATE
  past_count_cap = MAX_DUB_SEGMENTS is not None and i >= MAX_DUB_SEGMENTS
  if past_time_cap or past_count_cap:
    break

  output = synthesize(chunk['text'])
  # Clip to the room left in the buffer: assigning a clip longer than the
  # remaining slice would raise a shape-mismatch ValueError.
  length = min(output['audio'].shape[0], audio_data_output.size - offset)
  if length <= 0:
    continue
  audio_data_output[offset:offset+length] = output['audio'][:length]

audio_out_file = 'translation_en.wav'
sf.write(audio_out_file, audio_data_output, DUB_SAMPLE_RATE)

Audio(filename=audio_out_file)

In [None]:
# Print each segment's index, text length and text.
for idx, segment in enumerate(tqdm(dubs['chunks'], desc="Processing audio segments...")):
  text = segment['text']
  print(idx, len(text), text)

## 4. Subtitle track generation

In [1]:
import pandas as pd

subtitle_file_name = 'subtitle.srt'


with open(subtitle_file_name, 'w') as f:
  for i, chunk in enumerate(tqdm(dubs['chunks'], desc="Generating subtitles...")):
    if chunk["timestamp"][0] is None or chunk["timestamp"][1] is None:
      continue
    f.write(f'{i+1}\n')
    f.write(f'{pd.to_datetime(chunk["timestamp"][0], unit="s").to_pydatetime().strftime("%H:%M:%S,%f")[:-3]}')
    f.write(f' --> {pd.to_datetime(chunk["timestamp"][1], unit="s").to_pydatetime().strftime("%H:%M:%S,%f")[:-3]}\n')
    f.write(f'{chunk["text"]}\n\n')

print()
!head $subtitle_file_name -n 10

NameError: name 'tqdm' is not defined

## 5. Compose video

In [None]:
video_out_file = 'result.mp4'

!ls -l "$video_file_name"
!ls -l "$audio_out_file"
!ls -l "$subtitle_file_name"
!ffmpeg -i "$video_file_name" -i "$audio_out_file" -i "$subtitle_file_name" \
  -c:v copy -map 0:v:0 -map 1:a:0 -c:s mov_text -metadata:s:s:0 language=eng $video_out_file

In [None]:
from IPython.display import HTML
from base64 import b64encode

# Read the composed video inside a context manager so the file handle is
# closed right away instead of being left to the garbage collector.
with open(video_out_file, 'rb') as result_file:
  mp4 = result_file.read()

# Embed the result as a base64 data URL for inline playback.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=852 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)