#### installs

In [26]:
!pip install datasets
!pip install pytesseract
!pip install sounddevice
!pip install moviepy
!pip install ffmpeg-python librosa
!pip install transformers
!pip install ffprobe
!pip install sentencepiece



#### model
* **fn: text_to_speech(text)**

In [27]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
from datasets import load_dataset
import librosa
import matplotlib.pyplot as plt
import IPython as ipy

In [28]:
# Text-to-speech stack used by text_to_speech(): the processor tokenizes text,
# the model is passed the token ids, and the vocoder is handed to
# model.generate_speech.
processor = SpeechT5Processor.from_pretrained('microsoft/speecht5_tts')
model = SpeechT5ForTextToSpeech.from_pretrained('microsoft/speecht5_tts')
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Speaker embeddings; each row's 'xvector' field is used to select the voice.
embedding_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split='validation')

In [29]:
def text_to_speech(text, voice_id=7411):
  """Synthesize speech for `text` with the notebook's SpeechT5 model.

  Args:
    text: The text to speak.
    voice_id: Row index into `embedding_dataset`; that row's 'xvector' entry
      is used as the speaker embedding. Defaults to 7411, the voice that was
      previously hard-coded.

  Returns:
    The value returned by `model.generate_speech` when called with the
    HiFi-GAN vocoder (the notebook plays it back at 16 kHz).
  """
  import warnings

  # unsqueeze(0) adds the leading batch dimension to the single embedding.
  speaker_embedding = torch.tensor(embedding_dataset[voice_id]['xvector']).unsqueeze(0)
  inputs = processor(text=text, return_tensors='pt', padding=True)
  input_ids = inputs["input_ids"]

  # SpeechT5Config.max_text_positions bounds the input sequence length (see the
  # SpeechT5 docs). Clip over-long inputs and say so instead of letting
  # generation fail part-way through a long subtitle run.
  max_len = getattr(model.config, "max_text_positions", None)
  if max_len is not None and input_ids.shape[-1] > max_len:
    warnings.warn(
        f"Input has {input_ids.shape[-1]} tokens; truncating to {max_len} for SpeechT5."
    )
    input_ids = input_ids[..., :max_len]

  speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
  return speech


In [30]:
# Smoke test: synthesize one sentence and play it back at 16 kHz.
ipy.display.Audio(text_to_speech("A puppy is in deep sleep. She's obviously sleeping"), rate=16000)

#### video to audio
**input_audio.wav will be created from input_video.mp4**

In [31]:
import os
from moviepy.editor import VideoFileClip

def extract_audio(input_file, output_file, overwrite=True):
    """Write the audio track of a video file to `output_file`.

    The audio is written with fps=16000, nbytes=2 and codec 'pcm_s16le'.
    Failures are reported with `print` rather than raised, so the function
    always returns None.

    Args:
        input_file: Path of the video to read.
        output_file: Path of the audio file to create.
        overwrite: When False and `output_file` already exists, nothing is
            written and a message is printed instead.
    """
    video = None
    try:
        # Refuse to clobber an existing file unless the caller allowed it.
        if os.path.exists(output_file) and not overwrite:
            raise FileExistsError(f"Output file '{output_file}' already exists. Use overwrite=True to replace.")

        video = VideoFileClip(input_file)

        if video.audio is None:
            raise ValueError("No audio stream found in the input file.")

        video.audio.write_audiofile(output_file, fps=16000, nbytes=2, codec='pcm_s16le')

        print(f"Audio extracted successfully to {output_file}")

    except FileExistsError as e:
        print(str(e))
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        # Release the clip on failure paths too (missing audio stream, write
        # error); previously it was only closed after a successful write.
        if video is not None:
            video.close()

# Example usage: pull the soundtrack out of the input video into the WAV file
# that the "audio to text" section loads later.
input_file = 'input_video.mp4'
output_file = 'input_audio.wav'
extract_audio(input_file, output_file)

MoviePy - Writing audio in input_audio.wav


chunk:  64%|██████▍   | 798/1240 [00:00<00:00, 3940.00it/s, now=None]

                                                                      

MoviePy - Done.
Audio extracted successfully to input_audio.wav




#### video to subtitles
**fn: transcribe(audio)**

In [32]:
from transformers import pipeline
from datasets import load_dataset
import IPython as ipy

In [33]:
# Speech-recognition pipeline used by transcribe().
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base")
# Stream the English VoxPopuli validation split and keep its first example as
# a test clip.
dataset = load_dataset("facebook/voxpopuli", "en", split="validation", streaming=True, trust_remote_code=True)
sample = next(iter(dataset))

In [34]:
# Listen to the test clip at the sampling rate stored alongside it.
ipy.display.Audio(sample['audio']['array'], rate=sample['audio']['sampling_rate'], embed=True)

In [35]:
def transcribe(audio):
  """Transcribe audio with the Whisper pipeline and return timestamped chunks.

  Args:
    audio: Input accepted by `pipe`. In this notebook it is a dict with
      'path', 'array' and 'sampling_rate' keys.

  Returns:
    The 'chunks' entry of the pipeline output: a list of dicts, each with a
    'timestamp' (start, end) tuple and a 'text' string.
  """
  # Callers in this notebook pass `.copy()` of their dict, presumably because
  # the pipeline removes keys from dict inputs. Copy here so the caller's dict
  # survives whether or not they remembered to.
  if isinstance(audio, dict):
    audio = dict(audio)
  outputs = pipe(audio, max_new_tokens=256, generate_kwargs={'task': 'transcribe'}, return_timestamps=True)
  return outputs['chunks']

In [36]:
# Sanity-check transcribe() on the dataset clip; a copy of the audio dict is
# passed so `sample['audio']` itself is not handed to the pipeline.
transcribe(sample['audio'].copy())

[{'timestamp': (0.0, 4.56),
  'text': ' Many measures have been taken not only in September but also in March.'},
 {'timestamp': (4.56, 10.56),
  'text': ' And of course we see some effects of those measures, perhaps not enough, but there are effects of those measures.'},
 {'timestamp': (10.56, 15.32),
  'text': " And the situation could have been worse if we didn't have taken those measures."}]

In [37]:
# Inspect the layout of the audio dict (path / array / sampling_rate).
sample['audio'].copy()

{'path': 'dev_part_0/20160526-0900-PLENARY-3-en_20160526-10:18:50_2.wav',
 'array': array([-0.00088501, -0.0007019 , -0.00057983, ...,  0.00085449,
         0.00067139,  0.00061035]),
 'sampling_rate': 16000}

In [38]:
# Shape of the sample's waveform array.
sample['audio']['array'].shape

(245439,)

#### audio to text

In [39]:
import numpy as np
def load_audio_for_whisper(file_path, sr=16000):
    """Load an audio file into the dict layout that transcribe() is given.

    Args:
        file_path: Path of the audio file, passed to `librosa.load`.
        sr: Sampling rate requested from `librosa.load`.

    Returns:
        A dict with 'path' (the given path), 'array' (the samples cast to
        float32) and 'sampling_rate' (the rate reported by librosa), or None
        if anything raised; the error is printed in that case.
    """
    try:
        samples, sample_rate = librosa.load(file_path, sr=sr)
        return {
            'path': file_path,
            'array': samples.astype(np.float32),
            'sampling_rate': sample_rate,
        }
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error loading audio file: {str(err)}")
        return None

In [40]:
# Load the extracted soundtrack; the result is None if loading failed.
audio_data = load_audio_for_whisper('input_audio.wav')
audio_data

{'path': 'input_audio.wav',
 'array': array([0.        , 0.        , 0.        , ..., 0.04510498, 0.04690552,
        0.04721069], dtype=float32),
 'sampling_rate': 16000}

In [41]:
# Transcribe the video's soundtrack; get_subs() later returns this result.
transcribed_text = transcribe(audio_data.copy())

In [42]:
# Peek at the first five timestamped chunks.
transcribed_text[:5]

[{'timestamp': (0.0, 5.08),
  'text': " Say I'm running a coin flip experiment, and I want to find out how likely each alchemy's"},
 {'timestamp': (5.08, 6.44), 'text': ' heads or tails.'},
 {'timestamp': (6.44, 11.52),
  'text': " So I public coin once, twice, a hundred times, and once I've repeated that experiment enough"},
 {'timestamp': (11.52, 16.28),
  'text': ' times, I see that about 50% of my flips or heads and 50% are tails.'},
 {'timestamp': (16.28, 19.12),
  'text': " Now, that's not a particularly interesting result."}]

In [43]:
# Re-synthesize the first transcribed chunk and play it at 16 kHz.
ipy.display.Audio(text_to_speech(transcribed_text[0]['text']), rate=16_000)

In [44]:
# Same check for the third chunk.
ipy.display.Audio(text_to_speech(transcribed_text[2]['text']), rate=16_000)

#### sample video and its subtitles
* **fn: get_subs()**

In [45]:
def get_subs():
  """Return the notebook-level `transcribed_text` (the timestamped chunks).

  The same list object is returned each time, not a copy.
  """
  return transcribed_text

#### text to audio

In [46]:
from tqdm import tqdm
import os
import soundfile as sf
import numpy as np

def create_audio_from_subs(subs=None, output_path='output.wav', sample_rate=16000):
    """Synthesize every subtitle with text_to_speech() and write one WAV file.

    Segments are concatenated back-to-back in subtitle order; the subtitle
    timestamps are NOT used, so the result is not aligned to the video.

    Args:
        subs: Iterable of dicts with a 'text' key. Defaults to get_subs().
        output_path: File to write. Defaults to 'output.wav'.
        sample_rate: Sample rate passed to the writer. Defaults to 16000.

    Raises:
        ValueError: If there is no non-blank subtitle text to synthesize.
    """
    if subs is None:
        subs = get_subs()

    audio_segments = []
    for sub in tqdm(subs, desc="Processing subtitles"):
        # Blank subtitles carry nothing to speak; skip them.
        if not sub['text'].strip():
            continue
        audio_tensor = text_to_speech(sub['text'])
        # reshape(-1) always yields a 1-D array, so concatenation cannot trip
        # over a 0-d result the way squeeze() could for a one-sample output.
        audio_segments.append(audio_tensor.detach().cpu().numpy().reshape(-1))

    if not audio_segments:
        raise ValueError("No subtitle text to synthesize; no audio file was written.")

    # Concatenate all audio segments and write the final audio file.
    full_audio = np.concatenate(audio_segments)
    sf.write(output_path, full_audio, sample_rate)

    print(f"Audio file '{output_path}' has been created.")

# Build output.wav from the transcript (this was the slowest cell in the recorded run).
create_audio_from_subs()

Processing subtitles: 100%|██████████| 33/33 [01:55<00:00,  3.49s/it]

Audio file 'output.wav' has been created.





#### remove original audio from video

In [47]:
from moviepy.editor import VideoFileClip

def remove_audio_from_video(input_video_path, output_video_path):
    """Write a copy of a video with its audio track removed.

    Args:
        input_video_path: Path of the video to read.
        output_video_path: Path of the silent video to write (codec 'libx264').

    Any exception from loading or writing propagates to the caller; the clips
    are closed either way.
    """
    video = VideoFileClip(input_video_path)
    try:
        video_without_audio = video.without_audio()
        try:
            video_without_audio.write_videofile(output_video_path, codec='libx264')
        finally:
            video_without_audio.close()
    finally:
        # Runs even when writing fails, so the source clip is always closed.
        video.close()

# Usage: strip the original soundtrack so the synthesized audio can be attached
# in the "merge audio and video" section.
input_video = "input_video.mp4"
output_video = "video_without_audio.mp4"

remove_audio_from_video(input_video, output_video)
print(f"Video without audio has been created: {output_video}")

Moviepy - Building video video_without_audio.mp4.
Moviepy - Writing video video_without_audio.mp4



t:   2%|▏         | 113/4649 [00:00<00:08, 558.73it/s, now=None]

                                                                 

Moviepy - Done !
Moviepy - video ready video_without_audio.mp4
Video without audio has been created: video_without_audio.mp4




#### merge audio and video

In [48]:
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeVideoClip

def merge_video_and_audio(video_path, audio_path, output_path):
    """Attach an audio file to a video and write the combined result.

    Args:
        video_path: Path of the video to read.
        audio_path: Path of the audio file to attach.
        output_path: Path of the video to write (codec 'libx264', audio
            codec 'aac').

    Any exception from loading or writing propagates to the caller; every clip
    that was opened is closed either way. The audio is attached as-is: no
    trimming or padding is done to match the video's length.
    """
    video = VideoFileClip(video_path)
    audio = None
    video_with_audio = None
    try:
        audio = AudioFileClip(audio_path)
        video_with_audio = video.set_audio(audio)
        video_with_audio.write_videofile(output_path, codec='libx264', audio_codec='aac')
    finally:
        # Close whatever was opened, including on failure paths.
        for clip in (video, audio, video_with_audio):
            if clip is not None:
                clip.close()

# Usage: combine the silent video with the synthesized narration (output.wav).
video_without_audio = "video_without_audio.mp4"
audio_file = "output.wav"
final_video = "final_video_with_audio.mp4"

merge_video_and_audio(video_without_audio, audio_file, final_video)
print(f"Final video with merged audio has been created: {final_video}")


Moviepy - Building video final_video_with_audio.mp4.
MoviePy - Writing audio in final_video_with_audioTEMP_MPY_wvf_snd.mp4


chunk:  12%|█▏        | 459/3795 [00:00<00:01, 2297.12it/s, now=None]

                                                                      

MoviePy - Done.
Moviepy - Writing video final_video_with_audio.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready final_video_with_audio.mp4
Final video with merged audio has been created: final_video_with_audio.mp4
