<a href="https://colab.research.google.com/github/bhagesh-codebeast/VideoTranscribeTranslate/blob/main/VideoTranscribeandTranslate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

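# Convert video to text

This notebook extracts the audio track from a YouTube URL or a local video file, transcribes it with insanely-fast-whisper, translates the transcript with Google Translate (via deep-translator), and synthesises the translated text back into speech with WhisperSpeech.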

In [None]:
# Show which Python interpreter version the runtime's `python` command resolves to.
!python --version

Python 3.10.12


In [None]:
!pipx ensurepath
!apt install pipx
!pip install pydub
!pip install pytube
!pip install moviepy
!pip install -U deep-translator
!pipx install insanely-fast-whisper
!pip install -Uqq WhisperSpeech

In [None]:
# Display the languages GoogleTranslator reports as supported.
# as_dict=True asks for a dict rather than a list; presumably it maps language
# names to the codes accepted by the `source`/`language` arguments used later
# in this notebook -- TODO confirm against the deep-translator docs.
from deep_translator import GoogleTranslator
GoogleTranslator().get_supported_languages(as_dict=True)

In [27]:
import os
import re
import json
import glob
import torch
import subprocess
from pytube import YouTube
from pydub import AudioSegment
import torch.nn.functional as F
from moviepy.editor import VideoFileClip
from whisperspeech.pipeline import Pipeline
from deep_translator import GoogleTranslator


class videoTranscribeandTranslate:
  """Turn a video into translated speech.

  Pipeline: getAudio -> transcribeAudio -> getText -> translateText -> changeAudio.
  Each later stage calls the earlier ones itself, so calling ``changeAudio()``
  alone runs the whole chain.

  Args:
    input_video_path: URL starting with ``http`` (handled by pytube's ``YouTube``)
      or a path to a local video file (handled by moviepy's ``VideoFileClip``).
    ifw_path: Path to the insanely-fast-whisper executable.
    output_audio_path: Where the extracted/downloaded audio is written.
    ifw_transcript: Where insanely-fast-whisper writes its JSON transcript.
    translate: When False, ``translateText`` returns None and ``changeAudio``
      synthesises the untranslated transcript.
    source: Source language passed to ``GoogleTranslator``.
    language: Target language passed to ``GoogleTranslator`` and WhisperSpeech.
    speaker: Passed to WhisperSpeech as the ``speaker`` argument.
    translated_audio: Path of the final merged audio file.
  """

  # Maximum number of characters per chunk sent to the translator / synthesiser.
  CHUNK_SIZE = 2000

  def __init__(self, input_video_path,  ifw_path, output_audio_path=os.path.join(os.getcwd(),'downloaded_audio.mp3'), ifw_transcript=os.path.join(os.getcwd(),'downloaded_timestamp.json'),translate=True,source='auto',language='en',speaker=os.path.join(os.getcwd(),'downloaded_audio.mp3'),translated_audio=os.path.join(os.getcwd(),'translated_audio.mp3')):
    self.input_video_path = input_video_path
    self.output_audio_path = output_audio_path
    self.ifw_transcript = ifw_transcript
    self.ifw_path = ifw_path
    self.translate = translate
    self.source = source
    self.language = language
    self.speaker = speaker
    self.translated_audio = translated_audio

  def getAudio(self):
    """Write the video's audio track to ``output_audio_path`` and return that path.

    Raises:
      ValueError: If no audio-only mp4 stream (URL input) or no audio track
        (local file input) is available.
    """
    if self.input_video_path and self.input_video_path.startswith('http'):
      stream = YouTube(self.input_video_path).streams.filter(only_audio=True, file_extension='mp4').first()
      if stream is None:
        raise ValueError(f"No audio-only mp4 stream found for {self.input_video_path}")
      stream.download(filename=self.output_audio_path)
    else:
      video_clip = VideoFileClip(self.input_video_path)
      try:
        audio_clip = video_clip.audio
        if audio_clip is None:
          raise ValueError(f"{self.input_video_path} has no audio track")
        audio_clip.write_audiofile(self.output_audio_path, codec='mp3')
      finally:
        # Release the reader resources held by the clip even if the export fails.
        video_clip.close()
    return self.output_audio_path

  def transcribeAudio(self):
    """Run insanely-fast-whisper on the extracted audio.

    Returns:
      ``'output.json'`` if that literal appears in the tool's stdout, ``''`` if
      it does not, or None when the audio file does not exist.

    Raises:
      RuntimeError: If the transcription command exits with a non-zero status.
    """
    if not os.path.exists(self.getAudio()):
      return None
    command = [self.ifw_path, "--transcript-path", self.ifw_transcript, "--file-name", self.output_audio_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if result.returncode != 0:
      # Surface stderr instead of continuing with a missing or stale transcript.
      raise RuntimeError(f"insanely-fast-whisper failed with exit code {result.returncode}: {result.stderr.strip()}")
    match = re.search(r'output\.json', result.stdout)
    return match.group() if match else ''

  def getText(self):
    """Transcribe the audio and return the ``"text"`` field of the JSON transcript ('' if absent)."""
    self.transcribeAudio()
    with open(self.ifw_transcript, "r", encoding="utf-8") as transcript_file:
      return json.load(transcript_file).get("text", "")

  @staticmethod
  def _chunkText(text, limit=2000):
    """Split ``text`` into whitespace-stripped chunks of at most ``limit`` characters.

    Chunks are cut at the last space inside the window so no word is split or
    dropped; a run longer than ``limit`` without any space is cut at ``limit``.
    """
    chunks = []
    start = 0
    total = len(text)
    while start < total:
      end = min(start + limit, total)
      if end < total:
        cut = text.rfind(' ', start, end)
        if cut > start:
          end = cut
      piece = text[start:end].strip()
      if piece:
        chunks.append(piece)
      # The next window starts exactly where this one ended, so nothing is skipped.
      start = end
    return chunks

  def _translateChunks(self):
    """Return ``[(chunk, translation), ...]`` for the transcript, in order."""
    translator = GoogleTranslator(source=self.source, target=self.language)
    return [(chunk, translator.translate(text=chunk)) for chunk in self._chunkText(self.getText(), self.CHUNK_SIZE)]

  def translateText(self):
    """Translate the transcript chunk by chunk.

    Returns:
      A dict mapping each source chunk to a one-element set holding its
      translation (shape kept for backward compatibility), or None when
      ``translate`` is False.
    """
    if not self.translate:
      return None
    return {chunk: {translation} for chunk, translation in self._translateChunks()}

  def changeAudio(self):
    """Synthesise the (translated) transcript to speech and merge it into one file.

    Only runs when ``language == 'en'``; otherwise returns None.

    Returns:
      ``translated_audio`` path on success, None when the language is not 'en'.
    """
    if self.language != 'en':
      return None
    if self.translate:
      # Use the plain strings, not the set-wrapped values of translateText().
      texts = [translation for _, translation in self._translateChunks() if translation]
    else:
      texts = self._chunkText(self.getText(), self.CHUNK_SIZE)
    # Q4 tiny model (calling Pipeline() without arguments selects the default model instead).
    pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')
    temp_dir = os.path.dirname(self.translated_audio)
    temp_files = []
    try:
      for i, item in enumerate(texts, 1):
        filename = os.path.join(temp_dir, f"temp_{i}.mp3")
        # Record the name first so a partially written file is still cleaned up.
        temp_files.append(filename)
        pipe.generate_to_file(filename, item, lang=self.language, speaker=self.speaker)
      # NOTE(review): silent() is kept as the accumulator exactly as before; with
      # pydub's defaults this presumably prepends a short silence -- confirm
      # whether AudioSegment.empty() was intended.
      merged_audio = AudioSegment.silent()
      # temp_files is already in generation order, so no filename parsing or glob is needed.
      for mp3_file in temp_files:
        merged_audio += AudioSegment.from_file(mp3_file, format="mp3")
      merged_audio.export(self.translated_audio, format="mp3")
    finally:
      for mp3_file in temp_files:
        if os.path.exists(mp3_file):
          os.remove(mp3_file)
    return self.translated_audio


In [28]:
# Video to process: a URL starting with 'http' is downloaded, anything else is
# treated as a local video file path.
input_video_path = 'https://www.youtube.com/watch?v=p8QOnty6rSU'
# Path to the insanely-fast-whisper executable inside its pipx venv.
# NOTE(review): this location assumes pipx ran as root (e.g. on Colab) -- adjust for other environments.
ifw_path = '/root/.local/pipx/venvs/insanely-fast-whisper/bin/insanely-fast-whisper'

# All other constructor arguments keep their defaults (output files in the current working directory).
instance = videoTranscribeandTranslate(input_video_path, ifw_path)

In [29]:
%%time
# changeAudio() triggers the full chain itself (audio extraction, transcription,
# translation, speech synthesis). Despite its name, `text` receives the value
# changeAudio() returns, which is the translated audio file path (or None).
text = instance.changeAudio()




CPU times: user 4min 8s, sys: 2.3 s, total: 4min 11s
Wall time: 5min 32s


----------------------------------------------------

# References
1. [insanely-fast-whisper](https://github.com/Vaibhavs10/insanely-fast-whisper)
2. [YouTube Summariser](https://github.com/jxcinta/youtube_summariser/blob/main/youtube_summariser.py)
3. [WhisperSpeech](https://github.com/collabora/WhisperSpeech/tree/main)