<a href="https://colab.research.google.com/github/bhagesh-codebeast/TextBasedVideoEditing/blob/main/TextBasedVideoEditing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%time
# Install the notebook's runtime dependencies. Only gradio and gradio_client are
# version-pinned; every other package resolves to whatever is current at install time.
!pip install -q --progress-bar off torch transformers gradio_client==0.2.7 gradio==3.35.2 datasets librosa ffmpeg-python python-dotenv aiohttp

In [2]:
import os
import time
import json
import torch
import base64
import ffmpeg
import aiohttp
import asyncio
import datasets
import gradio as gr
from pathlib import Path
from difflib import Differ
from transformers import pipeline

In [None]:
# Checkpoint name handed to the speech-recognition pipeline.
MODEL = "facebook/wav2vec2-base-960h"

# Select the compute target once: the first CUDA device when available, CPU otherwise.
# `cuda` is the torch.device object, `device` is the integer index form (0 or -1).
if torch.cuda.is_available():
  cuda = torch.device('cuda:0')
  device = 0
else:
  cuda = torch.device('cpu')
  device = -1

# Directory that receives the edited videos; created up front if it is missing.
videos_out_path = Path("./videos_out")
videos_out_path.mkdir(parents=True, exist_ok=True)

In [None]:
%%time
# Build the automatic-speech-recognition pipeline from the MODEL checkpoint,
# using the same checkpoint name for the tokenizer, on the configured device.
# Note from the original author: run this block twice to load models.
speech_recognizer = pipeline(
  task="automatic-speech-recognition",
  model=MODEL,
  tokenizer=MODEL,
  framework="pt",
  device=device,
)

In [10]:
async def speech_to_text(video_file_path):
  """
  Transcribe the audio track of a video and return per-character timestamps.

  The video is converted by ffmpeg to mono 16k WAV bytes (piped through stdout,
  no temporary file) and the bytes are passed to the module-level
  `speech_recognizer` pipeline. Both steps are blocking, so they run in the
  default executor instead of directly on the event loop.

  Args:
    video_file_path: path of the input video, or None when nothing was provided.

  Returns:
    A tuple (transcription, transcription, timestamps). The lower-cased
    transcription is returned twice because the Transcribe button maps the
    outputs to both the editable textbox and the hidden original-transcription
    state. `timestamps` is a list of [text, start, end] entries, one per chunk
    returned by the recognizer, with start/end converted to plain floats.

  Raises:
    ValueError: if `video_file_path` is None.
    RuntimeError: if the ffmpeg conversion or the model inference fails; the
      original exception is attached as the cause.
  """
  if video_file_path is None:
    raise ValueError("Error no video input")
  video_path = Path(video_file_path)
  loop = asyncio.get_running_loop()

  def extract_audio():
    # convert video to audio 16k using PIPE; pass a plain string path to ffmpeg
    stream = ffmpeg.input(str(video_path)).output('-', format="wav", ac=1, ar='16k')
    stdout, _ = stream.overwrite_output().global_args('-loglevel', 'quiet').run(capture_stdout=True)
    return stdout

  def transcribe(audio_bytes):
    return speech_recognizer(audio_bytes, return_timestamps="char", chunk_length_s=10, stride_length_s=(4, 2))

  try:
    audio_memory = await loop.run_in_executor(None, extract_audio)
  except Exception as e:
    raise RuntimeError("Error converting video to audio") from e

  try:
    output = await loop.run_in_executor(None, transcribe, audio_memory)
    transcription = output["text"].lower()
    # float() accepts both numpy scalars and built-in floats, so the result does
    # not depend on which numeric type the pipeline uses for its timestamps.
    timestamps = [
      [chunk["text"].lower(), float(chunk["timestamp"][0]), float(chunk["timestamp"][1])]
      for chunk in output['chunks']
    ]
  except Exception as e:
    raise RuntimeError("Error Running inference with local model", e) from e
  return (transcription, transcription, timestamps)

async def cut_timestamps_to_video(video_in, transcription, text_in, timestamps):
  """
  Cut a video down to the parts whose transcript characters survive in the edited text.

  The original transcription is diffed character by character against the
  edited text. Characters removed by the edit split the transcript into runs of
  kept characters; each run becomes one [start, end] segment, and ffmpeg keeps
  only the frames and audio samples that fall inside those segments.

  Args:
    video_in: path of the original video.
    transcription: original transcript text.
    text_in: edited transcript text; additions are ignored, only cuts matter.
    timestamps: list of [text, start, end] entries aligned with `transcription`.

  Returns:
    A tuple (tokens, output_video). `tokens` is a list of (char, tag) pairs for
    the highlighted diff, where tag is None for an unchanged character and the
    diff marker (e.g. '-') otherwise. `output_video` is the path of the cut
    video under ./videos_out, or `video_in` unchanged when no segment was found.

  Raises:
    ValueError: if any of the inputs is None.
  """
  # Validate before touching Path(): a missing input must surface as ValueError.
  if video_in is None or text_in is None or transcription is None or timestamps is None:
    raise ValueError("Inputs undefined")
  video_file_name = Path(video_in).stem

  # compare original transcription with edit text
  diff_chars = Differ().compare(transcription, text_in)
  # remove all text additions from diff
  filtered = [line for line in diff_chars if line[0] != '+']

  # group character timestamps so there are fewer cuts: a removed character
  # closes the current segment, the next kept character opens a new one
  timestamps_to_cut = []
  open_segment = None
  for diff_line, stamp in zip(filtered, timestamps):
    if diff_line[0] == '-':
      open_segment = None
    elif open_segment is None:
      # first kept character of a run: its start and end seed the segment
      open_segment = [stamp[1], stamp[2]]
      timestamps_to_cut.append(open_segment)
    else:
      # extend the running segment to the end of this character
      open_segment[1] = stamp[2]

  if timestamps_to_cut:
    between_str = '+'.join(f'between(t,{start},{end})' for start, end in timestamps_to_cut)
    video_file = ffmpeg.input(video_in)
    video = video_file.video.filter("select", f'({between_str})').filter("setpts", "N/FRAME_RATE/TB")
    audio = video_file.audio.filter("aselect", f'({between_str})').filter("asetpts", "N/SR/TB")
    output_video = f'./videos_out/{video_file_name}.mp4'
    stream = ffmpeg.concat(video, audio, v=1, a=1).output(output_video).overwrite_output().global_args('-loglevel', 'quiet')
    # The ffmpeg run blocks until encoding finishes; keep it off the event loop.
    await asyncio.get_running_loop().run_in_executor(None, stream.run)
  else:
    output_video = video_in

  tokens = [(token[2:], token[0] if token[0] != " " else None) for token in filtered]
  return (tokens, output_video)

# ---- Gradio Layout -----
# The components are created here, outside of gr.Blocks, and are placed into the
# layout later through their .render() calls.
video_in = gr.Video(label="Video file", elem_id="video-container")
text_in = gr.Textbox(label="Transcription", lines=10, interactive=True)
video_out = gr.Video(label="Video Out")
diff_out = gr.HighlightedText(label="Cut Diffs", combine_adjacent=True)

# Custom stylesheet passed to gr.Blocks. The selectors use the elem_id values
# assigned to the buttons and the input video.
# NOTE(review): `#\\31 3` is the CSS escape for an element whose id is "13";
# presumably an auto-generated component id — confirm it still matches something.
css = """
#edit_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
#video-container{ align-self:stretch; }
"""

# Assemble the UI: upload + transcribe on top, transcript editing on the left,
# resulting video and diff view on the right.
with gr.Blocks(css=css) as demo:
  # Hidden per-session state: the untouched transcription and its timestamps,
  # filled in by the Transcribe button and consumed by the Edit Video button.
  transcription_var = gr.State()
  timestamps_var = gr.State()
  with gr.Row():
    with gr.Column():
      gr.Markdown("""
      # Edit Video By Editing Text
      This project is a simple video editor where the edits are made by editing the audio transcription.
      ## Upload Video
      """)
  with gr.Row():
    with gr.Column():
      video_in.render()
      transcribe_btn = gr.Button("Transcribe", elem_id="transcribe_btn")
      # speech_to_text returns (text, text, timestamps): textbox, original copy, timestamps
      transcribe_btn.click(speech_to_text, [video_in], [text_in, transcription_var, timestamps_var])
  with gr.Row():
    with gr.Column():
      gr.Markdown("""## Edit Transcript
      Edit the text below (only cuts, not additions).""")
      text_in.render()
      with gr.Row():
        cut_btn = gr.Button("Edit Video", elem_id="edit_btn")
        # send video path and hidden variables
        cut_btn.click(cut_timestamps_to_video, [video_in, transcription_var, text_in, timestamps_var], [diff_out, video_out])
        reset_transcription = gr.Button("Reset Transcription", elem_id="reset_btn")
        # restore the textbox from the stored original transcription
        reset_transcription.click(lambda x: x, transcription_var, text_in)
    with gr.Column():
      gr.Markdown("""## Output Video""")
      video_out.render()
      diff_out.render()
# Enable request queuing for the app before it is launched.
demo.queue()

# Start the Gradio server when this is executed as the main module.
# debug=True makes the call block so errors and logs stay visible in the cell.
if __name__ == "__main__":
  demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://08d620828ee8c83301.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://08d620828ee8c83301.gradio.live
