<a href="https://colab.research.google.com/github/anujpunekar20/video-translator/blob/main/flask_vt_api.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Flask, pyngrok, and the other required libraries/modules


In [None]:
# Environment setup cell: installs the Python dependencies, clones Wav2Lip and
# downloads its GAN checkpoint. Uses IPython `!`/`%` magics, so it only runs
# inside a notebook kernel.

# Override the reported preferred encoding so it is always "UTF-8".
# NOTE(review): presumably a Colab workaround so the `!` shell commands below
# do not fail on a non-UTF-8 locale — confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

# Web framework (Flask) and the ngrok tunnel client (pyngrok).
!pip install flask pyngrok

# Ensure Wav2Lip and its requirements are installed
!git clone https://github.com/Rudrabha/Wav2Lip.git
!pip install -r Wav2Lip/requirements.txt
# Audio/video helpers and the Google Drive downloader used for the checkpoint.
!pip install librosa moviepy gdown
# Translation (transformers) and text-to-speech (TTS) libraries.
!pip install transformers TTS
# Whisper is installed from source; the second command force-reinstalls the
# package itself (without dependencies) from the same repository.
# NOTE(review): confirm both commands are still required.
!pip install git+https://github.com/openai/whisper.git
!pip install --upgrade --no-deps --force-reinstall git+https://github.com/openai/whisper.git
# Switch the notebook's working directory to the Wav2Lip checkout so the
# checkpoint lands in Wav2Lip/checkpoints/.
%cd Wav2Lip
# NOTE(review): recent gdown releases reportedly deprecate `--id` in favour of
# passing the file ID positionally — verify against the installed version.
!gdown --id 1_OvqStxNxLc7bXzlaVG5sz695p-FVfYY -O checkpoints/wav2lip_gan.pth

# Flask app

In [None]:
from pyngrok import ngrok
# Register the ngrok auth token with pyngrok before any tunnel is opened.
# Replace the placeholder below with a real token; avoid committing it.
token = "your-ngrok-authtoken"
ngrok.set_auth_token(token)

In [None]:
import os
import subprocess
from flask import Flask, request, jsonify, send_file
import whisper
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from TTS.api import TTS
import moviepy.editor as mp
# Work from inside the Wav2Lip checkout: the relative paths used below
# (inference.py, checkpoints/, uploads/, results/) resolve against it.
# NOTE(review): the setup cell also ends with `%cd Wav2Lip`; if both cells run
# in the same kernel session this looks for Wav2Lip/Wav2Lip — confirm intent.
%cd Wav2Lip

# Flask application object; routes are registered on it below.
app = Flask(__name__)

# Make sure the working directories exist: "uploads" receives the files posted
# to /upload and "results" receives the Wav2Lip output video.
# exist_ok=True keeps this safe to re-run and avoids the check-then-create
# race of a separate os.path.exists() test.
for directory in ("uploads", "results"):
    os.makedirs(directory, exist_ok=True)


def _safe_upload_name(filename):
    """Reduce a client-supplied filename to a bare file name.

    Directory components (with either ``/`` or ``\\`` separators) are dropped
    so the result cannot escape the upload directory. Returns ``""`` when
    nothing usable remains (missing name, ``.`` or ``..``).
    """
    name = os.path.basename((filename or "").replace("\\", "/"))
    return "" if name in ("", ".", "..") else name


@app.route("/upload", methods=["POST"])
def upload():
    """Translate an uploaded video's speech to Hindi and lip-sync the result.

    Expects a multipart POST with two files:
      * ``video``     - the source video (speech is treated as English).
      * ``ref_audio`` - reference speaker audio passed to the TTS model as
        ``speaker_wav``.

    Pipeline: extract audio -> Whisper transcription -> NLLB-200 translation
    (eng_Latn -> hin_Deva) -> XTTS v2 speech synthesis -> Wav2Lip inference.

    Returns:
        The lip-synced video as a file attachment on success, otherwise a
        JSON ``{"error": ...}`` body with status 400/422 (bad input) or 500
        (lip-sync failure).
    """
    import gc
    import sys

    video_file = request.files.get('video')
    ref_audio_file = request.files.get('ref_audio')
    if video_file is None or ref_audio_file is None:
        return jsonify({"error": "Both 'video' and 'ref_audio' files are required."}), 400

    # Never trust client-supplied filenames: strip any directory components.
    video_name = _safe_upload_name(video_file.filename)
    ref_audio_name = _safe_upload_name(ref_audio_file.filename)
    if not video_name or not ref_audio_name:
        return jsonify({"error": "Uploaded files must have valid filenames."}), 400
    if ref_audio_name == video_name:
        # Keep the reference audio from overwriting the video on disk.
        ref_audio_name = "ref_" + ref_audio_name

    video_path = os.path.join("uploads", video_name)
    ref_audio_path = os.path.join("uploads", ref_audio_name)

    video_file.save(video_path)
    ref_audio_file.save(ref_audio_path)

    # Extract audio from video; always release the clip's file handles.
    audio_file = "extracted_audio.wav"
    video = mp.VideoFileClip(video_path)
    try:
        if video.audio is None:
            return jsonify({"error": "Uploaded video has no audio track."}), 400
        video.audio.write_audiofile(audio_file)
    finally:
        video.close()

    # Transcribe Audio with Whisper
    model = whisper.load_model("small")
    result = model.transcribe(audio_file)
    transcribed_text = result['text']
    print("Transcribed Text:", transcribed_text)
    if not transcribed_text.strip():
        return jsonify({"error": "No speech could be transcribed from the video."}), 422

    # Translate Text to Hindi using NLLB-200 model
    model_name = "facebook/nllb-200-distilled-600M"
    src_lang = "eng_Latn"
    tgt_lang = "hin_Deva"

    # The source language is configured on the tokenizer and the target
    # language is selected via forced_bos_token_id; the language codes must
    # not be pasted into the text to translate.
    tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=src_lang)
    translation_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    inputs = tokenizer(transcribed_text, return_tensors="pt", padding=True)
    translated_ids = translation_model.generate(
        **inputs,
        # convert_tokens_to_ids replaces tokenizer.lang_code_to_id, which is
        # not available in newer transformers releases.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
    )
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    print("Translated Text:", translated_text)

    # Convert Text to Speech in Hindi
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
    tts_audio_path = "translated_audio.mp3"
    tts.tts_to_file(translated_text,
                    file_path=tts_audio_path,
                    speaker_wav=ref_audio_path,  # Use uploaded reference audio file
                    language="hi")

    # Drop references to the models and intermediates before launching Wav2Lip
    # so their memory can be reclaimed.
    del model, result, transcribed_text, inputs, translated_ids
    del tokenizer, translation_model, tts
    gc.collect()

    result_video_path = "results/result_voice.mp4"
    # Remove any output left over from a previous request; otherwise a failed
    # run would silently return the old video.
    try:
        os.remove(result_video_path)
    except FileNotFoundError:
        pass

    completed = subprocess.run([
        sys.executable, "inference.py",  # same interpreter/environment as this server
        "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
        "--face", video_path,
        "--audio", tts_audio_path,
        "--outfile", result_video_path,
        "--wav2lip_batch_size", "1"
    ])

    # Check that inference succeeded and the output file is created
    if completed.returncode != 0 or not os.path.exists(result_video_path):
        return jsonify({"error": "Lip-sync failed, result video not created."}), 500

    # Return the lip-synced video. An absolute path avoids any dependence on
    # how Flask resolves relative paths.
    return send_file(os.path.abspath(result_video_path), as_attachment=True)

if __name__ == "__main__":
    # Open an ngrok tunnel to the local port, report the public URL, then
    # start the Flask development server (blocks until it is stopped).
    port = 5000
    public_url = ngrok.connect(port)
    print(f' * ngrok tunnel "{public_url}" -> "http://127.0.0.1:{port}/"')
    app.run(port=port)


In [None]:
# List the contents of the current working directory (IPython `%ls` magic).
%ls