<a href="https://colab.research.google.com/github/akshziitj/CSL7770-Major/blob/main/M23CSA503_SU_Major_Q_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Install Required Libraries

In [1]:
# Install the third-party packages used by the rest of the notebook.
# NOTE(review): the recorded output of this cell ends with a numpy
# "dtype size changed ... binary incompatibility" ValueError; presumably the
# runtime has to be restarted after these installs so that the newly installed
# wheels are the ones that get imported -- confirm before relying on Run All.
!pip install openai-whisper TTS
!pip install moviepy
!pip install evaluate
!pip install pesq
!pip install "tortoise-tts>=3.0.0"
# tokenizers is the only package pinned to an exact version here.
!pip install tokenizers==0.13.3
!pip install rotary_embedding_torch

import os
import gdown
import torch
import whisper
import librosa
import gdown
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
from TTS.api import TTS
from evaluate import load
import re
import zipfile
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting TTS
  Downloading TTS-0.22.0-cp311-cp311-manylinux1_x86_64.whl.metadata (21 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.32 (from TTS)
  Dow

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

# Step 2: Download Video from Google Drive

In [None]:
# Google Drive id of the lecture video and the local name it is saved under
file_id = "1CfOgUpI-t3SDJVeYHNpDmYmzOWd6rlTA"
output_name = "lecture_video.mp4"

# Fetch the video through the Drive direct-download URL (progress is shown)
drive_url = f"https://drive.google.com/uc?id={file_id}"
gdown.download(drive_url, output_name, quiet=False)

# Stop here if nothing was written to disk
assert os.path.exists(output_name), "Video download failed!"
print(f"Downloaded video to: {output_name}")


# Step 3: Convert Video into Audio and Create a Baseline Transcription

In [None]:
import moviepy.editor as mp

# Step 1: Define the input video and the audio file extracted from it
video_file = output_name
audio_file = "extracted_audio.wav"

# Step 2: Extract the audio track. The clip keeps the video file open, so it
# is closed in a `finally` block even when the export fails.
clip = mp.VideoFileClip(video_file)
try:
    if clip.audio is None:
        raise ValueError(f"{video_file} has no audio track to extract")
    clip.audio.write_audiofile(audio_file)
finally:
    clip.close()

# Step 3: Load Whisper and transcribe (no language is forced in this pass)
model = whisper.load_model("base")  # "small", "medium" or "large" can be used instead
result = model.transcribe(audio_file)

# Step 4: Display the transcription
print("Transcription:\n", result["text"])

# Step 5: Save the transcription to a text file
transcription_file = "transcription.txt"
with open(transcription_file, "w", encoding="utf-8") as f:
    f.write(result["text"])

print(f"Transcription saved to {transcription_file}")


# Step 4: Transcription Using Whisper (Handles Code-Switching)

In [None]:
# Whisper "base" checkpoint, placed on the device selected earlier
model = whisper.load_model("base", device=device)

# Decode the lecture audio with the language forced to Hindi ("hi"); this is
# the setting the notebook uses for the Hindi-English mix
transcribe_options = {"language": "hi"}
result = model.transcribe(audio_file, **transcribe_options)

# Remove filler words
filler_words = ["um", "uh", "you know", "like", "so"]


def clean_filler_words(text, fillers=None):
    """Strip filler words from ``text`` and tidy the spacing left behind.

    Args:
        text: Transcript text to clean.
        fillers: Optional iterable of filler words/phrases to remove. When
            omitted, the module-level ``filler_words`` list is used.

    Returns:
        The text with every whole-word, case-insensitive occurrence of a
        filler removed (together with a comma that directly follows it),
        whitespace collapsed to single spaces, and no space left in front of
        punctuation.
    """
    words = [w for w in (filler_words if fillers is None else fillers) if w]
    if words:
        # Longest phrases first so a multi-word filler is preferred over a
        # shorter filler that happens to be its prefix.
        words.sort(key=len, reverse=True)
        # re.escape keeps regex metacharacters in a filler from being
        # interpreted as pattern syntax.
        alternation = "|".join(re.escape(w) for w in words)
        # The optional trailing comma avoids leaving ", ," fragments behind.
        pattern = re.compile(rf"\b(?:{alternation})\b\s*,?", flags=re.IGNORECASE)
        text = pattern.sub(" ", text)
        # Re-attach punctuation that is now preceded by a gap.
        text = re.sub(r"\s+([,.;:!?।])", r"\1", text)
    return re.sub(r"\s+", " ", text).strip()

transcription_raw = result["text"]
transcription_cleaned = clean_filler_words(transcription_raw)

# Save cleaned text. The encoding is given explicitly because the file is read
# back as UTF-8 in the evaluation step and the transcript can contain
# non-ASCII characters that a platform-default encoding may not handle.
with open("cleaned_transcription.txt", "w", encoding="utf-8") as f:
    f.write(transcription_cleaned)

print("Cleaned Transcription:\n", transcription_cleaned)


# Step 5: Translate to a Low-Resource Language (e.g., Tamil)

In [None]:
# Load M2M100 model and tokenizer
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
# Rebinds `model`: from here on the name refers to the translation model,
# moved to the selected device.
model = M2M100ForConditionalGeneration.from_pretrained(model_name).to(device)

# Set the source language (the target language is chosen per translation call)
# NOTE(review): the source is declared as English, while the earlier
# transcription call forces language="hi" -- confirm which language the
# cleaned transcript is actually written in.
tokenizer.src_lang = "en"

# Translation function
def _split_into_chunks(text, max_chars):
    """Split text into pieces of roughly max_chars, on sentence then word boundaries."""
    units = []
    for sentence in re.split(r"(?<=[.!?।])\s+", text.strip()):
        if len(sentence) <= max_chars:
            units.append(sentence)
        else:
            # A sentence that is too long on its own (e.g. an unpunctuated
            # transcript) is broken up word by word instead.
            units.extend(sentence.split())

    chunks, current = [], ""
    for unit in units:
        if not unit:
            continue
        if current and len(current) + 1 + len(unit) > max_chars:
            chunks.append(current)
            current = unit
        else:
            current = f"{current} {unit}" if current else unit
    if current:
        chunks.append(current)
    return chunks


def translate_m2m100(text, tokenizer, model, target_lang="ta", max_chunk_chars=400):
    """Translate ``text`` with an M2M100 model, one chunk at a time.

    A single ``generate`` call is capped at 512 tokens, so a long transcript
    passed in one piece comes back truncated. The text is therefore split
    into chunks, each chunk is translated separately, and the results are
    joined with spaces.

    Args:
        text: Source text; the tokenizer's ``src_lang`` must already be set.
        tokenizer: M2M100 tokenizer.
        model: M2M100 generation model; inputs are moved to ``model.device``.
        target_lang: Language code looked up with ``tokenizer.get_lang_id``.
        max_chunk_chars: Approximate upper bound on characters per chunk.

    Returns:
        The translated text, or an empty string for empty/blank input.
    """
    if not text or not text.strip():
        return ""

    target_id = tokenizer.get_lang_id(target_lang)
    translated_chunks = []
    for chunk in _split_into_chunks(text, max_chunk_chars):
        # truncation guards against a single chunk still exceeding the limit
        encoded = tokenizer(
            chunk, return_tensors="pt", truncation=True, max_length=512
        ).to(model.device)
        generated_tokens = model.generate(
            **encoded,
            forced_bos_token_id=target_id,
            max_length=512
        )
        translated_chunks.append(
            tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
        )
    return " ".join(translated_chunks)

# Run the cleaned transcript through M2M100 with Tamil ("ta") as the target
translated_text = translate_m2m100(
    transcription_cleaned,
    tokenizer,
    model,
    target_lang="ta",
)

# Persist the Tamil text as UTF-8
tamil_text_path = "translated_text_tamil.txt"
with open(tamil_text_path, "w", encoding="utf-8") as tamil_file:
    tamil_file.write(translated_text)

# Show the translation
print("Translated to Tamil:\n", translated_text)


# Step 6: TTS - Generate Audio in Your Own Voice (Transfer Learning / Speaker Embedding)

In [None]:
# Generate Tamil speech with Tortoise, conditioned on the speaker's own recordings
folder_id = "1Gb_MoL2qBRInRVALE-Htz0DetXddU4Yk"
my_voice_output_name = "my_voice_id"

# Download the folder of reference recordings
gdown.download_folder(id=folder_id, output=my_voice_output_name, quiet=False)

# Check if download succeeded
assert os.path.exists(my_voice_output_name), "my voice audio download failed!"
print(f"Downloaded audio files to: {my_voice_output_name}")

# Convert .m4a files to .wav (sorted so the processing order is the same on every run)
for filename in sorted(os.listdir(my_voice_output_name)):
    if filename.endswith(".m4a"):
        filepath = os.path.join(my_voice_output_name, filename)
        wav_filepath = os.path.splitext(filepath)[0] + ".wav"
        # Use torchaudio to load and resave as .wav
        audio, sr = torchaudio.load(filepath)
        torchaudio.save(wav_filepath, audio, sr)
        print(f"Converted {filename} to {wav_filepath}")

# Step 1: Load Tortoise model
tts = TextToSpeech()

# Step 2: Load own voice (reference clips).
# Tortoise documents its reference clips as 22.05 kHz waveform tensors, so each
# recording is down-mixed to a single channel and resampled before use instead
# of being passed at whatever rate/channel count it was recorded with.
VOICE_SAMPLE_RATE = 22050
voice_samples = []
for filename in sorted(os.listdir(my_voice_output_name)):
    if not filename.endswith(".wav"):
        continue
    waveform, sample_rate = torchaudio.load(os.path.join(my_voice_output_name, filename))
    if waveform.shape[0] > 1:  # (channels, samples) -> (1, samples)
        waveform = waveform.mean(dim=0, keepdim=True)
    if sample_rate != VOICE_SAMPLE_RATE:
        waveform = torchaudio.functional.resample(waveform, sample_rate, VOICE_SAMPLE_RATE)
    voice_samples.append(waveform)

if not voice_samples:
    raise FileNotFoundError(f"No .wav reference clips found in {my_voice_output_name}")

conditioning_latents = tts.get_conditioning_latents(voice_samples)

# Step 3: Tamil text to synthesize
# NOTE(review): confirm that the Tortoise checkpoints can handle Tamil text and
# that a full translated lecture fits within the model's input length.
text = translated_text  # same content as "translated_text_tamil.txt"

# Step 4: Generate audio in own voice.
# Only the precomputed latents are passed: when voice_samples is also given,
# Tortoise derives the latents from the clips again and the ones computed
# above go unused.
gen_audio = tts.tts_with_preset(text, conditioning_latents=conditioning_latents, preset="fast")

# Step 5: Save output (written with a 24000 Hz sample rate)
torchaudio.save("tamil_tts_own_voice.wav", gen_audio.squeeze(0).cpu(), 24000)
print("TTS audio generated in own voice.")

# Step 7: Evaluation - WER/CER

In [None]:
# Metric objects for word-level and character-level error rates
wer_metric = load("wer")
cer_metric = load("cer")

# Reference text: the transcript stored in transcription.txt
with open("transcription.txt", "r", encoding="utf-8") as ref_file:
    reference = ref_file.read().strip()

# Hypothesis text: the filler-free transcript stored in cleaned_transcription.txt
with open("cleaned_transcription.txt", "r", encoding="utf-8") as hyp_file:
    hypothesis = hyp_file.read().strip()

# Score the single hypothesis/reference pair with both metrics
predictions, references = [hypothesis], [reference]
wer = wer_metric.compute(predictions=predictions, references=references)
cer = cer_metric.compute(predictions=predictions, references=references)

print(f"WER: {wer:.3f}")
print(f"CER: {cer:.3f}")

# Step 8: Evaluation - PESQ/MOS

In [None]:
## PESQ or MOS calculation
import soundfile as sf

# Load both recordings as sample arrays together with their sample rates.
# NOTE(review): the synthesized TTS output is bound to `ref` and the audio
# extracted from the lecture video to `deg` -- confirm this is the intended
# reference/degraded assignment for the PESQ call later in this cell.
ref, sr_ref = sf.read("tamil_tts_own_voice.wav")
deg, sr_deg = sf.read("extracted_audio.wav")

# Resample if necessary and ensure mono
import numpy as np
import scipy.signal

def resample_mono(audio, sr, target_sr=16000):
    """Return ``audio`` as a one-channel float32 array sampled at ``target_sr``.

    Arrays with more than one dimension are averaged over axis 1. The signal
    is polyphase-resampled from ``sr`` to ``target_sr`` only when the two
    rates differ.
    """
    is_multichannel = audio.ndim > 1
    mono = np.mean(audio, axis=1) if is_multichannel else audio
    if sr == target_sr:
        return mono.astype(np.float32)
    resampled = scipy.signal.resample_poly(mono, target_sr, sr)
    return resampled.astype(np.float32)

# The pesq package is installed at the top of the notebook but its scoring
# function is never imported, so bring it into scope before it is called.
from pesq import pesq

# Bring both signals to 16 kHz mono float32
ref = resample_mono(ref, sr_ref)
deg = resample_mono(deg, sr_deg)

# Trim both signals to a common length before scoring
min_len = min(len(ref), len(deg))
if min_len == 0:
    raise ValueError("Cannot compute PESQ: one of the audio signals is empty")
ref = ref[:min_len]
deg = deg[:min_len]

# Wide-band ('wb') mode at a 16000 Hz sample rate
score = pesq(16000, ref, deg, 'wb')
print("PESQ Score:", score)