In [None]:
!pip install transformers datasets accelerate torchaudio moviepy speechbrain --quiet
!apt install ffmpeg --quiet
!pip install yt_dlp --quiet

Reading package lists...
Building dependency tree...
Reading state information...
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.


In [None]:
# !pip install accelerate

In [6]:
# !ffmpeg -i /content/audios/videoplayback.mp4 -q:a 0 -map a /content/audios/videoplayback.mp3

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
import os
import torch
import yt_dlp,requests
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
from speechbrain.pretrained import SpeakerRecognition

# Set up Whisper model (Hugging Face version)
device = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 8

tokenizer = AutoProcessor.from_pretrained("openai/whisper-large-v3")
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    "openai/whisper-large-v3",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer.tokenizer,
    feature_extractor=tokenizer.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=BATCH_SIZE,
    return_timestamps=False,
    torch_dtype=torch.float16,
    device=device,
)

# Download or use local file
# def extract_audio_from_video(video_path, output_path="audio.wav"):
#     try:
#         clip = VideoFileClip(video_path)
#         clip.audio.write_audiofile(output_path)
#         return output_path
#     except Exception as e:
#         return '/content/audios/videoplayback.mp3'
def extract_audio_from_video(url):
    """Downloads video from URL and extracts audio to 'audio.wav'."""
    if "youtube.com" in url or "youtu.be" in url:
        # Use yt-dlp to get audio
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
            'outtmpl': 'audio.%(ext)s'
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return "audio.wav"
    elif url.lower().endswith((".mp4", ".m4v")):
        # Download file and extract audio via ffmpeg
        local_file = "temp_video.mp4"
        r = requests.get(url)
        with open(local_file, "wb") as f:
            f.write(r.content)
        # Extract audio (mono 16kHz) – requires ffmpeg
        os.system(f"ffmpeg -y -i {local_file} -vn -acodec pcm_s16le -ar 16000 -ac 1 audio.wav")
        return "audio.wav"
    else:
        raise ValueError("Unsupported URL format or host.")
# Transcribe + Compare accent
def detect_accent(audio_path):
    result = whisper_pipe(audio_path)
    print("Transcript:", result['text'])

    classifier = SpeakerRecognition.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir="pretrained_models/spkrec-ecapa-voxceleb"
    )

    # Reference accent files (must be uploaded) uncomment the ones you want to use
    accents = {
    "Afrikaans": "/content/audios/afrikaans1.mp3",
    "Agni": "/content/audios/agni1.mp3",
    "Armenian": "/content/audios/armenian1.mp3",
    # "Bengali": "/content/audios/bengali11.mp3",
    # "Bulgarian": "/content/audios/bulgarian1.mp3",
    # "Czech": "/content/audios/czech1.mp3",
    # "Dutch": "/content/audios/dutch1.mp3",
    # "English": "/content/audios/english1.mp3",
    # "Farsi": "/content/audios/farsi5.mp3",
    # "French": "/content/audios/french10.mp3",
    # "German": "/content/audios/german1.mp3",
    # "Greek": "/content/audios/greek1.mp3",
    # "Hindi": "/content/audios/hindi2.mp3",
    # "Icelandic": "/content/audios/icelandic1.mp3",
    # "Japanese": "/content/audios/japanese11.mp3",
    # "Korean": "/content/audios/korean25.mp3",
    # "Mandarin": "/content/audios/mandarin2.mp3",
    # "Polish": "/content/audios/polish2.mp3",
    # "Portuguese": "/content/audios/portuguese17.mp3",
    # "Romanian": "/content/audios/romanian19.mp3",
    # "Russian": "/content/audios/russian2.mp3",
    # "Slovak": "/content/audios/slovak1.mp3",
    # "Spanish": "/content/audios/spanish111.mp3",
    # "Swedish": "/content/audios/swedish1.mp3",
    # "Tswana": "/content/audios/tswana2.mp3",
    # "Turkish": "/content/audios/turkish1.mp3",
    # "Urdu": "/content/audios/urdu1.mp3",
    # "Uyghur": "/content/audios/uyghur1.mp3",
    # "Vietnamese": "/content/audios/vietnamese1.mp3",
    # "Yoruba": "/content/audios/yoruba1.mp3",
    "British" : "/content/audios/5_min_british_sound.mp3"
}


    scores = {}
    for accent, ref_file in accents.items():
        if not os.path.exists(ref_file):
            print(f"Missing reference file: {ref_file}")
            continue
        score, _ = classifier.verify_files(ref_file, audio_path)
        scores[accent] = float(score)

    if not scores:
        return {"error": "No reference files found."}

    best_accent = max(scores, key=scores.get)
    confidence = round(scores[best_accent] * 100, 2)
    return {
        "accent": best_accent,
        "confidence": f"{confidence}%",
        "summary": f"The speaker most closely matches a {best_accent} accent with a confidence score of {confidence}%."
    }

# Main logic
def analyze_accent_from_video(video_path):
    audio_path = extract_audio_from_video(video_path)
    result = detect_accent(audio_path)
    os.remove(audio_path)
    return result

video ="https://youtu.be/BVgnIYSKUjY?si=Mcmvamf-jqI3vvh2" # "https://youtu.be/1i9kcBHX2Nw?si=AR6b7mGwul8mNWNa"  # Upload your own short 5–20 sec .mp4 file
result = analyze_accent_from_video(video)
print(result)


Device set to use cuda



[youtube] Extracting URL: https://youtu.be/BVgnIYSKUjY?si=Mcmvamf-jqI3vvh2
[youtube] BVgnIYSKUjY: Downloading webpage
[youtube] BVgnIYSKUjY: Downloading tv client config
[youtube] BVgnIYSKUjY: Downloading tv player API JSON
[youtube] BVgnIYSKUjY: Downloading ios player API JSON
[youtube] BVgnIYSKUjY: Downloading m3u8 information
[info] BVgnIYSKUjY: Downloading 1 format(s): 251
[download] Destination: audio.webm
[download] 100% of    5.89MiB in 00:00:00 at 12.22MiB/s  
[ExtractAudio] Destination: audio.wav
Deleting original file audio.webm (pass -k to keep)



INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Transcript:  I will unite our country, not with words, but with action. I will work day in and day out to deliver for you. This government will have integrity, professionalism and accountability at every level. Trust is earned and I will earn yours. Good morning. I've just been to Buckingham Palace and accepted His Majesty the King's invitation to form a government in his name. It is only right to explain why I am standing here as your new Prime Minister. Right now, our country is facing a profound economic crisis. The aftermath of COVID still lingers. Putin's war in Ukraine has destabilized energy markets and supply chains the world over. I want to pay tribute to my predecessor, Liz Truss. She was not wrong to want to improve growth in this country. It is a noble aim. And I admired her restlessness to create change. But some mistakes were made. Not born of ill will or bad intentions. Quite the opposite, in fact. But mistakes nonetheless. And I have been elected, as leader of my party,

DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-ecapa-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["clas

{'accent': 'British', 'confidence': '14.55%', 'summary': 'The speaker most closely matches a British accent with a confidence score of 14.55%.'}


In [None]:
import os
import yt_dlp
import requests,torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
from speechbrain.pretrained import EncoderClassifier
def download_audio(url):
    """Downloads video from URL and extracts audio to 'audio.wav'."""
    if "youtube.com" in url or "youtu.be" in url:
        # Use yt-dlp to get audio
        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
            'outtmpl': 'audio.%(ext)s'
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return "audio.wav"
    elif url.lower().endswith((".mp4", ".m4v")):
        # Download file and extract audio via ffmpeg
        local_file = "temp_video.mp4"
        r = requests.get(url)
        with open(local_file, "wb") as f:
            f.write(r.content)
        # Extract audio (mono 16kHz) – requires ffmpeg
        os.system(f"ffmpeg -y -i {local_file} -vn -acodec pcm_s16le -ar 16000 -ac 1 audio.wav")
        return "audio.wav"
    else:
        raise ValueError("Unsupported URL format or host.")

def transcribe_audio(audio_path):
    """Transcribes the given audio file using Whisper."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    BATCH_SIZE = 8

    tokenizer = AutoProcessor.from_pretrained("openai/whisper-large-v3")
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        "openai/whisper-large-v3",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    ).to(device)

    whisper_pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=tokenizer.tokenizer,
        feature_extractor=tokenizer.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=BATCH_SIZE,
        return_timestamps=False,
        torch_dtype=torch.float16,
        device=device,
    )
    result = whisper_pipe(audio_path)
    return result["text"]

def classify_accent(audio_path):
    """Classifies accent from audio using a pre-trained SpeechBrain model."""
    # Load the pretrained accent model (only needs to be done once ideally)
    classifier = EncoderClassifier.from_hparams(
        source="Jzuluaga/accent-id-commonaccent_ecapa",
        savedir="pretrained_models/accent-id-commonaccent_ecapa"
    )
    out_prob, score, index, text_lab = classifier.classify_file(audio_path)
    accent = text_lab  # label string, e.g. 'us', 'england', etc.
    # confidence = float(np.max(out_prob) * 100) # This line is causing the error
    confidence = float(torch.max(out_prob).item() * 100)  # Use torch.max() instead
    return accent, confidence

if __name__ == "__main__":
    video_url = input("Enter video URL: ").strip()
    print("Downloading and extracting audio...")
    audio_file = download_audio(video_url)
    print("Transcribing audio...")
    transcript = transcribe_audio(audio_file)
    print("Transcription:", transcript[:100], "...")
    print("Classifying accent...")
    accent, conf = classify_accent(audio_file)
    print(f"Accent: {accent} ({conf:.1f}%)")
    print("Explanation: ", end="")
    if conf > 80:
        print("Strong acoustic cues match typical " + accent + " English speech.")
    else:
        print("Accent classification is uncertain; model confidence is moderate.")


Enter video URL: https://youtu.be/BVgnIYSKUjY?si=Mcmvamf-jqI3vvh2
Downloading and extracting audio...
[youtube] Extracting URL: https://youtu.be/BVgnIYSKUjY?si=Mcmvamf-jqI3vvh2
[youtube] BVgnIYSKUjY: Downloading webpage
[youtube] BVgnIYSKUjY: Downloading tv client config
[youtube] BVgnIYSKUjY: Downloading tv player API JSON
[youtube] BVgnIYSKUjY: Downloading ios player API JSON
[youtube] BVgnIYSKUjY: Downloading m3u8 information
[info] BVgnIYSKUjY: Downloading 1 format(s): 251
[download] Destination: audio.webm
[download] 100% of    5.89MiB in 00:00:00 at 14.07MiB/s  
[ExtractAudio] Destination: audio.wav
Deleting original file audio.webm (pass -k to keep)
Transcribing audio...


Device set to use cuda


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/accent-id-commonaccent_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'Jzuluaga/accent-id-commonaccent_ecapa' if not cached


Transcription:  I will unite our country, not with words, but with action. I will work day in and day out to delive ...
Classifying accent...


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/accent-id-commonaccent_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pretrained_models/accent-id-commonaccent_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/accent-id-commonaccent_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/pretrained_models/accent-id-commonaccent_ecapa/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /content/pretrained_models/accent-id-commonaccent_ecapa/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch accent_encoder.txt: Using symlink found at '/content/pretrained_models/accent-id-commonaccent_ecapa/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path

Accent: ['england'] (72.0%)
Explanation: Accent classification is uncertain; model confidence is moderate.
