In [12]:
# Task 5a
import pandas as pd
from jiwer import wer, cer 
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2FeatureExtractor, Wav2Vec2CTCTokenizer
import torch
import os

In [None]:
# Acoustic model: weights are read from the local ../asr-train checkpoint directory.
finetuned_model = Wav2Vec2ForCTC.from_pretrained(
    "../asr-train/wav2vec2-large-960h-cv"
)

# The processor is assembled by hand from two parts:
#   * a feature extractor configured for 16 kHz, single-feature, normalized input
#   * the CTC tokenizer published with facebook/wav2vec2-large-960h
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-large-960h")
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
def get_prediction(model, processor, audio_path):
    """Transcribe a single audio file with a CTC model using greedy decoding.

    Args:
        model: Callable returning an object with a ``.logits`` tensor when
            given the processor's ``input_values`` (e.g. ``Wav2Vec2ForCTC``).
        processor: Callable that turns a raw waveform into ``input_values``
            and exposes ``decode`` to map token ids back to text
            (e.g. ``Wav2Vec2Processor``).
        audio_path: Path of the audio file to load with ``torchaudio.load``.

    Returns:
        The decoded transcription string for the file.

    Notes:
        The file's own sampling rate is forwarded to the processor unchanged;
        no resampling is done here, so the audio is presumably expected to be
        stored at the rate the processor was configured for.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # torchaudio returns (channels, frames). Collapse to a 1-D mono signal so
    # the processor sees exactly one utterance; for mono files the mean over
    # the single channel leaves the samples unchanged.
    mono_waveform = waveform.mean(dim=0)

    input_values = processor(
        mono_waveform.numpy(),
        sampling_rate=sample_rate,
        return_tensors="pt",
    ).input_values

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy CTC decoding: most likely token per frame, then collapse to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription


In [13]:
# Load the cv-valid-dev metadata table
cv_valid_dev_df = pd.read_csv("../data/common_voice/cv-valid-dev.csv")

# Derive each clip's location under the resampled-audio directory from its filename
cv_valid_dev_df['file_path'] = [
    os.path.join("../data/resampled_audio/", clip_name)
    for clip_name in cv_valid_dev_df['filename']
]
cv_valid_dev_df

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,file_path
0,cv-valid-dev/sample-000000.mp3,be careful with your prognostications said the...,1,0,,,,,../data/resampled_audio/cv-valid-dev/sample-00...
1,cv-valid-dev/sample-000001.mp3,then why should they be surprised when they se...,2,0,,,,,../data/resampled_audio/cv-valid-dev/sample-00...
2,cv-valid-dev/sample-000002.mp3,a young arab also loaded down with baggage ent...,2,0,,,,,../data/resampled_audio/cv-valid-dev/sample-00...
3,cv-valid-dev/sample-000003.mp3,i thought that everything i owned would be des...,3,0,,,,,../data/resampled_audio/cv-valid-dev/sample-00...
4,cv-valid-dev/sample-000004.mp3,he moved about invisible but everyone could he...,1,0,fourties,female,england,,../data/resampled_audio/cv-valid-dev/sample-00...
...,...,...,...,...,...,...,...,...,...
4071,cv-valid-dev/sample-004071.mp3,but they could never have taught him arabic,2,1,,,,,../data/resampled_audio/cv-valid-dev/sample-00...
4072,cv-valid-dev/sample-004072.mp3,he decided to concentrate on more practical ma...,1,0,,,,,../data/resampled_audio/cv-valid-dev/sample-00...
4073,cv-valid-dev/sample-004073.mp3,that's what i'm not supposed to say,2,0,thirties,male,us,,../data/resampled_audio/cv-valid-dev/sample-00...
4074,cv-valid-dev/sample-004074.mp3,just handling them made him feel better,3,0,,,,,../data/resampled_audio/cv-valid-dev/sample-00...


In [25]:
# Keep only the columns used downstream and upper-case the `text` column
# (presumably to match the casing of the model's transcriptions).
cv_valid_dev_df_subset = (
    cv_valid_dev_df
    .loc[:, ['filename', 'file_path', 'text']]
    .assign(text=lambda frame: frame['text'].str.upper())
)
cv_valid_dev_df_subset

Unnamed: 0,filename,file_path,text
0,cv-valid-dev/sample-000000.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...
1,cv-valid-dev/sample-000001.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...
2,cv-valid-dev/sample-000002.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...
3,cv-valid-dev/sample-000003.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,I THOUGHT THAT EVERYTHING I OWNED WOULD BE DES...
4,cv-valid-dev/sample-000004.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HE...
...,...,...,...
4071,cv-valid-dev/sample-004071.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,BUT THEY COULD NEVER HAVE TAUGHT HIM ARABIC
4072,cv-valid-dev/sample-004072.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,HE DECIDED TO CONCENTRATE ON MORE PRACTICAL MA...
4073,cv-valid-dev/sample-004073.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,THAT'S WHAT I'M NOT SUPPOSED TO SAY
4074,cv-valid-dev/sample-004074.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,JUST HANDLING THEM MADE HIM FEEL BETTER


In [26]:
# Transcribe every clip, one get_prediction call per row, and store the result
# alongside the original text.
cv_valid_dev_df_subset['predicted_text'] = [
    get_prediction(finetuned_model, processor, clip_path)
    for clip_path in cv_valid_dev_df_subset['file_path']
]
cv_valid_dev_df_subset

Unnamed: 0,filename,file_path,text,predicted_text
0,cv-valid-dev/sample-000000.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,BE CAREFUL WITH YOUR PROGNOSTICATIONS SAID THE...,BE CAREFUL WITH YOUR PROGNOSTIGATIONS SAID THE...
1,cv-valid-dev/sample-000001.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,THEN WHY SHOULD THEY BE SURPRISED WHEN THEY SE...,THEN WHY SHOULD THEY BE SURPRISED WHEN THE CEI...
2,cv-valid-dev/sample-000002.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...,A YOUNG ARAB ALSO LOADED DOWN WITH BAGGAGE ENT...
3,cv-valid-dev/sample-000003.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,I THOUGHT THAT EVERYTHING I OWNED WOULD BE DES...,I THOUGHT THAT EVERYTHING I OWNED WOULD BE DES...
4,cv-valid-dev/sample-000004.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HE...,HE MOVED ABOUT INVISIBLE BUT EVERYONE COULD HE...
...,...,...,...,...
4071,cv-valid-dev/sample-004071.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,BUT THEY COULD NEVER HAVE TAUGHT HIM ARABIC,BUT THEY COULD NEVER HAVE TAUGHT HIM ARABIC
4072,cv-valid-dev/sample-004072.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,HE DECIDED TO CONCENTRATE ON MORE PRACTICAL MA...,HE DECIDED TO CONCENTRATE ON MORE PRACTICAL MA...
4073,cv-valid-dev/sample-004073.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,THAT'S WHAT I'M NOT SUPPOSED TO SAY,THAT'S WHAT I'M NOT SUPPOSED TO SAY
4074,cv-valid-dev/sample-004074.mp3,../data/resampled_audio/cv-valid-dev/sample-00...,JUST HANDLING THEM MADE HIM FEEL BETTER,JUST HANDELING PI MADE HIM FEEL BETTER


In [27]:
# Phrases to search for in the predicted transcriptions
hotwords = [
    "be careful",
    "destroy",
    "stranger",
]

# Function to check if a given text contains any of the hotwords
def contains_hotword(text, hotwords):
    """Return True if `text` contains at least one hotword as a substring.

    Matching is case-insensitive: both `text` and every hotword are
    lower-cased before comparison, so hotwords may be given in any casing.
    The check is a plain substring test, so "destroy" also matches
    "destroyed".

    Args:
        text: String to search in.
        hotwords: Iterable of hotword/phrase strings.

    Returns:
        True if any hotword occurs in `text`, otherwise False (including
        when `hotwords` is empty).
    """
    text = text.lower()
    # Lower-case the hotword too; otherwise a capitalised hotword could never
    # match the lower-cased text.
    return any(hw.lower() in text for hw in hotwords)

# Boolean mask: which predicted transcriptions mention at least one hotword
hotword_mask = cv_valid_dev_df_subset["predicted_text"].apply(
    lambda predicted: contains_hotword(predicted, hotwords)
)
detected_files = cv_valid_dev_df_subset.loc[hotword_mask, "filename"].tolist()
print(detected_files)

# Persist the matching filenames, one per line
with open("detected.txt", "w") as f:
    f.writelines(f"{filename}\n" for filename in detected_files)

['cv-valid-dev/sample-000000.mp3', 'cv-valid-dev/sample-000003.mp3', 'cv-valid-dev/sample-000089.mp3', 'cv-valid-dev/sample-000508.mp3', 'cv-valid-dev/sample-000674.mp3', 'cv-valid-dev/sample-001093.mp3', 'cv-valid-dev/sample-001101.mp3', 'cv-valid-dev/sample-001243.mp3', 'cv-valid-dev/sample-001501.mp3', 'cv-valid-dev/sample-001933.mp3', 'cv-valid-dev/sample-002453.mp3', 'cv-valid-dev/sample-003065.mp3', 'cv-valid-dev/sample-003219.mp3', 'cv-valid-dev/sample-003808.mp3']
