In [1]:
# importing packages 
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import os
from tqdm import tqdm

2024-06-05 12:02:49.321666: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
model_id = "openai/whisper-medium"    # define the model
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id)

processor = AutoProcessor.from_pretrained(model_id)

# create the pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
songs = [file for file in os.listdir("../data/songs") if file.endswith(".mp3")]

for song in tqdm(songs):
    name = ''.join((song).split('.')[:-1])
    if not os.path.exists(f"../data/lyrics-hyp/{name}.txt"):
        hypothesis_lyrics = pipe(f"../data/songs/{song}", return_timestamps=False, generate_kwargs={"language": "fr"})
        hypothesis_lyrics = hypothesis_lyrics["text"]

        name = ''.join((song).split('.')[:-1])

        with open(f"../data/lyrics-hyp/{name}.txt", "w") as file:
            file.write(hypothesis_lyrics)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 146/146 [2:26:56<00:00, 60.39s/it]


In [27]:
import jiwer


transforms = jiwer.Compose(
    [
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.RemoveEmptyStrings(),
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
        jiwer.RemovePunctuation(),
        jiwer.ReduceToListOfListOfWords(),
    ]
)

In [30]:
true = "Tombe la neige Tu ne viendras pas ce soir Tombe la neige Et mon coeur s'habille de noir Ce soyeux cortege Tout en larmes blanches L'oiseau sur la branche Pleure le sortilege Tu ne viendras pas ce soir Me crie mon désespoir Mais tombe la neige Impassible manege Tombe la neige Tu ne viendras pas ce soir Tombe la neige Tout est blanc de désespoir Triste certitude Le froid et l'absence Cet odieux silence Blanche solitude Tu ne viendras pas ce soir Me crie mon désespoir Mais tombe la neige Impassible manege"

In [31]:
wer = jiwer.wer(
                true,
                result,
                truth_transform=transforms,
                hypothesis_transform=transforms,
            )
print(f"Word Error Rate (WER) :", wer)

Word Error Rate (WER) : 0.1956521739130435
