In [32]:
import csv
import numpy as np
from transformers import pipeline
from datasets import load_dataset
import pandas as pd
from jiwer import wer
from jarowinkler import *
import string


In [3]:
def add_text_to_csv(text, csv_file_path):
    # Open the CSV file for appending
    with open(csv_file_path, mode='a', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        # Write the text to the CSV
        writer.writerow([text])

In [4]:
whisper = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-tiny.en",
    chunk_length_s=30,
)

In [10]:
wav2vec = pipeline(
    "automatic-speech-recognition",
    model="jonatasgrosman/wav2vec2-large-xlsr-53-english",
    chunk_length_s=30,
)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
Y

In [6]:
# Load the FLEURS dataset
fleurs_asr = load_dataset("google/fleurs", "en_us")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [7]:
# Function to transcribe audio
def transcribe(audio, model, output_csv_file):
    audio_data = {
        "raw": np.array(audio["array"]),  # The audio waveform
        "sampling_rate": audio["sampling_rate"]  # The sampling rate of the audio
    }
    english_text = model(audio_data)  # Perform ASR
    add_text_to_csv(english_text["text"], output_csv_file)  # Add the transcription to the CSV

In [8]:
# Directory and output file paths
whisper_output_csv_file = 'asr_outputs/whisper_output.csv'
audio_inputs = fleurs_asr["train"][:50]["audio"]

print('Whisper...')
for audio in audio_inputs:
    transcribe(audio, whisper, whisper_output_csv_file)


Whisper...


In [11]:
wave2vec_output_csv_file = 'asr_outputs/wave2vec_output.csv'
audio_inputs = fleurs_asr["train"][:50]["audio"]

print('Wave2vec..')
for audio in audio_inputs:
    transcribe(audio, wav2vec, wave2vec_output_csv_file)

Wave2vec..


In [17]:
original_text = fleurs_asr["train"][:50]["transcription"]
original_df = pd.DataFrame(original_text, columns=['Original_Transcription'])
original_df

Unnamed: 0,Original_Transcription
0,a tornado is a spinning column of very low-pre...
1,former u.s. speaker of the house newt gingrich...
2,the island was first inhabited by the taínos a...
3,these nerve impulses can be sent so quickly th...
4,on september 24 1759 arthur guinness signed a ...
5,today timbuktu is an impoverished town althoug...
6,with the same time zone as hawaii the islands ...
7,hokuriku electric power co reported no effects...
8,massa is due to be out for at least the rest o...
9,pittman suggested that conditions wouldn't imp...


In [27]:
whisper_text = pd.read_csv('asr_outputs/whisper_output.csv', header=None, names=['Whisper_Transcription'])
whisper_text

Unnamed: 0,Whisper_Transcription
0,A tornado is a spinning column of very low pr...
1,"Former US speaker of the house, Newt Gingrich..."
2,The island was first inhabited by the Tianos ...
3,This nerve empulses can be sent so quickly th...
4,On September 24th 1759 Arthur Guinness signed...
5,"Today, Timbuktu is an impoverished town, alth..."
6,"with the same time zone as Hawaii, the island..."
7,Her crookie electric parkour reported no effe...
8,Masa is due to be out of for at least the res...
9,Pittman suggested that conditions wouldn't im...


In [28]:
wave2vec_text = pd.read_csv('asr_outputs/wave2vec_output.csv', header=None, names=['Wave2Vec_Transcription'])
wave2vec_text

Unnamed: 0,Wave2Vec_Transcription
0,a tornado is a spinning column of very low-pre...
1,former ewsh speaker of the house neut gingrich...
2,the island was first inhabited by tetianos and...
3,these nerve impulses can be sent so quickly th...
4,on september twenty-fourth seventeen fifty nin...
5,today timbactu is an impoverished town althoug...
6,with the same time zown as hawai the islands a...
7,hokroky electric parko reported no effect from...
8,massa is due to be out for at least the rest o...
9,pitman suggested that conditions wuldn't impro...


In [33]:
asr_analysis_df = pd.concat([original_df, whisper_text, wave2vec_text], axis=1)
# Remove punctuation and apply lower() to all text columns
asr_analysis_df = asr_analysis_df.applymap(lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
asr_analysis_df

  asr_analysis_df = asr_analysis_df.applymap(lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)


Unnamed: 0,Original_Transcription,Whisper_Transcription,Wave2Vec_Transcription
0,a tornado is a spinning column of very lowpres...,a tornado is a spinning column of very low pr...,a tornado is a spinning column of very lowpres...
1,former us speaker of the house newt gingrich c...,former us speaker of the house newt gingrich ...,former ewsh speaker of the house neut gingrich...
2,the island was first inhabited by the taínos a...,the island was first inhabited by the tianos ...,the island was first inhabited by tetianos and...
3,these nerve impulses can be sent so quickly th...,this nerve empulses can be sent so quickly th...,these nerve impulses can be sent so quickly th...
4,on september 24 1759 arthur guinness signed a ...,on september 24th 1759 arthur guinness signed...,on september twentyfourth seventeen fifty nine...
5,today timbuktu is an impoverished town althoug...,today timbuktu is an impoverished town althou...,today timbactu is an impoverished town althoug...
6,with the same time zone as hawaii the islands ...,with the same time zone as hawaii the island ...,with the same time zown as hawai the islands a...
7,hokuriku electric power co reported no effects...,her crookie electric parkour reported no effe...,hokroky electric parko reported no effect from...
8,massa is due to be out for at least the rest o...,masa is due to be out of for at least the res...,massa is due to be out for at least the rest o...
9,pittman suggested that conditions wouldnt impr...,pittman suggested that conditions wouldnt imp...,pitman suggested that conditions wuldnt improv...


In [34]:
asr_analysis_df['whisper_WER'] = asr_analysis_df.apply(lambda row: wer(row['Original_Transcription'], row['Whisper_Transcription']), axis=1)


mean_wer = asr_analysis_df['whisper_WER'].mean()

print(f"Mean WER: {mean_wer}")

Mean WER: 0.10568459229932642


In [35]:
asr_analysis_df['wave2vec_WER'] = asr_analysis_df.apply(lambda row: wer(row['Original_Transcription'], row['Wave2Vec_Transcription']), axis=1)


mean_wer = asr_analysis_df['wave2vec_WER'].mean()

print(f"Mean WER: {mean_wer}")

Mean WER: 0.1553863718076332


In [36]:
# Calculate JW Distance
asr_analysis_df['whisper_JW_Distance'] = asr_analysis_df.apply(lambda row: jarowinkler_similarity(row['Original_Transcription'], row['Whisper_Transcription']), axis=1)

mean_jw = asr_analysis_df['whisper_JW_Distance'].mean()

print(f"Mean Jaro Winkler Distance: {mean_jw}")

Mean Jaro Winkler Distance: 0.8480267675606266


In [37]:
# Calculate JW Distance
asr_analysis_df['Wave2Vec_JW_Distance'] = asr_analysis_df.apply(lambda row: jarowinkler_similarity(row['Original_Transcription'], row['Wave2Vec_Transcription']), axis=1)

mean_jw = asr_analysis_df['Wave2Vec_JW_Distance'].mean()

print(f"Mean Jaro Winkler Distance: {mean_jw}")

Mean Jaro Winkler Distance: 0.9277293899260606
