In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pydub import AudioSegment
from pydub.playback import play
import os
from pydub import AudioSegment
from dotenv import load_dotenv
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
import soundfile as sf
from openai import OpenAI
import jiwer
import string
import re
from bert_score import BERTScorer

# Manual Transcription Code

In [None]:
# Load the clip-level label table. The loops below address rows as
# labels_df.loc[i, ...] with i in range(n), i.e. they rely on the default
# 0..n-1 index that read_csv produces here.
labels_df = pd.read_csv("<path to csv file>")


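While the transcription loop below is running, you can type one of these commands instead of a transcript:
- `REDO` plays the same clip again so you can re-listen before transcribing it.
- `EXIT` stops the loop. The CSV is saved after every transcribed clip, so you can always resume from where you left off.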

In [None]:
# Directory that holds the audio clips. It is kept in a single variable so the
# directory listing and the creation-time lookup always refer to the same
# folder (the two placeholders previously differed by a missing ">").
clip_dir = "<path to audio clips>"

files = os.listdir(clip_dir)

# Order the clips by filesystem creation time.
# NOTE(review): the transcription loop pairs the i-th clip of this ordering
# with row i of labels_df — confirm that creation order matches the CSV order.
files_sorted = sorted(
    files,
    key=lambda clip_name: os.path.getctime(os.path.join(clip_dir, clip_name)),
)


In [None]:
# Interactive manual transcription.
#
# For every clip that is not yet marked as done: play it, ask for a transcript
# and persist the answer immediately (CSV + a line in clips_done.txt).
#   REDO -> play the same clip again
#   EXIT -> stop; already transcribed clips are skipped on the next run
#
# An index-driven `while` loop is used on purpose: with `for clip in
# files_sorted` the iterator moved on to the next clip after REDO while the row
# index stayed put, so REDO never replayed the clip and every later transcript
# was written to the wrong row.
clips_root = "<path to audio clips>"
foundloc = './clips_done.txt'

# Never index past the label table if the folder holds extra files.
n_clips = min(len(files_sorted), labels_df.shape[0])

i = 0
while i < n_clips:
    clip = files_sorted[i]
    print(clip)

    if labels_df.loc[i, "manual_transcription_done"]:
        i += 1
        continue

    # os.path.join works whether or not the directory ends with a separator.
    play(AudioSegment.from_file(os.path.join(clips_root, clip)))
    ground_truth = input('Enter transcript: ')

    if ground_truth == "REDO":
        # Leave i untouched: the next iteration plays the same clip again.
        continue
    if ground_truth == "EXIT":
        break

    labels_df.loc[i, "manual_annotation"] = ground_truth
    labels_df.loc[i, "manual_transcription_done"] = True
    # Save after every clip so an interrupted session loses nothing.
    labels_df.to_csv("<path to csv>", index=False)
    with open(foundloc, 'a') as fp:
        fp.write(str(labels_df.loc[i, "absolute_index"]) + "\n")
    i += 1


# Whisper2 Inference with API

In [None]:
# Load variables from a .env file into the process environment; the OpenAI
# client cell below reads OPENAI_API_KEY through os.getenv.
load_dotenv()

In [None]:
def manual_get_audio_clip(show,episode_id,clip_id):
    """Build the path of one audio clip from its show, episode and clip ids.

    The result has the form
    ``<path to clips><show>/<episode_id>/<show>_<episode_id>_<clip_id>.wav``;
    every argument is converted with ``str`` first. No filesystem access is
    performed, so the returned path is not checked for existence.
    """
    show_part, episode_part, clip_part = str(show), str(episode_id), str(clip_id)
    file_stem = "_".join((show_part, episode_part, clip_part))
    relative_path = "/".join((show_part, episode_part, file_stem))
    return "<path to clips>" + relative_path + ".wav"


def auto_get_audio_clip(absolute_id):
    """Return the audio path of the clip whose ``absolute_index`` equals ``absolute_id``.

    Looks up the ``Show``, ``EpId`` and ``ClipId`` values of the first matching
    row in the module-level ``labels_df`` and delegates path construction to
    ``manual_get_audio_clip``. Raises ``IndexError`` when no row matches.
    """
    row_mask = labels_df["absolute_index"] == absolute_id
    show, episode_id, clip_id = (
        labels_df.loc[row_mask, column].values[0]
        for column in ("Show", "EpId", "ClipId")
    )
    return manual_get_audio_clip(show, episode_id, clip_id)

In [None]:
# Build the OpenAI API client. The key is taken from the OPENAI_API_KEY
# environment variable rather than being hard-coded in the notebook.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [None]:
# Smoke test: transcribe one clip (absolute_index 24106) through the API.
# The `with` block guarantees the file handle is closed once the request
# returns; the original left it open.
with open(auto_get_audio_clip(24106), "rb") as audio_file:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
    )

In [None]:
# Transcribe every clip that has a ground-truth annotation with the hosted
# "whisper-1" model. Rows flagged Whisper2_done == 1 are skipped, and the CSV
# is rewritten after each clip, so the loop can be interrupted and resumed.
for i in range(labels_df.shape[0]):
  # Rows without a string ground truth (e.g. NaN) are not evaluated.
  if not isinstance(labels_df.loc[i, 'annotated_groundTruth'], str):
    continue
  if labels_df.loc[i, 'Whisper2_done'] == 1:
    continue
  print(i, " : ", labels_df.loc[i, 'absolute_index'])
  try:
    # `with` closes the audio file even when the API call fails.
    with open(auto_get_audio_clip(labels_df.loc[i, 'absolute_index']), "rb") as audio_file:
      transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file
      )
  except Exception as err:
    # Best effort: report the failure and move on. `Exception` (not a bare
    # `except`) keeps Ctrl-C / kernel interrupt able to stop the loop.
    print("  skipped: ", repr(err))
    continue
  result = transcript.text
  labels_df.loc[i, 'Whisper2Annotation'] = result.strip()
  labels_df.loc[i, 'Whisper2_done'] = 1
  labels_df.to_csv("<path to csv>", index=False)



# Whisper 3 inference with Huggingface

In [None]:
# All three objects are built from the same Hugging Face checkpoint.
# NOTE(review): the inference loop in the next cell only uses `processor` and
# `model`; `pipe` is not referenced in the visible part of the notebook.
whisper3_checkpoint = "openai/whisper-large-v3"

pipe = pipeline("automatic-speech-recognition", model=whisper3_checkpoint)
processor = AutoProcessor.from_pretrained(whisper3_checkpoint)
model = AutoModelForSpeechSeq2Seq.from_pretrained(whisper3_checkpoint)

In [None]:
# Transcribe every annotated clip with the local Whisper large-v3 model.
# Progress is written to the CSV after each clip so the loop can be resumed.
for i in range(labels_df.shape[0]):
  # Rows without a string ground truth (e.g. NaN) are not evaluated.
  if not isinstance(labels_df.loc[i, 'annotated_groundTruth'], str):
    continue
  # Resume support: skip rows already flagged as done. The original compared
  # the transcript column 'Whisper3Annotation' with 1, which is never true, so
  # finished clips were transcribed again on every run. The flag column only
  # exists once the first clip has been processed, hence the membership test.
  if 'whisper3_done' in labels_df.columns and labels_df.loc[i, 'whisper3_done'] == 1:
    continue
  print(i, " : ", labels_df.loc[i, 'absolute_index'])
  try:
    file_path = auto_get_audio_clip(labels_df.loc[i, 'absolute_index'])
    waveform, sample_rate = sf.read(file_path)
    if sample_rate != 16000:
      # The feature extractor is fed 16 kHz audio; log which clip is resampled.
      print(labels_df.loc[i, 'absolute_index'])
      # NOTE(review): `librosa` is not imported in the visible import cell, so
      # this line raises NameError until `import librosa` is added there. The
      # handler below now reports it instead of silently skipping the clip.
      waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=16000)
      sample_rate = 16000
    inputs = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    generated_ids = model.generate(inputs=inputs.input_features, language="en")
    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
  except Exception as err:
    # Best effort: report the failure and move on. `Exception` (not a bare
    # `except`) keeps Ctrl-C / kernel interrupt able to stop the loop.
    print("  skipped: ", repr(err))
    continue
  result = transcription
  labels_df.loc[i, 'Whisper3Annotation'] = result.strip()
  labels_df.loc[i, 'whisper3_done'] = 1
  labels_df.to_csv("<csv file path>", index=False)



# Whisper 2 Metric Calculation

In [None]:
# Working copy of the label table for the Whisper 2 metrics; the cells below
# add the WER / BLEU / BERTScore columns to this frame.
whisper2_metric_df=pd.read_csv("<path to label_df csv file>")

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    All other characters, including whitespace, are kept unchanged.
    """
    # A translation table that maps a code point to None deletes that character.
    deletion_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(deletion_table)

In [None]:
def remove_interjection(text):
    """Drop filler interjections (tokens tagged with ``/i``) from ``text``.

    The text is split on whitespace and scanned once, left to right:

    * a token containing ``/i`` together with one of the filler markers
      (``uh/i``, ``um/i``, ``like/i``, ``mhm/i``, ``eh/i``, ``mm``) is removed;
    * a ``you know/i`` pair (previous kept token + current token spelling
      ``youknow/i``) is removed as a unit;
    * any other ``/i`` token is kept and reported with a ``standoutinterject``
      debug print.

    Returns the remaining tokens joined by single spaces, ``" "`` when every
    token was removed, and ``""`` for empty or whitespace-only input.

    The original restarted the scan from index 0 after each removal and, at
    index 0, compared against ``text_list[-1]``; an input such as
    ``"know/i a you"`` therefore grew the list forever. The single pass below
    only looks at the token that really precedes the current one.
    """
    filler_markers = ("uh/i", "um/i", "like/i", "mhm/i", "eh/i", "mm")

    tokens = text.split()
    if not tokens:
        return ""

    kept = []
    for position, token in enumerate(tokens):
        if "/i" not in token:
            kept.append(token)
            continue
        if any(marker in token for marker in filler_markers):
            continue
        if kept and kept[-1] + token == "youknow/i":
            # Remove "you" together with "know/i".
            kept.pop()
            continue
        # Unknown interjection: keep it, but surface it for manual review.
        # Printed is the token list as it stands at this point of the scan.
        print("standoutinterject: ", kept + tokens[position:])
        kept.append(token)

    # Callers only ever strip() the result; " " is kept for compatibility.
    return " ".join(kept) if kept else " "


In [None]:
def clean_ground_truth(text, method='literal'):
  """Strip annotation markup from a ground-truth transcript.

  Parameters
  ----------
  text : str
      Annotated transcript.
  method : str
      ``'semantic'`` additionally deletes every ``[...]`` span (brackets and
      content) and removes filler interjections via ``remove_interjection``
      before the common clean-up. Any other value, including the default
      ``'literal'``, only runs the common clean-up.

  Common clean-up, in order: delete ``<...>`` spans, replace ``[`` by a space,
  delete ``]``, then delete the markers ``/b``, ``/p``, ``/r`` and ``/i``.

  Returns the cleaned string; whitespace is not normalised.
  """
  # Raw strings: the original '\[' / '\<' literals are invalid escape sequences
  # that newer Python versions warn about.
  bracket_span = r'\[.*?\]'
  angle_span = r'<.*?>'
  # Plain-text markers. The original also passed the angle-bracket *regex* to
  # str.replace as a literal, which can never match after the re.sub below.
  markers = ('/b', '/p', '/r', '/i')

  if method == 'semantic':
    text = re.sub(bracket_span, '', text)
    text = remove_interjection(text)

  text = re.sub(angle_span, '', text)
  text = text.replace('[', ' ').replace(']', '')
  for marker in markers:
    text = text.replace(marker, '')

  return text

In [None]:

# Word-error-rate statistics of the Whisper 2 hypotheses against the "literal"
# and the "semantic" cleaning of each ground-truth transcript.
# Five columns are written per mode: <mode>_WER, _ref, _sub, _ins, _del.
wer_suffixes = ("WER", "ref", "sub", "ins", "del")
# Reference length stored when both strings are empty (int for literal, float
# for semantic, exactly as the values were written before).
empty_reference_length = {"literal": 0, "semantic": 0.0}

for i in range(whisper2_metric_df.shape[0]):
  raw_truth = whisper2_metric_df.loc[i, 'annotated_groundTruth']
  if type(raw_truth) != str:
    continue
  print(i)

  # Ground truth: drop the angle brackets themselves, then lower-case.
  ground_truth = raw_truth.replace('<', '').replace('>', '').lower()

  # Hypothesis: a float cell (NaN) counts as an empty transcript.
  whisper_guess = whisper2_metric_df.loc[i, 'bigWhisperAnnotation']
  if type(whisper_guess) == float:
    whisper_guess = " "
  whisper_guess = remove_punctuation(whisper_guess).lower()
  hypothesis_is_empty = whisper_guess.strip() == ""

  for mode in ("literal", "semantic"):
    reference = remove_punctuation(clean_ground_truth(ground_truth, method=mode))

    if reference.strip() != "":
      alignment = jiwer.process_words(reference, whisper_guess)
      # WER is capped at 1.
      values = (
        min(1, alignment.wer),
        len(reference.split()),
        alignment.substitutions,
        alignment.insertions,
        alignment.deletions,
      )
    elif hypothesis_is_empty:
      # Nothing said, nothing transcribed: a perfect score.
      values = (0.0, empty_reference_length[mode], 0.0, 0.0, 0.0)
    else:
      # Empty reference but non-empty hypothesis.
      # NOTE(review): sub/ins/del are all stored as 1.0 and the hypothesis
      # length is used as reference length — confirm this is intended.
      values = (1.0, len(whisper_guess.split()), 1.0, 1.0, 1.0)

    for suffix, value in zip(wer_suffixes, values):
      whisper2_metric_df.loc[i, f'{mode}_{suffix}'] = value



In [None]:

# BLEU of the Whisper 2 hypotheses against the literal and semantic references;
# results go to literal_2gram_bleu / semantic_2gram_bleu.
# NOTE(review): `sentence_bleu` is not imported in the visible import cell —
# presumably nltk.translate.bleu_score.sentence_bleu; confirm and import it.
# NOTE(review): weights=(1, 1, 0, 0) do not sum to 1; BLEU-2 is commonly
# computed with (0.5, 0.5) — confirm the weighting is intended.
for i in range(whisper2_metric_df.shape[0]):
  raw_truth = whisper2_metric_df.loc[i, 'annotated_groundTruth']
  if type(raw_truth) != str:
    continue
  print(i)

  # Ground truth: drop the angle brackets themselves, then lower-case.
  ground_truth = raw_truth.replace('<', '').replace('>', '').lower()

  # Hypothesis: a float cell (NaN) counts as an empty transcript.
  whisper_guess = whisper2_metric_df.loc[i, 'bigWhisperAnnotation']
  if type(whisper_guess) == float:
    whisper_guess = " "
  whisper_guess = remove_punctuation(whisper_guess).lower()

  for mode in ("literal", "semantic"):
    reference_tokens = remove_punctuation(clean_ground_truth(ground_truth, method=mode)).split()
    whisper2_metric_df.loc[i, f'{mode}_2gram_bleu'] = sentence_bleu(
      [reference_tokens], whisper_guess.split(), weights=(1, 1, 0, 0)
    )



In [None]:
# Pre-create the BERTScore columns as float NaN.
# Initialising them with None makes the columns object-dtype; they then stay
# object-dtype when floats are written row by row, get dropped by
# groupby(...).mean(numeric_only=True) and make np.isnan fail in the
# graphing cells.
for bert_column in (
    "literal_BERT_precision",
    "literal_BERT_recall",
    "literal_BERT_F1",
    "semantic_BERT_precision",
    "semantic_BERT_recall",
    "semantic_BERT_F1",
):
    whisper2_metric_df[bert_column] = np.nan

In [None]:

# BERTScore (precision / recall / F1) of the Whisper 2 hypotheses against the
# literal and semantic references, written to <mode>_BERT_* columns.
scorer = BERTScorer(model_type='bert-base-uncased')

for i in range(whisper2_metric_df.shape[0]):
  raw_truth = whisper2_metric_df.loc[i, 'annotated_groundTruth']
  if not isinstance(raw_truth, str):
    continue
  print(i)

  # Ground truth: drop the angle brackets themselves, then lower-case.
  ground_truth = raw_truth.replace('<', '').replace('>', '').lower()

  # Hypothesis: anything that is not a string (e.g. NaN for a missing
  # transcript) counts as an empty transcript.
  whisper_guess = whisper2_metric_df.loc[i, 'bigWhisperAnnotation']
  if not isinstance(whisper_guess, str):
    whisper_guess = " "
  whisper_guess = remove_punctuation(whisper_guess).lower()

  for mode in ("literal", "semantic"):
    reference = remove_punctuation(clean_ground_truth(ground_truth, method=mode))
    # Candidates are passed first, references second.
    precision, recall, f1 = scorer.score([whisper_guess], [reference])
    whisper2_metric_df.loc[i, f"{mode}_BERT_precision"] = precision.mean().item()
    whisper2_metric_df.loc[i, f"{mode}_BERT_recall"] = recall.mean().item()
    whisper2_metric_df.loc[i, f"{mode}_BERT_F1"] = f1.mean().item()

# Written once after the loop: this cell has no resume logic, so rewriting the
# whole CSV for every row only added quadratic I/O.
whisper2_metric_df.to_csv("<path to separate whisper2 metric sv>", index=False)


# Whisper 3 Metric Calculation

In [None]:
# Working copy of the label table for the Whisper 3 metrics; the cells below
# add the WER / BLEU / BERTScore columns to this frame.
whisper3_metric_df=pd.read_csv("<path to label_df csv file>")

In [None]:

# Word-error-rate statistics of the Whisper 3 hypotheses against the "literal"
# and the "semantic" cleaning of each ground-truth transcript.
# Five columns are written per mode: <mode>_WER, _ref, _sub, _ins, _del.

# Hypothesis column. The Whisper 3 inference cell stores its transcripts in
# 'Whisper3Annotation'; the original read 'bigWhisperAnnotation', the column
# the Whisper 2 metric cells use, so both sections scored the same text.
# The old column remains the fallback for CSVs without the new one.
whisper3_hypothesis_column = (
  'Whisper3Annotation'
  if 'Whisper3Annotation' in whisper3_metric_df.columns
  else 'bigWhisperAnnotation'
)
wer_suffixes = ("WER", "ref", "sub", "ins", "del")

for i in range(whisper3_metric_df.shape[0]):
  raw_truth = whisper3_metric_df.loc[i, 'annotated_groundTruth']
  if not isinstance(raw_truth, str):
    continue
  print(i)

  # Ground truth: drop the angle brackets themselves, then lower-case.
  ground_truth = raw_truth.replace('<', '').replace('>', '').lower()

  # Hypothesis: anything that is not a string (e.g. NaN for a missing
  # transcript) counts as an empty transcript.
  whisper_guess = whisper3_metric_df.loc[i, whisper3_hypothesis_column]
  if not isinstance(whisper_guess, str):
    whisper_guess = " "
  whisper_guess = remove_punctuation(whisper_guess).lower()
  hypothesis_is_empty = whisper_guess.strip() == ""

  for mode in ("literal", "semantic"):
    reference = remove_punctuation(clean_ground_truth(ground_truth, method=mode))

    if reference.strip() != "":
      alignment = jiwer.process_words(reference, whisper_guess)
      # WER is capped at 1.
      values = (
        min(1, alignment.wer),
        len(reference.split()),
        alignment.substitutions,
        alignment.insertions,
        alignment.deletions,
      )
    elif hypothesis_is_empty:
      # Nothing said, nothing transcribed: a perfect score.
      values = (0.0, 0.0, 0.0, 0.0, 0.0)
    else:
      # Empty reference but non-empty hypothesis.
      # NOTE(review): sub/ins/del are all stored as 1.0 and the hypothesis
      # length is used as reference length — confirm this is intended.
      values = (1.0, len(whisper_guess.split()), 1.0, 1.0, 1.0)

    for suffix, value in zip(wer_suffixes, values):
      whisper3_metric_df.loc[i, f'{mode}_{suffix}'] = value



In [None]:

# BLEU of the Whisper 3 hypotheses against the literal and semantic references;
# results go to literal_2gram_bleu / semantic_2gram_bleu.
# NOTE(review): `sentence_bleu` is not imported in the visible import cell —
# presumably nltk.translate.bleu_score.sentence_bleu; confirm and import it.
# NOTE(review): weights=(1, 1, 0, 0) do not sum to 1; BLEU-2 is commonly
# computed with (0.5, 0.5) — confirm the weighting is intended.

# Hypothesis column. The Whisper 3 inference cell stores its transcripts in
# 'Whisper3Annotation'; the original read 'bigWhisperAnnotation', the column
# the Whisper 2 metric cells use. The old column remains the fallback.
whisper3_hypothesis_column = (
  'Whisper3Annotation'
  if 'Whisper3Annotation' in whisper3_metric_df.columns
  else 'bigWhisperAnnotation'
)

for i in range(whisper3_metric_df.shape[0]):
  raw_truth = whisper3_metric_df.loc[i, 'annotated_groundTruth']
  if not isinstance(raw_truth, str):
    continue
  print(i)

  # Ground truth: drop the angle brackets themselves, then lower-case.
  ground_truth = raw_truth.replace('<', '').replace('>', '').lower()

  # Hypothesis: anything that is not a string (e.g. NaN) counts as empty.
  whisper_guess = whisper3_metric_df.loc[i, whisper3_hypothesis_column]
  if not isinstance(whisper_guess, str):
    whisper_guess = " "
  whisper_guess = remove_punctuation(whisper_guess).lower()

  for mode in ("literal", "semantic"):
    reference_tokens = remove_punctuation(clean_ground_truth(ground_truth, method=mode)).split()
    whisper3_metric_df.loc[i, f'{mode}_2gram_bleu'] = sentence_bleu(
      [reference_tokens], whisper_guess.split(), weights=(1, 1, 0, 0)
    )



In [None]:
# Pre-create the BERTScore columns as float NaN.
# Initialising them with None makes the columns object-dtype; they then stay
# object-dtype when floats are written row by row, which breaks numeric
# aggregation (mean(numeric_only=True), np.isnan) later on.
for bert_column in (
    "literal_BERT_precision",
    "literal_BERT_recall",
    "literal_BERT_F1",
    "semantic_BERT_precision",
    "semantic_BERT_recall",
    "semantic_BERT_F1",
):
    whisper3_metric_df[bert_column] = np.nan

In [None]:

# BERTScore (precision / recall / F1) of the Whisper 3 hypotheses against the
# literal and semantic references, written to <mode>_BERT_* columns.
scorer = BERTScorer(model_type='bert-base-uncased')

# Hypothesis column. The Whisper 3 inference cell stores its transcripts in
# 'Whisper3Annotation'; the original read 'bigWhisperAnnotation', the column
# the Whisper 2 metric cells use. The old column remains the fallback.
whisper3_hypothesis_column = (
  'Whisper3Annotation'
  if 'Whisper3Annotation' in whisper3_metric_df.columns
  else 'bigWhisperAnnotation'
)

for i in range(whisper3_metric_df.shape[0]):
  raw_truth = whisper3_metric_df.loc[i, 'annotated_groundTruth']
  if not isinstance(raw_truth, str):
    continue
  print(i)

  # Ground truth: drop the angle brackets themselves, then lower-case.
  ground_truth = raw_truth.replace('<', '').replace('>', '').lower()

  # Hypothesis: anything that is not a string (e.g. NaN) counts as empty.
  whisper_guess = whisper3_metric_df.loc[i, whisper3_hypothesis_column]
  if not isinstance(whisper_guess, str):
    whisper_guess = " "
  whisper_guess = remove_punctuation(whisper_guess).lower()

  for mode in ("literal", "semantic"):
    reference = remove_punctuation(clean_ground_truth(ground_truth, method=mode))
    # Candidates are passed first, references second.
    precision, recall, f1 = scorer.score([whisper_guess], [reference])
    whisper3_metric_df.loc[i, f"{mode}_BERT_precision"] = precision.mean().item()
    whisper3_metric_df.loc[i, f"{mode}_BERT_recall"] = recall.mean().item()
    whisper3_metric_df.loc[i, f"{mode}_BERT_F1"] = f1.mean().item()

# Written once after the loop, to a Whisper 3 specific file: the original
# placeholder pointed at the Whisper 2 metrics CSV and would have overwritten it.
whisper3_metric_df.to_csv("<path to separate whisper3 metric csv>", index=False)


# Whisper 2 Results Graphing

## Literal Overview Graph

In [None]:
# Mean literal WER / BLEU / BERTScore-F1 per speech category, rounded to three
# decimals. "YesStutteredWords" is taken from the first group of a groupby on
# manual_fluent; every other category from the second group of a groupby on
# its own flag column.
# NOTE(review): the groups are selected by position (0 / 1), which presumably
# corresponds to flag values 0 / 1 — confirm every flag column has both values.
literal_metric_columns = ["literal_WER","literal_2gram_bleu","literal_BERT_F1"]
literal_category_keys = ["YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"]

literal_rows = []
for category in literal_category_keys:
    if category == "YesStutteredWords":
        group_column, group_position = "manual_fluent", 0
    else:
        group_column, group_position = category, 1
    group_df = whisper2_metric_df.groupby([group_column]).mean(numeric_only=True)
    literal_rows.append(group_df.iloc[group_position, :][literal_metric_columns].values)

literal_WER, literal_2gram_bleu, literal_BERT_F1 = (
    [round(row[column_position], 3) for row in literal_rows]
    for column_position in range(len(literal_metric_columns))
)

In [None]:
# Bar chart: literal WER for each speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {'Literal WER': tuple(literal_WER)}

width = 0.5  # bar width
x = np.arange(len(speech_categories))  # one tick position per category

fig, ax = plt.subplots()
fig.set_figwidth(7)
fig.set_figheight(5)

# One bar series per entry of speech_metrics, shifted by one bar width each.
for multiplier, (attribute, measurement) in enumerate(speech_metrics.items()):
    rects = ax.bar(x + width * multiplier, measurement, width, label=attribute, color="darkorange")
    ax.bar_label(rects, padding=3, fontsize='large')

ax.set_title('Speech to Text WER for Literal\n Transcriptions of each Stuttering Type', fontsize='x-large')
ax.set_ylabel('Metric Score', fontsize='large')
ax.set_xticks(x, speech_categories, fontsize='large', rotation=90)
ax.set_ylim(0, 0.6)
ax.legend(loc='upper left', ncols=1, fontsize='large')

plt.show()

In [None]:
# Bar chart: literal BLEU-2 for each speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {'Literal BLEU-2': tuple(literal_2gram_bleu)}

width = 0.5  # bar width
x = np.arange(len(speech_categories))  # one tick position per category

fig, ax = plt.subplots()
fig.set_figwidth(7)
fig.set_figheight(5)

# One bar series per entry of speech_metrics, shifted by one bar width each.
for multiplier, (attribute, measurement) in enumerate(speech_metrics.items()):
    rects = ax.bar(x + width * multiplier, measurement, width, label=attribute, color="green")
    ax.bar_label(rects, padding=3, fontsize='large')

ax.set_title('Speech to Text BLEU-2 for Literal\n Transcriptions of each Stuttering Type', fontsize='x-large')
ax.set_ylabel('Metric Score', fontsize='large')
ax.set_xticks(x, speech_categories, fontsize='large', rotation=90)
ax.set_ylim(0, 0.8)
ax.legend(loc='upper left', ncols=1, fontsize='large')

plt.show()

In [None]:
# Bar chart: literal BERTScore F1 for each speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {'Literal BERTScore F1': tuple(literal_BERT_F1)}

width = 0.5  # bar width
x = np.arange(len(speech_categories))  # one tick position per category

fig, ax = plt.subplots()
fig.set_figwidth(7)
fig.set_figheight(5)

# One bar series per entry of speech_metrics, shifted by one bar width each.
for multiplier, (attribute, measurement) in enumerate(speech_metrics.items()):
    rects = ax.bar(x + width * multiplier, measurement, width, label=attribute, color="C0")
    ax.bar_label(rects, padding=3, fontsize='large')

ax.set_title('Speech to Text BERTScore F1 for Literal\n Transcriptions of each Stuttering Type', fontsize='x-large')
ax.set_ylabel('Metric Score', fontsize='large')
ax.set_xticks(x, speech_categories, fontsize='large', rotation=90)
ax.set_ylim(0, 1.05)
ax.legend(loc='upper left', ncols=1, fontsize='large')

plt.show()

## Semantic Overview Graph

In [None]:
# Mean semantic WER / BLEU / BERTScore-F1 per speech category, rounded to three
# decimals. "YesStutteredWords" is taken from the first group of a groupby on
# manual_fluent; every other category from the second group of a groupby on
# its own flag column.
# NOTE(review): the groups are selected by position (0 / 1), which presumably
# corresponds to flag values 0 / 1 — confirm every flag column has both values.
semantic_metric_columns = ["semantic_WER","semantic_2gram_bleu","semantic_BERT_F1"]
semantic_category_keys = ["YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"]

semantic_rows = []
for category in semantic_category_keys:
    if category == "YesStutteredWords":
        group_column, group_position = "manual_fluent", 0
    else:
        group_column, group_position = category, 1
    group_df = whisper2_metric_df.groupby([group_column]).mean(numeric_only=True)
    semantic_rows.append(group_df.iloc[group_position, :][semantic_metric_columns].values)

semantic_WER, semantic_2gram_bleu, semantic_BERT_F1 = (
    [round(row[column_position], 3) for row in semantic_rows]
    for column_position in range(len(semantic_metric_columns))
)

In [None]:
# Bar chart: semantic WER for each speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {'Semantic WER': tuple(semantic_WER)}

width = 0.5  # bar width
x = np.arange(len(speech_categories))  # one tick position per category

fig, ax = plt.subplots()
fig.set_figwidth(7)
fig.set_figheight(5)

# One bar series per entry of speech_metrics, shifted by one bar width each.
for multiplier, (attribute, measurement) in enumerate(speech_metrics.items()):
    rects = ax.bar(x + width * multiplier, measurement, width, label=attribute, color="darkorange")
    ax.bar_label(rects, padding=3, fontsize='large')

ax.set_title('Speech to Text WER for Semantic\n Transcriptions of each Stuttering Type', fontsize='x-large')
ax.set_ylabel('Metric Score', fontsize='large')
ax.set_xticks(x, speech_categories, fontsize='large', rotation=90)
ax.set_ylim(0, 0.6)
ax.legend(loc='upper left', ncols=1, fontsize='large')

plt.show()

In [None]:
# Bar chart: semantic BLEU-2 for each speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {'Semantic BLEU-2': tuple(semantic_2gram_bleu)}

width = 0.5  # bar width
x = np.arange(len(speech_categories))  # one tick position per category

fig, ax = plt.subplots()
fig.set_figwidth(7)
fig.set_figheight(5)

# One bar series per entry of speech_metrics, shifted by one bar width each.
for multiplier, (attribute, measurement) in enumerate(speech_metrics.items()):
    rects = ax.bar(x + width * multiplier, measurement, width, label=attribute, color="green")
    ax.bar_label(rects, padding=3, fontsize='large')

ax.set_title('Speech to Text BLEU-2 for Semantic\n Transcriptions of each Stuttering Type', fontsize='x-large')
ax.set_ylabel('Metric Score', fontsize='large')
ax.set_xticks(x, speech_categories, fontsize='large', rotation=90)
ax.set_ylim(0, 0.8)
ax.legend(loc='upper left', ncols=1, fontsize='large')

plt.show()

In [None]:
# Bar chart: semantic BERTScore F1 for each speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {'Semantic BERT F1': tuple(semantic_BERT_F1)}

width = 0.5  # bar width
x = np.arange(len(speech_categories))  # one tick position per category

fig, ax = plt.subplots()
fig.set_figwidth(7)
fig.set_figheight(5)

# One bar series per entry of speech_metrics, shifted by one bar width each.
for multiplier, (attribute, measurement) in enumerate(speech_metrics.items()):
    rects = ax.bar(x + width * multiplier, measurement, width, label=attribute, color="C0")
    ax.bar_label(rects, padding=3, fontsize='large')

ax.set_title('Speech to Text BERTScore F1 for Semantic\n Transcriptions of each Stuttering Type', fontsize='x-large')
ax.set_ylabel('Metric Score', fontsize='large')
ax.set_xticks(x, speech_categories, fontsize='large', rotation=90)
ax.set_ylim(0, 1.05)
ax.legend(loc='upper left', ncols=1, fontsize='large')

plt.show()



## Literal Substitutions, Insertions, and Deletions

In [None]:
def get_avg_literal_wer_comp(name: str, metric: str):
    """Mean of ``metric / literal_ref`` over the rows of one speech category.

    ``name`` is either ``"YesStutteredWords"`` (rows of ``whisper2_metric_df``
    with ``manual_fluent == 0``) or the name of a flag column (rows where that
    column equals 1). Rows whose ``metric`` value is NaN are dropped before the
    per-row ratio is averaged.
    """
    frame = whisper2_metric_df
    if name == "YesStutteredWords":
        selected_rows = frame["manual_fluent"] == 0
    else:
        selected_rows = frame[name] == 1

    subset = frame[selected_rows].dropna(subset=[metric])
    return (subset[metric] / subset["literal_ref"]).mean()
    


In [None]:
# Relative substitutions / insertions / deletions (literal reference) for each
# speech category, rounded to three decimals. "YesStutteredWords" is handled
# inside get_avg_literal_wer_comp, so every category goes through the same call.
literal_component_categories = ["YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"]

literal_sub = [round(get_avg_literal_wer_comp(category, "literal_sub"), 3) for category in literal_component_categories]
literal_ins = [round(get_avg_literal_wer_comp(category, "literal_ins"), 3) for category in literal_component_categories]
literal_del = [round(get_avg_literal_wer_comp(category, "literal_del"), 3) for category in literal_component_categories]

In [None]:
# Relative substitutions / insertions / deletions (literal reference) for each
# speech category, rounded to three decimals.
# NOTE(review): this cell recomputes the same three lists as the cell before it.
literal_component_metrics = ("literal_sub", "literal_ins", "literal_del")
literal_component_rows = [
    [round(get_avg_literal_wer_comp(category, metric), 3) for metric in literal_component_metrics]
    for category in ["YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"]
]

# Transpose: one list per metric, ordered by category.
literal_sub, literal_ins, literal_del = (list(column) for column in zip(*literal_component_rows))

In [None]:
# Grouped bar chart: relative substitutions, insertions and deletions (literal
# reference) for each speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Literal Substitutions': tuple(literal_sub),
    'Literal Insertions': tuple(literal_ins),
    'Literal Deletions': tuple(literal_del),
}

width = 0.3  # bar width
x = np.arange(len(speech_categories))  # left-most bar position per category

fig, ax = plt.subplots()
fig.set_figwidth(16)
fig.set_figheight(5)

# One bar series per metric, each shifted right by one bar width.
for multiplier, (attribute, measurement) in enumerate(speech_metrics.items()):
    rects = ax.bar(x + width * multiplier, measurement, width, label=attribute)
    ax.bar_label(rects, padding=3)

ax.set_title('Speech to Text Relative Substitutions, Insertions, and Deletions for Literal Transcriptions of each Stuttering Type', fontsize='x-large')
ax.set_ylabel('Metric Score')
# Ticks sit under the middle bar of each three-bar group.
ax.set_xticks(x + width, speech_categories, fontsize='large')
ax.set_ylim(0, 0.55)
ax.legend(loc='upper left', ncols=1, fontsize='x-large')

plt.show()

## Semantic Substitutions, Insertions, and Deletions


In [None]:
def get_avg_semantic_wer_comp(name: str, metric: str):
    """Mean of ``metric / semantic_ref`` over the rows of one speech category.

    ``name`` is either ``"YesStutteredWords"`` (rows of ``whisper2_metric_df``
    with ``manual_fluent == 0``) or the name of a flag column (rows where that
    column equals 1). Rows whose ``metric`` value is NaN are dropped before the
    per-row ratio is averaged.
    """
    frame = whisper2_metric_df
    if name == "YesStutteredWords":
        selected_rows = frame["manual_fluent"] == 0
    else:
        selected_rows = frame[name] == 1

    subset = frame[selected_rows].dropna(subset=[metric])
    return (subset[metric] / subset["semantic_ref"]).mean()


In [None]:

# Relative substitutions / insertions / deletions (semantic reference) for each
# speech category, rounded to three decimals. "YesStutteredWords" is handled
# inside get_avg_semantic_wer_comp, so every category goes through the same call.
semantic_component_categories = ["YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"]

semantic_sub = [round(get_avg_semantic_wer_comp(category, "semantic_sub"), 3) for category in semantic_component_categories]
semantic_ins = [round(get_avg_semantic_wer_comp(category, "semantic_ins"), 3) for category in semantic_component_categories]
semantic_del = [round(get_avg_semantic_wer_comp(category, "semantic_del"), 3) for category in semantic_component_categories]

In [None]:

# Grouped bar chart: relative substitutions, insertions and deletions (semantic
# reference) for each speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Semantic Substitutions': tuple(semantic_sub),
    'Semantic Insertions': tuple(semantic_ins),
    'Semantic Deletions': tuple(semantic_del),
}

width = 0.3  # bar width
x = np.arange(len(speech_categories))  # left-most bar position per category

fig, ax = plt.subplots()
fig.set_figwidth(16)
fig.set_figheight(5)

# One bar series per metric, each shifted right by one bar width.
for multiplier, (attribute, measurement) in enumerate(speech_metrics.items()):
    rects = ax.bar(x + width * multiplier, measurement, width, label=attribute)
    ax.bar_label(rects, padding=3)

ax.set_title('Speech to Text Relative Substitutions, Insertions, and Deletions for Semantic Transcriptions of each Stuttering Type', fontsize='x-large')
ax.set_ylabel('Metric Score')
# Ticks sit under the middle bar of each three-bar group.
ax.set_xticks(x + width, speech_categories, fontsize='large')
ax.set_ylim(0, 0.65)
ax.legend(loc='upper left', ncols=1, fontsize='x-large')

plt.show()

## Comparing Semantic vs Literal

In [None]:
def get_nd_array(columnval, metric_df=None):
    """Return the non-missing values of one metric column for non-fluent clips.

    Args:
        columnval: Name of the metric column to extract.
        metric_df: DataFrame containing ``columnval`` and a "manual_fluent"
            column. When omitted, the notebook-level ``whisper2_metric_df``
            is used, which is the behaviour of the original one-argument call.

    Returns:
        1-D numpy array with the values of ``columnval`` for all rows whose
        "manual_fluent" is not 1, with missing values removed.
    """
    if metric_df is None:
        metric_df = whisper2_metric_df
    non_fluent_values = metric_df.loc[metric_df["manual_fluent"] != 1, columnval]
    return non_fluent_values.dropna().to_numpy()


In [None]:
# Bar chart comparing the mean literal_WER and semantic_WER of the rows returned by get_nd_array.
speech_categories = ("Literal","Semantic")
speech_metrics = {
    'Average WER': (np.mean(get_nd_array("literal_WER")),np.mean(get_nd_array("semantic_WER"))),
}

x = np.arange(len(speech_categories))  # one bar position per transcription style
width = 0.5  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(4, 6)

for series_name, series_values in speech_metrics.items():
    bars = ax.bar(x, series_values, width, label=series_name,color="darkorange")
    ax.bar_label(bars, padding=1,fontsize='x-large',fmt='%.2f')

ax.set_ylabel('Metric Score',fontsize="x-large")
ax.set_title("WER of Semantic vs Literal\n Transcription Speech to Text\n for Stuttered Speech",fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='x-large')
ax.legend(loc='upper left', ncols=3,fontsize='x-large')
ax.set_ylim(0, 0.6)

plt.show()

In [None]:
# Bar chart comparing the mean literal and semantic 2-gram BLEU of the rows returned by get_nd_array.
speech_categories = ("Literal","Semantic")
speech_metrics = {
    'Average 2 Gram BLEU': (np.mean(get_nd_array("literal_2gram_bleu")),np.mean(get_nd_array("semantic_2gram_bleu"))),
}

x = np.arange(len(speech_categories))  # one bar position per transcription style
width = 0.6  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(4, 6)

for series_name, series_values in speech_metrics.items():
    bars = ax.bar(x, series_values, width, label=series_name,color="green")
    ax.bar_label(bars, padding=1,fontsize='x-large',fmt='%.2f')

ax.set_ylabel('Metric Score',fontsize="x-large")
ax.set_title("BLEU-2 of Semantic vs Literal\n Transcription Speech to Text\n for Stuttered Speech",fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='x-large')
ax.legend(loc='upper left', ncols=3,fontsize='x-large')
ax.set_ylim(0, 0.55)

plt.show()

In [None]:
# Bar chart comparing the mean literal and semantic BERT F1 of the rows returned by get_nd_array.
speech_categories = ("Literal","Semantic")
speech_metrics = {
    'Average BERT F1': (np.mean(get_nd_array("literal_BERT_F1")),np.mean(get_nd_array("semantic_BERT_F1"))),
}

x = np.arange(len(speech_categories))  # one bar position per transcription style
width = 0.6  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(4, 6)

for series_name, series_values in speech_metrics.items():
    bars = ax.bar(x, series_values, width, label=series_name,color="C0")
    ax.bar_label(bars, padding=1,fontsize='x-large',fmt='%.2f')

ax.set_ylabel('Metric Score',fontsize="x-large")
ax.set_title("BERT of Semantic vs Literal\n Transcription Speech to Text\n for Stuttered Speech",fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='x-large')
ax.legend(loc='upper left', ncols=3,fontsize='x-large')
ax.set_ylim(0, 0.9)

plt.show()

# Whisper 3 Results Graphing

## Literal Overview Graph

In [None]:
# Mean literal WER / 2-gram BLEU / BERT F1 per speech category for whisper3_metric_df,
# rounded to three decimals.
# Rows are selected with explicit boolean masks (flag == 1, or manual_fluent == 0 for
# the "YesStutteredWords" pseudo-category) rather than by position in a groupby
# result, so the intended rows are used even when a flag column does not consist of
# exactly the two groups 0 and 1.
literal_metric_cols=["literal_WER","literal_2gram_bleu","literal_BERT_F1"]
literal_category_means=[]

for label in ["YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"]:
    if label=="YesStutteredWords":
        selected_rows=whisper3_metric_df["manual_fluent"]==0
    else:
        selected_rows=whisper3_metric_df[label]==1
    # Column-wise mean (NaN values are skipped) over the selected clips.
    literal_category_means.append(whisper3_metric_df.loc[selected_rows,literal_metric_cols].mean())

literal_WER=[round(means["literal_WER"], 3) for means in literal_category_means]
literal_2gram_bleu=[round(means["literal_2gram_bleu"], 3) for means in literal_category_means]
literal_BERT_F1=[round(means["literal_BERT_F1"], 3) for means in literal_category_means]

In [None]:
# Bar chart of literal_WER, one bar per speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Literal WER': tuple(literal_WER),
}

x = np.arange(len(speech_categories))  # one bar position per category
width = 0.5  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

# With a single series the per-series shift is zero, so bars sit at x.
for series_idx, (series_name, series_values) in enumerate(speech_metrics.items()):
    bars = ax.bar(x + width * series_idx, series_values, width, label=series_name,color="darkorange")
    ax.bar_label(bars, padding=3,fontsize='large')

ax.set_ylabel('Metric Score',fontsize='large')
ax.set_title('Speech to Text WER for Literal\n Transcriptions of each Stuttering Type',fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='large',rotation=90)
ax.legend(loc='upper left', ncols=1,fontsize='large')
ax.set_ylim(0, 0.6)

plt.show()

In [None]:
# Bar chart of literal_2gram_bleu, one bar per speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Literal BLEU-2': tuple(literal_2gram_bleu),
}

x = np.arange(len(speech_categories))  # one bar position per category
width = 0.5  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

# With a single series the per-series shift is zero, so bars sit at x.
for series_idx, (series_name, series_values) in enumerate(speech_metrics.items()):
    bars = ax.bar(x + width * series_idx, series_values, width, label=series_name,color="green")
    ax.bar_label(bars, padding=3,fontsize='large')

ax.set_ylabel('Metric Score',fontsize='large')
ax.set_title('Speech to Text BLEU-2 for Literal\n Transcriptions of each Stuttering Type',fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='large',rotation=90)
ax.legend(loc='upper left', ncols=1,fontsize='large')
ax.set_ylim(0, 0.8)

plt.show()

In [None]:
# Bar chart of literal_BERT_F1, one bar per speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Literal BERTScore F1': tuple(literal_BERT_F1),
}

x = np.arange(len(speech_categories))  # one bar position per category
width = 0.5  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

# With a single series the per-series shift is zero, so bars sit at x.
for series_idx, (series_name, series_values) in enumerate(speech_metrics.items()):
    bars = ax.bar(x + width * series_idx, series_values, width, label=series_name,color="C0")
    ax.bar_label(bars, padding=3,fontsize='large')

ax.set_ylabel('Metric Score',fontsize='large')
ax.set_title('Speech to Text BERTScore F1 for Literal\n Transcriptions of each Stuttering Type',fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='large',rotation=90)
ax.legend(loc='upper left', ncols=1,fontsize='large')
ax.set_ylim(0, 1.05)

plt.show()

## Semantic Overview Graph

In [None]:
# Mean semantic WER / 2-gram BLEU / BERT F1 per speech category for whisper3_metric_df,
# rounded to three decimals.
# Rows are selected with explicit boolean masks (flag == 1, or manual_fluent == 0 for
# the "YesStutteredWords" pseudo-category) rather than by position in a groupby
# result, so the intended rows are used even when a flag column does not consist of
# exactly the two groups 0 and 1.
semantic_metric_cols=["semantic_WER","semantic_2gram_bleu","semantic_BERT_F1"]
semantic_category_means=[]

for label in ["YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"]:
    if label=="YesStutteredWords":
        selected_rows=whisper3_metric_df["manual_fluent"]==0
    else:
        selected_rows=whisper3_metric_df[label]==1
    # Column-wise mean (NaN values are skipped) over the selected clips.
    semantic_category_means.append(whisper3_metric_df.loc[selected_rows,semantic_metric_cols].mean())

semantic_WER=[round(means["semantic_WER"], 3) for means in semantic_category_means]
semantic_2gram_bleu=[round(means["semantic_2gram_bleu"], 3) for means in semantic_category_means]
semantic_BERT_F1=[round(means["semantic_BERT_F1"], 3) for means in semantic_category_means]

In [None]:
# Bar chart of semantic_WER, one bar per speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Semantic WER': tuple(semantic_WER),
}

x = np.arange(len(speech_categories))  # one bar position per category
width = 0.5  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

# With a single series the per-series shift is zero, so bars sit at x.
for series_idx, (series_name, series_values) in enumerate(speech_metrics.items()):
    bars = ax.bar(x + width * series_idx, series_values, width, label=series_name,color="darkorange")
    ax.bar_label(bars, padding=3,fontsize='large')

ax.set_ylabel('Metric Score',fontsize='large')
ax.set_title('Speech to Text WER for Semantic\n Transcriptions of each Stuttering Type',fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='large',rotation=90)
ax.legend(loc='upper left', ncols=1,fontsize='large')
ax.set_ylim(0, 0.6)

plt.show()

In [None]:
# Bar chart of semantic_2gram_bleu, one bar per speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Semantic BLEU-2': tuple(semantic_2gram_bleu),
}

x = np.arange(len(speech_categories))  # one bar position per category
width = 0.5  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

# With a single series the per-series shift is zero, so bars sit at x.
for series_idx, (series_name, series_values) in enumerate(speech_metrics.items()):
    bars = ax.bar(x + width * series_idx, series_values, width, label=series_name,color="green")
    ax.bar_label(bars, padding=3,fontsize='large')

ax.set_ylabel('Metric Score',fontsize='large')
ax.set_title('Speech to Text BLEU-2 for Semantic\n Transcriptions of each Stuttering Type',fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='large',rotation=90)
ax.legend(loc='upper left', ncols=1,fontsize='large')
ax.set_ylim(0, 0.8)

plt.show()

In [None]:
# Bar chart of semantic_BERT_F1, one bar per speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Semantic BERT F1': tuple(semantic_BERT_F1),
}

x = np.arange(len(speech_categories))  # one bar position per category
width = 0.5  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

# With a single series the per-series shift is zero, so bars sit at x.
for series_idx, (series_name, series_values) in enumerate(speech_metrics.items()):
    bars = ax.bar(x + width * series_idx, series_values, width, label=series_name,color="C0")
    ax.bar_label(bars, padding=3,fontsize='large')

ax.set_ylabel('Metric Score',fontsize='large')
ax.set_title('Speech to Text BERTScore F1 for Semantic\n Transcriptions of each Stuttering Type',fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='large',rotation=90)
ax.legend(loc='upper left', ncols=1,fontsize='large')
ax.set_ylim(0, 1.05)

plt.show()



## Literal Substitutions, Insertions, and Deletions

In [None]:
def get_avg_literal_wer_comp(name: str, metric: str, metric_df=None):
    """Average a literal WER component relative to the "literal_ref" column.

    Args:
        name: Flag column selecting the clips (rows where the column equals 1),
            or the pseudo-label "YesStutteredWords", which selects rows where
            "manual_fluent" equals 0.
        metric: Column holding the component to average (the notebook passes
            "literal_sub", "literal_ins" and "literal_del").
        metric_df: DataFrame to read from. When omitted, the notebook-level
            ``whisper3_metric_df`` is used, which is the behaviour of the
            original two-argument call.

    Returns:
        Mean of ``metric / literal_ref`` over the selected rows, ignoring rows
        whose ``metric`` value is missing.
    """
    if metric_df is None:
        metric_df = whisper3_metric_df

    if name == "YesStutteredWords":
        selected_rows = metric_df["manual_fluent"] == 0
    else:
        selected_rows = metric_df[name] == 1

    usable = metric_df[selected_rows].dropna(subset=[metric])
    return (usable[metric] / usable["literal_ref"]).mean()
    


In [None]:
# Average relative substitutions / insertions / deletions of the literal
# transcripts, one value per speech category, rounded to three decimals.
# get_avg_literal_wer_comp resolves the "YesStutteredWords" pseudo-label itself,
# so every label goes through the same call without per-label branching.
stutter_labels=["YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"]

literal_sub=[round(get_avg_literal_wer_comp(label,"literal_sub"), 3) for label in stutter_labels]
literal_ins=[round(get_avg_literal_wer_comp(label,"literal_ins"), 3) for label in stutter_labels]
literal_del=[round(get_avg_literal_wer_comp(label,"literal_del"), 3) for label in stutter_labels]

In [None]:
# Recomputes literal_sub / literal_ins / literal_del: for each category label the
# value returned by get_avg_literal_wer_comp is rounded to three decimals and stored.
literal_sub, literal_ins, literal_del = [], [], []

for label in ("YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"):
    literal_sub.append(round(get_avg_literal_wer_comp(label,"literal_sub"), 3))
    literal_ins.append(round(get_avg_literal_wer_comp(label,"literal_ins"), 3))
    literal_del.append(round(get_avg_literal_wer_comp(label,"literal_del"), 3))

In [None]:
# Grouped bar chart of literal_sub / literal_ins / literal_del per speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Literal Substitutions': tuple(literal_sub),
    'Literal Insertions': tuple(literal_ins),
    'Literal Deletions': tuple(literal_del),
}

x = np.arange(len(speech_categories))  # left-most bar position of each category group
width = 0.3  # width of a single bar

fig, ax = plt.subplots()
fig.set_size_inches(16, 5)

# Each successive series is shifted right by one bar width so the bars sit side by side.
for series_idx, (series_name, series_values) in enumerate(speech_metrics.items()):
    bars = ax.bar(x + width * series_idx, series_values, width, label=series_name)
    ax.bar_label(bars, padding=3)

ax.set_ylabel('Metric Score')
ax.set_title('Speech to Text Relative Substitutions, Insertions, and Deletions for Literal Transcriptions of each Stuttering Type',fontsize='x-large')
# Ticks are placed one bar width to the right, i.e. under the middle bar of each group.
ax.set_xticks(x+width, speech_categories,fontsize='large')
ax.legend(loc='upper left', ncols=1,fontsize='x-large')
ax.set_ylim(0, 0.55)

plt.show()

## Semantic Substitutions, Insertions, and Deletions


In [None]:
def get_avg_semantic_wer_comp(name: str, metric: str, metric_df=None):
    """Average a semantic WER component relative to the "semantic_ref" column.

    Args:
        name: Flag column selecting the clips (rows where the column equals 1),
            or the pseudo-label "YesStutteredWords", which selects rows where
            "manual_fluent" equals 0.
        metric: Column holding the component to average (the notebook passes
            "semantic_sub", "semantic_ins" and "semantic_del").
        metric_df: DataFrame to read from. When omitted, the notebook-level
            ``whisper3_metric_df`` is used, which is the behaviour of the
            original two-argument call.

    Returns:
        Mean of ``metric / semantic_ref`` over the selected rows, ignoring rows
        whose ``metric`` value is missing.
    """
    if metric_df is None:
        metric_df = whisper3_metric_df

    if name == "YesStutteredWords":
        selected_rows = metric_df["manual_fluent"] == 0
    else:
        selected_rows = metric_df[name] == 1

    usable = metric_df[selected_rows].dropna(subset=[metric])
    return (usable[metric] / usable["semantic_ref"]).mean()


In [None]:

# Average relative substitutions / insertions / deletions of the semantic
# transcripts, one value per speech category, rounded to three decimals.
# get_avg_semantic_wer_comp resolves the "YesStutteredWords" pseudo-label itself,
# so every label goes through the same call without per-label branching.
stutter_labels=["YesStutteredWords","manual_fluent","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject"]

semantic_sub=[round(get_avg_semantic_wer_comp(label,"semantic_sub"), 3) for label in stutter_labels]
semantic_ins=[round(get_avg_semantic_wer_comp(label,"semantic_ins"), 3) for label in stutter_labels]
semantic_del=[round(get_avg_semantic_wer_comp(label,"semantic_del"), 3) for label in stutter_labels]

In [None]:

# Grouped bar chart of semantic_sub / semantic_ins / semantic_del per speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Semantic Substitutions': tuple(semantic_sub),
    'Semantic Insertions': tuple(semantic_ins),
    'Semantic Deletions': tuple(semantic_del),
}

x = np.arange(len(speech_categories))  # left-most bar position of each category group
width = 0.3  # width of a single bar

fig, ax = plt.subplots()
fig.set_size_inches(16, 5)

# Each successive series is shifted right by one bar width so the bars sit side by side.
for series_idx, (series_name, series_values) in enumerate(speech_metrics.items()):
    bars = ax.bar(x + width * series_idx, series_values, width, label=series_name)
    ax.bar_label(bars, padding=3)

ax.set_ylabel('Metric Score')
ax.set_title('Speech to Text Relative Substitutions, Insertions, and Deletions for Semantic Transcriptions of each Stuttering Type',fontsize='x-large')
# Ticks are placed one bar width to the right, i.e. under the middle bar of each group.
ax.set_xticks(x+width, speech_categories,fontsize='large')
ax.legend(loc='upper left', ncols=1,fontsize='x-large')
ax.set_ylim(0, 0.65)

plt.show()

## Comparing Semantic vs Literal

In [None]:
def get_nd_array(columnval, metric_df=None):
    """Return the non-missing values of one metric column for non-fluent clips.

    Args:
        columnval: Name of the metric column to extract.
        metric_df: DataFrame containing ``columnval`` and a "manual_fluent"
            column. When omitted, the notebook-level ``whisper3_metric_df``
            is used, which is the behaviour of the original one-argument call.

    Returns:
        1-D numpy array with the values of ``columnval`` for all rows whose
        "manual_fluent" is not 1, with missing values removed.
    """
    if metric_df is None:
        metric_df = whisper3_metric_df
    non_fluent_values = metric_df.loc[metric_df["manual_fluent"] != 1, columnval]
    return non_fluent_values.dropna().to_numpy()


In [None]:
# Bar chart comparing the mean literal_WER and semantic_WER of the rows returned by get_nd_array.
speech_categories = ("Literal","Semantic")
speech_metrics = {
    'Average WER': (np.mean(get_nd_array("literal_WER")),np.mean(get_nd_array("semantic_WER"))),
}

x = np.arange(len(speech_categories))  # one bar position per transcription style
width = 0.5  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(4, 6)

for series_name, series_values in speech_metrics.items():
    bars = ax.bar(x, series_values, width, label=series_name,color="darkorange")
    ax.bar_label(bars, padding=1,fontsize='x-large',fmt='%.2f')

ax.set_ylabel('Metric Score',fontsize="x-large")
ax.set_title("WER of Semantic vs Literal\n Transcription Speech to Text\n for Stuttered Speech",fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='x-large')
ax.legend(loc='upper left', ncols=3,fontsize='x-large')
ax.set_ylim(0, 0.6)

plt.show()

In [None]:
# Bar chart comparing the mean literal and semantic 2-gram BLEU of the rows returned by get_nd_array.
speech_categories = ("Literal","Semantic")
speech_metrics = {
    'Average 2 Gram BLEU': (np.mean(get_nd_array("literal_2gram_bleu")),np.mean(get_nd_array("semantic_2gram_bleu"))),
}

x = np.arange(len(speech_categories))  # one bar position per transcription style
width = 0.6  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(4, 6)

for series_name, series_values in speech_metrics.items():
    bars = ax.bar(x, series_values, width, label=series_name,color="green")
    ax.bar_label(bars, padding=1,fontsize='x-large',fmt='%.2f')

ax.set_ylabel('Metric Score',fontsize="x-large")
ax.set_title("BLEU-2 of Semantic vs Literal\n Transcription Speech to Text\n for Stuttered Speech",fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='x-large')
ax.legend(loc='upper left', ncols=3,fontsize='x-large')
ax.set_ylim(0, 0.55)

plt.show()

In [None]:
# Bar chart comparing the mean literal and semantic BERT F1 of the rows returned by get_nd_array.
speech_categories = ("Literal","Semantic")
speech_metrics = {
    'Average BERT F1': (np.mean(get_nd_array("literal_BERT_F1")),np.mean(get_nd_array("semantic_BERT_F1"))),
}

x = np.arange(len(speech_categories))  # one bar position per transcription style
width = 0.6  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(4, 6)

for series_name, series_values in speech_metrics.items():
    bars = ax.bar(x, series_values, width, label=series_name,color="C0")
    ax.bar_label(bars, padding=1,fontsize='x-large',fmt='%.2f')

ax.set_ylabel('Metric Score',fontsize="x-large")
ax.set_title("BERT of Semantic vs Literal\n Transcription Speech to Text\n for Stuttered Speech",fontsize='x-large')
ax.set_xticks(x, speech_categories,fontsize='x-large')
ax.legend(loc='upper left', ncols=3,fontsize='x-large')
ax.set_ylim(0, 0.9)

plt.show()

# Hallucination Graphing

Hallucinations were labeled manually; the labels are read from the hallucination CSV loaded below.

In [None]:
# Load the hallucination table used by the remaining cells of this section.
hallu_df = pd.read_csv(filepath_or_buffer="<path to hallucination df csv>")

In [None]:
# Set "hallucination_binary" to 1 on every row whose "manual_hallucination" is 1.
# A single boolean-mask assignment replaces the row-by-row loop; unlike positional
# labels from range(), it does not depend on hallu_df having a default 0..n-1 index.
hallu_df.loc[hallu_df["manual_hallucination"]==1,"hallucination_binary"]=1

In [None]:
def get_avg_hallucinations(name: str, metric: str, hallucination_df=None):
    """Average a hallucination column over the clips of one speech category.

    Args:
        name: Flag column selecting the clips (rows where the column equals 1),
            or the pseudo-label "NoStutteredWords", which selects rows where
            "stutter_present" equals 0.
        metric: Column to average (the notebook passes "hallucination_binary").
        hallucination_df: DataFrame to read from. When omitted, the
            notebook-level ``hallu_df`` is used, which is the behaviour of the
            original two-argument call.

    Returns:
        Mean of ``metric`` (cast to float) over the selected rows, ignoring
        rows whose ``metric`` value is missing.
    """
    if hallucination_df is None:
        hallucination_df = hallu_df

    if name == "NoStutteredWords":
        selected_rows = hallucination_df["stutter_present"] == 0
    else:
        selected_rows = hallucination_df[name] == 1

    usable = hallucination_df[selected_rows].dropna(subset=[metric])
    return usable[metric].astype(float).mean()


In [None]:

# Mean of the "hallucination_binary" column for each category label, as returned by
# get_avg_hallucinations and rounded to three decimals.
hallucination_binary=[
    round(get_avg_hallucinations(label,"hallucination_binary"), 3)
    for label in ("stutter_present","NoStutteredWords","manual_prolongation","manual_block","manual_soundRep","manual_wordRep","manual_interject")
]



In [None]:
# Bar chart of hallucination_binary, one bar per speech category.
speech_categories = ("Stutter","Fluent","Prolongation","Block","Sound\nRepetition","Word\nRepetition","Interjection")
speech_metrics = {
    'Hallucination Frequency': tuple(hallucination_binary),
}

x = np.arange(len(speech_categories))  # one bar position per category
width = 0.5  # bar width

fig, ax = plt.subplots()
fig.set_size_inches(7, 5)

# With a single series the per-series shift is zero, so bars sit at x.
for series_idx, (series_name, series_values) in enumerate(speech_metrics.items()):
    bars = ax.bar(x + width * series_idx, series_values, width, label=series_name)
    ax.bar_label(bars, padding=1,fontsize="large")

ax.set_ylabel('Metric Score')
ax.set_title('WhisperV2 Hallucination Frequency',fontsize='xx-large')
ax.set_xticks(x, speech_categories,fontsize='large',rotation=90)
ax.legend(loc='upper left', ncols=1,fontsize='large')
ax.set_ylim(0, 0.25)

plt.show()