# <b><span style="color:#845EC2"> Results and Analysis:</span></b> <b><span style="color:#D65DB1"> One word transcriptions</span></b>

<!-- <b><span style="color:#d65db1"> ★ ★ ★ ★ </span></b>

<b><span style="color:#de7dc1"> ★ ★ ★ ★  </span></b>

<b><span style="color:#e69ed0"> ★ ★ ★ ★ </span></b>

<b><span style="color:#efbee0"> ★ ★ ★ ★</span></b> -->

Focus on <b><span style="color:#FF9671"> version 1 </span></b> in the first instance

### <b><span style="color:#b59eda"> Library Imports </span></b>

In [51]:
# Library imports
import self_made_functions as smf
import matplotlib.pyplot as plt
from   textwrap import wrap 
import seaborn as sns
import pandas as pd
import numpy as np
import jiwer
import os

### <b><span style="color:#e69ed0"> Data initialization  </span></b>

In [65]:
# Load the assessment table (and the path returned with it) from the project helper.
# NOTE(review): the exact meaning of `wv_path` is defined in self_made_functions — confirm there.
df_assessment, wv_path = smf.get_correct_df()

# Make a new directory for the re-saved files (only announced when it is actually created)
results_dir = './Transcriptions/Results'
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
    print(f"Directory {results_dir} created")

# Read the empty transcriptions; they are appended per model when the files are re-saved
empty_transcriptions = pd.read_csv('Transcriptions/empty_transcriptions_v1.csv')

# Transcription directory information
transcription_dir = './Transcriptions'
# Keep only the v1 transcribed files. Logical `and` (not bitwise `&`) is used so the
# second check is short-circuited and operator precedence cannot bite.
lst_csv = [
    file
    for file in os.listdir(transcription_dir)
    if file.startswith('tran') and file.endswith('v1.csv')
]

# Sort the list by model name. The model name is the second-to-last '_'-separated
# piece of the file name; a file whose model is not listed raises ValueError here.
test_for_models = ['tiny', 'nb-whisper-tiny', 'nb-whisper-tiny-verbatim',
                    'base', 'nb-whisper-base', 'nb-whisper-base-verbatim',
                    'medium', 'nb-whisper-medium', 'nb-whisper-medium-verbatim']
lst_csv = sorted(lst_csv, key=lambda x: test_for_models.index(x.split('_')[-2]))

#### <b><span style="color:#e69ed0"> Save missing information </span></b>

The transcription files are modified and re-saved.

The <b><span style="color:#ff9671">empty-transcriptions</span></b> are added to the transcriptions.

The <b><span style="color:#ff9671">Phonetic Error Ratio (PER)</span></b> is calculated. 

An <b><span style="color:#ff9671">ID column</span></b> is added, and all the column names are <b><span style="color:#ff9671">renamed</span></b> and organized.


A column for the <b><span style="color:#ff9671">model name</span></b> is also added. 

In [85]:
def _normalize_transcription(value):
    """Strip, lowercase and remove '.', '!' and '?' from a string; pass other values through."""
    if not isinstance(value, str):
        return value
    text = value.strip().lower()
    for mark in (".", "!", "?"):
        text = text.replace(mark, "")
    return text


def _phonetic_error_rate(pron_scores):
    """Return the share of '0' entries in a space-separated score string."""
    scores = pron_scores.split(' ')
    return scores.count('0') / len(scores)


def resave_transcription_df(bool: bool = True):
    """Clean every v1 transcription CSV and optionally re-save it under ``results_dir``.

    For each file listed in the module-level ``lst_csv`` the function:

    1. reads the CSV and drops the 'CER Output' and 'CER Score' columns;
    2. normalizes the 'Transcribed' text and recomputes the character error
       rate with ``jiwer.cer`` ('Word' as reference, 'Transcribed' as hypothesis);
    3. adds a ``model_name`` column taken from the file path;
    4. renames the columns to snake_case names;
    5. adds ``PER``: the fraction of '0' entries in the matching
       ``df_assessment.pronScores`` string;
    6. appends the rows of ``empty_transcriptions`` for the same model;
    7. adds an ``id`` column (file name up to the first '_'), sorts by it and
       puts the columns in a fixed order.

    Processing stops early (with a printed message) if a file ends up with more
    than one model name.

    Relies on the module-level names ``transcription_dir``, ``lst_csv``,
    ``df_assessment``, ``empty_transcriptions`` and ``results_dir``.

    Args:
        bool: When truthy, each cleaned frame is written to ``results_dir``
            under its original file name. The parameter name shadows the
            builtin but is kept so existing callers keep working. It now
            defaults to True; the original ``bool:True`` was only an
            annotation, which made the argument mandatory.

    Raises:
        KeyError: If a file name has no row in ``df_assessment``.
    """
    lst_path = [os.path.join(transcription_dir, file) for file in lst_csv]

    # Build the file-name -> pronScores lookup once instead of filtering
    # df_assessment for every row of every CSV. drop_duplicates keeps the first
    # occurrence, matching the original `.values[0]` choice.
    pron_scores_by_file = (
        df_assessment.drop_duplicates(subset='File name')
        .set_index('File name')['pronScores']
        .to_dict()
    )

    # Iterate through the CSV files to add what is missing
    for path in lst_path:
        # Read transcribed files
        df_csv = pd.read_csv(path)
        print(f"Reading {path}")
        df_csv = df_csv.drop(['CER Output', "CER Score"], axis=1)  # axis=1 drops columns

        # Normalize the transcription text, then recompute the CER from it
        df_csv['Transcribed'] = df_csv['Transcribed'].apply(_normalize_transcription)
        df_csv['CER (Character Error Rate)'] = df_csv.apply(
            lambda x: jiwer.cer(x['Word'], x['Transcribed']), axis=1
        )

        # Spot-check of one known file before the columns are renamed
        print(df_csv[df_csv['File name']=='a01_sykkel.wav'])

        # Add model name: second-to-last '_'-separated piece of the path
        model_name = path.split('_')[-2]
        df_csv["model_name"] = model_name

        # Change names
        df_csv = df_csv.rename(columns={"File name": "file_name",
                                    "CER (Character Error Rate)": "CER", # Character Error Rate (CER)
                                    "Word": "target_word",
                                    "Transcribed": "transcribed_word",
                                    "OG Score": "global_score"})

        # Phonetic Error Rate (PER) for every row via the prebuilt lookup
        df_csv["PER"] = df_csv["file_name"].map(
            lambda name: _phonetic_error_rate(pron_scores_by_file[name])
        )

        # Append the empty transcriptions that belong to this model
        df_csv = pd.concat([df_csv, empty_transcriptions[empty_transcriptions['model_name']==model_name]], ignore_index=True)
        df_csv = df_csv.reset_index(drop=True)

        # Same spot-check after renaming and appending
        print(df_csv[df_csv['file_name']=='a01_sykkel.wav'])

        # A file must describe exactly one model; stop processing otherwise
        if len(df_csv.model_name.unique()) > 1:
            print(f"Error: {path} has more than one model name")
            break

        # Add ID column
        df_csv["id"] = df_csv["file_name"].apply(lambda x: x.split('_')[0])
        df_csv = df_csv.sort_values(by=['id'])

        # Reorder column names
        reorder_column = ["id", "global_score",
                        "target_word", "PER",
                        "transcribed_word", "CER",
                        "file_name", "model_name"]

        df_csv = df_csv[reorder_column]

        if bool:
            # Save the file under its original name; basename works with any path separator
            csv_name = os.path.basename(path)
            df_csv.to_csv(os.path.join(results_dir, csv_name), index=False)


resave_transcription_df(False)

Reading ./Transcriptions/transcriptions_tiny_v1.csv
          File name    Word Transcribed  CER (Character Error Rate)  OG Score
796  a01_sykkel.wav  sykkel      sikker                    0.333333         4
          file_name target_word transcribed_word       CER  global_score  \
796  a01_sykkel.wav      sykkel           sikker  0.333333             4   

    model_name  PER  
796       tiny  0.2  
Reading ./Transcriptions/transcriptions_nb-whisper-tiny_v1.csv
          File name    Word Transcribed  CER (Character Error Rate)  OG Score
795  a01_sykkel.wav  sykkel      sykkel                         0.0         4
          file_name target_word transcribed_word  CER  global_score  \
795  a01_sykkel.wav      sykkel           sykkel  0.0             4   

          model_name  PER  
795  nb-whisper-tiny  0.2  
Reading ./Transcriptions/transcriptions_nb-whisper-tiny-verbatim_v1.csv
          File name    Word Transcribed  CER (Character Error Rate)  OG Score
795  a01_sykkel.wav  sykkel

### **<span style="color:#FF6F91"> BOX PLOT &nbsp;:&nbsp; </span>**
#### <span style="color:#ebaed8">  Every model compared &nbsp;:&nbsp; CER vs. PER for <b>Native</b>, <b>Non Native</b> and <b>all</b> </span>

Create directory to save the box plots &nbsp; : &nbsp; <span style="color:#ebaed8"> *./Transcriptions/Results/BoxPlots/Model_CER*</span>

In [86]:
# Folder that receives the per-model CER box plots.
save_dir_box_cer = './Transcriptions/Results/BoxPlots/Model_CER'

# Create the nested folder only when it is absent, and report it when that happens.
_box_dir_missing = not os.path.exists(save_dir_box_cer)
if _box_dir_missing:
    os.makedirs(save_dir_box_cer)
    print(f"Directory {save_dir_box_cer} created")

Make a function that works for all, native, and non-native speakers

**Giga Matrix**

In [87]:
# Paths of the re-saved result files, in the model order of lst_csv
lst_path = [os.path.join(results_dir, file) for file in lst_csv]

# Read every result file first and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every
# iteration, and seeding it with an empty DataFrame relies on pandas behaviour
# that newer versions warn about.
frames = []
for path in lst_path:
    print(f"Reading {path}")
    df_csv = pd.read_csv(path)
    frames.append(df_csv)

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
# ignore_index=True already yields a fresh 0..n-1 index.
ggm = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

Reading ./Transcriptions/Results/transcriptions_tiny_v1.csv
Reading ./Transcriptions/Results/transcriptions_nb-whisper-tiny_v1.csv
Reading ./Transcriptions/Results/transcriptions_nb-whisper-tiny-verbatim_v1.csv
Reading ./Transcriptions/Results/transcriptions_base_v1.csv
Reading ./Transcriptions/Results/transcriptions_nb-whisper-base_v1.csv
Reading ./Transcriptions/Results/transcriptions_nb-whisper-base-verbatim_v1.csv
Reading ./Transcriptions/Results/transcriptions_medium_v1.csv
Reading ./Transcriptions/Results/transcriptions_nb-whisper-medium_v1.csv
Reading ./Transcriptions/Results/transcriptions_nb-whisper-medium-verbatim_v1.csv
