# <b><span style="color:#ADF88B">Fix Concatenation Results</span></b>

Script to fix the concatenated result data frame. I recreate exactly how the audio files are concatenated and extract the target words, target PER, target WER and audio name for each concatenation.

In [6]:
import sys      
script_directory = '../'
sys.path.append(script_directory)
import self_made_functions as smf
import pandas as pd
import jiwer
import os

In [8]:
# Load the assessment table and derive a speaker id from each file name:
# the part of 'File name' before the first underscore.
df_assessment, _ = smf.get_correct_df()
df_assessment['id'] = df_assessment['File name'].map(
    lambda file_name: file_name.partition('_')[0]
)

# Test 1 : Grouping by score. Rows are first ordered by speaker id
# (descending), re-indexed, and then grouped by score.
df_test_1_name = 'scores_id_sorted'
df_test_1 = (
    df_assessment
    .sort_values(by=['id'], ascending=False)
    .reset_index(drop=True)
    .groupby('Score')
)

# Test 2 : Grouping by score. Rows are shuffled first, so speakers are mixed.
df_test_2_name = 'scores_id_mixed'
df_test_2 = df_assessment.sample(frac=1).reset_index(drop=True).groupby('Score')

# Test 3 : Grouping by speaker. No sorting.
df_test_3_name = 'group_id_sorted_score'
df_test_3 = df_assessment.groupby('id')

# Test 4 : No group. Everything in random order.
df_test_4_name = 'no_group_mixed'
df_test_4 = df_assessment.sample(frac=1).reset_index(drop=True)

# --------- Store Information ------------ #
# Empty frame that the four test loops append their per-concatenation rows to.
halo_df = pd.DataFrame(
    columns=['PER', 'WER', 'input_words_list', 'audio_name', 'test_name', 'test']
)

In [9]:
# Test 1 : Grouping by score. Sorted by speaker.
# For every score group, call concat_audio on windows of 10 rows and collect
# one result row per window. The rows are appended to halo_df once, after the
# loop, instead of re-copying the growing frame on every iteration.
test_1_frames = []
for _, score_group in df_test_1:
    # NOTE(review): the stop value is len(group) // 10 while the step is 10,
    # so only the leading windows of each group are visited. Kept unchanged;
    # confirm this matches the original concatenation run.
    for s in range(0, len(score_group) // 10, 10):
        e = s + 10  # end index of the current window
        target_per, target_wer, input_lst, audio_name = concat_audio(score_group, s, e)
        new_df = pd.DataFrame({
            'PER': target_per,
            'WER': target_wer,
            'input_words_list': [input_lst],  # wrapped so the list fills one cell
            'audio_name': audio_name,
            'test_name': df_test_1_name,
            'test': 1,
        })
        test_1_frames.append(new_df)

halo_df = pd.concat([halo_df, *test_1_frames], ignore_index=True)

  halo_df = pd.concat([halo_df, new_df], ignore_index=True)


In [10]:
# Test 2 : Grouping by score. Random speakers.
# For every score group, call concat_audio on windows of 10 rows and collect
# one result row per window. The rows are appended to halo_df once, after the
# loop, instead of re-copying the growing frame on every iteration.
test_2_frames = []
for _, score_group in df_test_2:
    # NOTE(review): the stop value is len(group) // 10 while the step is 10,
    # so only the leading windows of each group are visited. Kept unchanged;
    # confirm this matches the original concatenation run.
    for s in range(0, len(score_group) // 10, 10):
        e = s + 10  # end index of the current window
        target_per, target_wer, input_lst, audio_name = concat_audio(score_group, s, e)
        new_df = pd.DataFrame({
            'PER': target_per,
            'WER': target_wer,
            'input_words_list': [input_lst],  # wrapped so the list fills one cell
            'audio_name': audio_name,
            'test_name': df_test_2_name,
            'test': 2,
        })
        test_2_frames.append(new_df)

halo_df = pd.concat([halo_df, *test_2_frames], ignore_index=True)

In [11]:
# Test 3 : Grouping by speaker. No sorting.
# For every speaker group, call concat_audio on windows of 10 rows of THAT
# speaker's rows. The previous version passed the stale `score_group` left over
# from an earlier cell's loop, so the data did not belong to the speaker whose
# id was used in the name prefix.
test_3_frames = []
for _, person in df_test_3:
    # All rows in the group share the same id; read it once per speaker.
    speaker_id = person.iloc[0]['id']
    # NOTE(review): the stop value is len(group) // 10 while the step is 10,
    # so only the leading windows of each group are visited. Kept unchanged;
    # confirm this matches the original concatenation run.
    for s in range(0, len(person) // 10, 10):
        e = s + 10  # end index of the current window
        target_per, target_wer, input_lst, audio_name = concat_audio(
            person, s, e, name=f'{speaker_id}_'
        )
        new_df = pd.DataFrame({
            'PER': target_per,
            'WER': target_wer,
            'input_words_list': [input_lst],  # wrapped so the list fills one cell
            'audio_name': audio_name,
            'test_name': df_test_3_name,
            'test': 3,
        })
        test_3_frames.append(new_df)

# Single append after the loop instead of one concat per iteration.
halo_df = pd.concat([halo_df, *test_3_frames], ignore_index=True)

In [12]:
# Test 4 : No group. Everything in random order.
# Call concat_audio on windows of 10 rows of the shuffled frame and collect one
# result row per window. The rows are appended to halo_df once, after the loop,
# instead of re-copying the growing frame on every iteration.
test_4_frames = []
# NOTE(review): the stop value is len(frame) // 10 while the step is 10, so
# only the leading windows are visited. Kept unchanged; confirm this matches
# the original concatenation run.
for s in range(0, len(df_test_4) // 10, 10):
    e = s + 10  # end index of the current window
    target_per, target_wer, input_lst, audio_name = concat_audio(df_test_4, s, e)
    new_df = pd.DataFrame({
        'PER': target_per,
        'WER': target_wer,
        'input_words_list': [input_lst],  # wrapped so the list fills one cell
        'audio_name': audio_name,
        'test_name': df_test_4_name,
        'test': 4,
    })
    test_4_frames.append(new_df)

halo_df = pd.concat([halo_df, *test_4_frames], ignore_index=True)

## Modify the current information CSV files

1) ~~Load the transcribed CSV~~
2) ~~Check that all the audio_names and input_words_list match~~
3) ~~Add the correct information to the CSV~~
4) Strip the transcriptions properly
5) ~~Fix update CER and WER~~
6) ~~Create a new file with the correct information~~
7) Check that no transcriptions have NaN values

In [58]:
file_lst = os.listdir('../3x10_Concatenation_information')

for file in file_lst:
    # print(f'Working on file: {file}')
    # ----- Step 1 : Load the data ----- #
    df = pd.read_csv(f'../3x10_Concatenation_information/{file}')
    get_test = file.split('_information_')[1].split('_v1')[0]
    df = df.rename(columns={'target_CER': 'PER', 'target_WER': 'WER'}) # Renaming columns
    
    # Get the split data frame from halo_df
    same_df_split = halo_df[(halo_df['test_name'] == get_test) & 
                            (halo_df['audio_name'].isin(df['audio_name']))]
    
    # Rename columns in same_df_split to avoid conflicts
    same_df_split = same_df_split.rename(columns={'PER': 'PER_temp', 'WER': 'WER_temp'})
        
    # Verify there are no conflicting column names before merging
    if 'WER' in same_df_split.columns or 'PER' in same_df_split.columns:
        print(f'Error: Conflicting column names found in same_df_split for file {file}')
        break
    
    # ----- Step 2 & 3 : Merge the data ----- #
    # same_df_split = halo_df[(halo_df['test_name'] == get_test) & 
    #                     (halo_df['audio_name'].isin(df['audio_name']))]
    if not len(same_df_split) == len(df):
        print(f'Error: Length of split data frames does not match for file {file}.\n{len(same_df_split)} vs {len(df)}')
    
    # Merging df with same_df_split on the 'audio_name' column
    merged_df = df.merge(same_df_split[['audio_name', 'PER_temp', 'WER_temp']], on='audio_name', how='left')

    # Updating the 'PER' column in df with the values from the merged DataFrame
    df['PER'] = merged_df['PER_temp']
    df['WER'] = merged_df['WER_temp']
    
    # Check for nans to ensure smooth merge
    if df['PER'].isnull().values.any():
        print(f'Error Critical: NaN values in the PER column for file {file}.')
                
    if df['WER'].isnull().values.any():
        print(f'Error Critical: NaN values in the WER column for file {file}.')
        
    # ----- Step 4 : Strip the transcriptions ----- #
    df['translated_string'] = df['translated_string'].apply(lambda x: x.strip().lower().replace(".", "").replace(",", "").replace("!", "").replace("?", "") 
        if isinstance(x, str) else x)
    
    # ----- Step 5 : Calculate the CER & WER ----- #
    nan = df['translated_string'].isna().sum() # Check for NaN values   
    if nan > 0:
        print(f'Error: {nan} values in the translated_string column for file {file}.')  

    # Calculate WER, replacing with 1.0 if NaN
    df['translated_WER'] = df.apply(lambda row: 1.0 if pd.isna(row['translated_string']) 
                                    else jiwer.wer(row['input_string'], row['translated_string']), axis=1)

    # Calculate CER, replacing with 1.0 if NaN
    df['translated_CER'] = df.apply(lambda row: 1.0 if pd.isna(row['translated_string']) 
                                    else jiwer.cer(row['input_string'], row['translated_string']), axis=1)
    
    # ----- Step 6 : Remove and rearrange columns ----- #
    df = df.drop(columns=['trans_CER', 'trans_WER', 'target_CER_sum'])

    # Check if speaker_id is in the columns
    if 'speaker_id' in df.columns:
        new_arrangement = ['speaker_id', 'score',
                        'input_string', 'PER', 'WER',
                        'translated_string','translated_CER', 'translated_WER',
                        'length_deviation_words',	'audio_name',	'audio_path']
        df = df[new_arrangement]
    else:
        new_arrangement = ['score',
                        'input_string', 'PER', 'WER',
                        'translated_string','translated_CER', 'translated_WER',
                        'length_deviation_words', "unique_id's", "speaker_id's",
                        'audio_name',	'audio_path']
        df = df[new_arrangement]
    
    # --- Extract and modify the model name for plotting --- # 
    model_name = file.split('__')[0] 
    if model_name.startswith('nb'):
        mn_split = model_name.split('-whisper')
        if not len(mn_split) == 2:
            print(f'Error: model name split failed. Len: {len(mn_split)}. Split: {mn_split}')
        else:
            model_name = 'NNL' + mn_split[1]
    else:
        model_name = model_name.capitalize()
        
    df['model_name'] = model_name
    
    # ----- Step 7 : Save the data ----- #
    dir = '../Concat_results/corrected_data'
    if not os.path.exists(dir):
        os.makedirs(dir)

    # # Example condition on a DataFrame df
    # condition = (df['length_deviation_words'] != 0) & (df['translated_WER'] == 1.0)

    # # Applying the condition to filter rows
    # filtered_df = df[condition]

    # # Check if any rows meet the condition
    # if filtered_df.shape[0] > 0:
    #     # Iterate over the rows if needed
    #     for index, row in filtered_df.iterrows():
    #         print(f"Error: Length deviation is not 0 but WER is 1.0 for file.")
    #         print(row['length_deviation_words'])
    # else:
    #     print("No errors found.")

    # # Example to handle Series truth value ambiguity
    # import numpy as np
    # if np.any(condition):
    #     print("At least one row meets the condition.")
    # else:
    #     print("No rows meet the condition.")
        
    df.to_csv(f'{dir}/{file}', index=False)

Error: 3 values in the translated_string column for file nb-whisper-base-verbatim__concatenated_audio_information_group_id_sorted_score_v1.csv.
Error: 1 values in the translated_string column for file nb-whisper-base-verbatim__concatenated_audio_information_no_group_mixed_v1.csv.
Error: 2 values in the translated_string column for file nb-whisper-base-verbatim__concatenated_audio_information_scores_id_sorted_v1.csv.
Error: 1 values in the translated_string column for file nb-whisper-medium__concatenated_audio_information_scores_id_mixed_v1.csv.
Error: 1 values in the translated_string column for file nb-whisper-medium-verbatim__concatenated_audio_information_scores_id_mixed_v1.csv.
