# Remove Emty Transcriptions from Trascribed Data Frame

For some reason does some of the emty trascriptions sneek into the trascribed dataframe. This script fixes this.

In [None]:
import pandas as pd 
import os

# PART 1: Load the data -----------------------------------------------------------------
directory = './Transcriptions'
df_file_name = 'transcriptions_nb-whisper-medium-verbatim_v1.csv'
path = os.path.join(directory, df_file_name)

model_name = df_file_name.split('_')[1]
version = df_file_name.split('_')[2].split('.')[0]

df = pd.read_csv(path)

# PART 2: Check for missing values -------------------------------------------------------
nan_df = df[df.isna().any(axis=1)] # find rows that is nan
print('The missing values in the Trascriped file:\n', nan_df, '\n')

# PART 3: Check if the missing values are in the empty transcriptions file ----------------
# Find if the Emty rows in the dataframe is alreaddy added in the emty transcriptions file
mt_file_name = 'empty_transcription_v2.csv'
mt_path = os.path.join(directory, mt_file_name)

mt_df = pd.read_csv(mt_path)

# Check if the mt df conatins the mt transcriptions 
print('Current rows in the emty dataframe:\n', mt_df[mt_df['Model'] == model_name], '\n')

# If they are not in the file, add them to the file

if nan_df['File name'].values not in mt_df['File name'].values:
    print('The missing values are not in the empty transcriptions file\n')
    

    # PART 4: Add the missing values to the empty transcriptions file --------------------------
    # Find the relevant info in the mt datafram
    file_name = nan_df['File name'].values
    OG_word = nan_df['Word'].values
    idx = nan_df.index.values
    model_name = model_name

    # Add the new information to the mt dataframe
    new_mt_df = {
            'File name': row['File name'],
            'OG word': row['Word'],
            'idx' : idx,
            'Model': model_name
        }

    new_mt_df = pd.DataFrame(new_mt_df)
    print('Rows to the new dataframe:\n', new_mt_df, '\n')

    # Utpdate the mt_df to be correct og save it
    finito_mt_df = pd.concat([mt_df, new_mt_df], ignore_index=True)
    print('finito empty df:\n', finito_mt_df[finito_mt_df['Model'] == model_name], '\n')

# PART 5: remove the rows from the original dataframe and save it --------------------------
df = df.dropna()
# df.to_csv(path, index=False)

# PART 6: Save the updated empty transcriptions file ---------------------------------------
# finito_mt_df.to_csv(mt_path, index=False)

In [82]:
import os
import pandas as pd

# PART 1: Define directories and base filenames
mt_dir = './Transcriptions/Empty_Transcriptions'
df_dir = './Transcriptions'
df_base = 'transcriptions_'

# PART 2: Iterate through empty transcription files
for file in os.listdir(mt_dir):
    if file.startswith('empty') and file.endswith('.csv'):
        file_version = file.split('_')[2].split('.')[0]
        mt_path = os.path.join(mt_dir, file)
        mt_df = pd.read_csv(mt_path)
        
        models = mt_df['Model'].unique()
        
        for model in models:
            file_name = df_base + model + '_' + file_version + '.csv'
            df_path = os.path.join(df_dir, file_name)
            df = pd.read_csv(df_path)
            
            # PART 3: Check for missing values in the transcription file
            nan_df = df[df.isna().any(axis=1)]
            if not nan_df.empty:
                print(f'Model: {model}, Version: {file_version}')
                # print('The missing values in the Transcribed file:\n', nan_df, '\n')
                
                # Check if the missing values are already in the empty transcriptions file
                if not nan_df['File name'].isin(mt_df['File name']).any():
                    print('The missing values are not in the empty transcriptions file\n')
                    
                    # PART 4: Add the missing values to the empty transcriptions file
                    rows = []
                    for idx, row in nan_df.iterrows():
                        new_row = {
                            'File name': row['File name'],
                            'OG word': row['Word'],
                            'idx': idx,
                            'Model': model
                        }
                        rows.append(new_row)
                    
                    new_mt_df = pd.DataFrame(rows)
                    # print('Rows to be added to the new dataframe:\n', new_mt_df, '\n')
                    
                    # Update the mt_df and save it
                    finito_mt_df = pd.concat([mt_df, new_mt_df], ignore_index=True)
                    finito_mt_df.to_csv(mt_path, index=False)
                    # print('Updated empty df:\n', finito_mt_df[finito_mt_df['Model'] == model], '\n')
                    
                    # PART 5: Remove the rows with missing values from the original dataframe
                    df = df.dropna()
                    df.to_csv(df_path, index=False)
                    # print(f'Updated original df saved to {df_path}\n')
            else: 
                print('The missing values are already in the empty transcriptions file\n')

print('Process completed successfully.')

# PART 6: Save the updated empty transcriptions file
# (Already saved within the loop above)


The missing values are already in the empty transcriptions file

The missing values are already in the empty transcriptions file

The missing values are already in the empty transcriptions file

The missing values are already in the empty transcriptions file

The missing values are already in the empty transcriptions file

The missing values are already in the empty transcriptions file

The missing values are already in the empty transcriptions file

Process completed successfully.
