# <b><span style="color:#ADF88B">10 x 3 sec audio files to 30 sec audio file </span> </b>

This script concatenates 10x3sec audio files into one 30sec audio file. 
The script uses the <span style="color:#ADF88B"><b> pydub library</b> </span> to manipulate the audio files. 


## <span style="color:#58F2B3"> <b>Method: </b> Concatenated Clips  </span>

An ID column containing the assessment information was added to the data frame. 


It was then sorted and grouped by the ID column. The audio clips in each group were then concatenated to equally 10x3 audio clips. 


All the concatenated audio clips were saved to new directories for easy access.



Each audio clip is 3 seconds long, while Whisper expects audio clips of 30 seconds or longer. Therefore, Whisper's responses were tested using 30-second audio clips, achieved by concatenating ten 3-second audio clips. Each clip does not correlate, but it might reveal valuable results for Whisper's performance.

The Verbatim Tiny and Medium models from the Norwegian National Library (NNL) were used to compare the performance differences between the two, which had a high difference in features. 

Multiple tests were conducted to evaluate Whisper's response in this scenario:

1. <span style="color:#00E6DB"> Grouping by score: Each score was grouped before transcription. Sorted by speaker (?) .</span>
2. <span style="color:#00E6DB"> Grouping by person: Each person's independent words were grouped.</span>
3. <span style="color:#00E6DB"> Random shuffling of data before transcription.</span>


The results were analyzed using Word Error Rate (WER) and Character Error Rate (CER) scores to assess the performance of the models.




It showed promising results from one iteration, but after that, nothing was correct.


<span style="color:#00C0FF">#00C0FF </span>
<span style="color:#00D5F7">#00D5F7 </span>
<span style="color:#00E6DB">#00E6DB </span>

<span style="color:#58F2B3">#58F2B3 </span>
<span style="color:#ADF88B">#ADF88B </span>
<span style="color:#F9F871">#F9F871 </span>

In [2]:
# Libary imports
from prettytable import PrettyTable
import self_made_functions as smf
from transformers import pipeline
import matplotlib.pyplot as plt
from pydub import AudioSegment # Good for audio manipulation
import pandas as pd 
import numpy as np
import whisper
import os


2024-06-29 14:28:01.855018: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Concatination information functions

Used to save, and exstract wanted data

In [23]:
# --------- Part 1 - Generate the audio files ----------- #
def concat_audio(df:pd.DataFrame, save_directory:str, s:int=0, e:int=10, wv_path:str = '/talebase/data/speech_raw/teflon_no/speech16khz/', name = ''):
    concat_rows = df.iloc[s:e]  # Rows to concatenate
    input_string = concat_rows['Word'].values # get input string
    
    combined_audio = AudioSegment.empty() # Empty audio file to concatenate the audio clips
    id_list = [] # List of the speaker IDs
    
    for row in concat_rows.itertuples():
        id_list.append(row[1].split('_')[0])
        audio = AudioSegment.from_file(os.path.join(wv_path, row[1]))
        combined_audio += audio
    
    audio_name = f'{name}words_{s}_{e}.wav'
    audio_directory = os.path.join(save_directory, audio_name)
    
    # Save the combined audio
    combined_audio.export(audio_directory, format="wav")    
    return audio_name, id_list, input_string


# Create dir if not exist
def new_dir(name): 
    if not os.path.exists(name):
        os.makedirs(name)
        print(f"Directory {name} created")
    else:
        print(f"Directory {name} already exists")

# -------------- Part 2 - Look at the result -------------- #
def get_transcribed_words(string_words:str):
    # Return a list with lower case words and no dots
    split_string = string_words.split(' ')
    remove_empty = [word for word in split_string if word != '']
    lower = [word.lower() for word in remove_empty] 
    no_dot = [word.replace('.', '') for word in lower]
    return no_dot 

In [7]:
df_fin, wv_path = smf.get_correct_df()
# Speaker ID column added
df_trail =  df_fin
df_trail['id'] = df_fin['File name'].apply(lambda x: x.split('_')[0])

### <span style="color:#00C0FF"> <b>1. Grouping by score:</b> Each score was grouped before transcription. Sorted by speaker</span>

In [None]:
# Group by score, sort by speaker ID
df_trail_1 = df_trail.sort_values(by=['id'], ascending=False) # Sorted by ID
df_trail_1 = df_trail_1.reset_index(drop=True)
df_score_groups = df_trail_1.groupby('Score')# Grouped by score

concatenated_audio_information = pd.DataFrame(columns=[
    "audio_path", "audio_name", "unique_id's", "input_string", "speaker_id's"])

for idx, score_group in df_score_groups:
# -------------- Make or Find directory -------------- #
    score_directory = f'3x10_Concatenations/3x10_score_{idx}_sorted'
    new_dir(score_directory)
    
# --------- Concatenate and save audio files --------- #
    for i in range(0, len(score_group)//10,10):
        # Concatenate the 10 audio files together
        s, e = i, i + 10 # get start and end index
        audio_name, ids, input_words = concat_audio(score_group, score_directory, s, e)

# --------- Save audio file info in CSV --------- #
        unique_ids = list(set(ids))
        unique_input_words = np.unique(input_words)
        
        # add to dataframe
        new_row = {'audio_path': [score_directory],
                'audio_name': [audio_name],
                'unique_id\'s': [unique_ids],
                'input_string': [input_words],
                'speaker_id\'s': [ids]
                }
        
        new_row = pd.DataFrame(new_row, index=[0])
        concatenated_audio_information = pd.concat([concatenated_audio_information, new_row], ignore_index=True)

# ----- Save the concatenated audio information ----- #
concatenated_audio_information.to_csv(f'3x10_Concatenations/concatenated_audio_information_scores_id_sorted.csv', index=False)

### <span style="color:#00C0FF"> <b>1.1 Grouping by score:</b> Each score was grouped before transcription. Randomized speakers </span>

In [15]:
# Group by score, randomize id
df_trail_11 = df_trail.sample(frac=1).reset_index(drop=True)
df_score_groups = df_trail_11.groupby('Score')# Grouped by score

concatenated_audio_information = pd.DataFrame(columns=[
    "audio_path", "audio_name", "unique_id's", "input_string", "speaker_id's"])

for idx, score_group in df_score_groups:
# -------------- Make or Find directory -------------- #
    score_directory = f'3x10_Concatenations/3x10_score_{idx}_random'
    new_dir(score_directory)
    
# --------- Concatenate and save audio files --------- #
    for i in range(0, len(score_group)//10,10):
        # Concatenate the 10 audio files together
        s, e = i, i + 10 # get start and end index
        audio_name, ids, input_words = concat_audio(score_group, score_directory, s, e)

# --------- Save audio file info in CSV --------- #
        unique_ids = list(set(ids))
        unique_input_words = np.unique(input_words)
        
        # add to dataframe
        new_row = {'audio_path': [score_directory],
                'audio_name': [audio_name],
                'unique_id\'s': [unique_ids],
                'input_string': [input_words],
                'speaker_id\'s': [ids]
                }
        
        new_row = pd.DataFrame(new_row, index=[0])
        concatenated_audio_information = pd.concat([concatenated_audio_information, new_row], ignore_index=True)

# ----- Save the concatenated audio information ----- #
concatenated_audio_information.to_csv(f'3x10_Concatenations/concatenated_audio_information_scores_id_random.csv', index=False)

Directory 3x10_Concatenations/3x10_score_1_random created
Directory 3x10_Concatenations/3x10_score_2_random created
Directory 3x10_Concatenations/3x10_score_3_random created
Directory 3x10_Concatenations/3x10_score_4_random created
Directory 3x10_Concatenations/3x10_score_5_random created


### <span style="color:#00D5F7"><b> 2. Grouping by person:</b> Each person's independent words were grouped, and sorted by score</span>


In [24]:
# Group by file name, sort by speaker ID
df_name_groups = df_trail.groupby('id')  # Grouped by file name

# -------------- Make or Find directory -------------- #
file_directory = f'3x10_Concatenations/3x10_id_sorted'
new_dir(file_directory)

# --------- Concatenate and save audio files --------- #
concatenated_audio_information = pd.DataFrame(columns=["audio_name", "speaker_id", "input_string" "scores"])

for idx, person in df_name_groups:
        # person = person.sort_values(by=['Score'], ascending=False)  # Sorted by ID
        for i in range(0, len(person)//10, 10):
                # Concatenate the 10 audio files together
                start, end = i, i + 10  # get start and end index
                speaker_id = ids[0] # The speacker id is the same in the groups
                audio_name, ids, input_words = concat_audio(person, file_directory, start, end, name=f'{speaker_id}_')
                
                # Create audio name with speaker ID prefix
                scor_list = person.iloc[start:end]['Score'].values

                # Add to DataFrame
                new_row = {
                        'audio_name': [audio_name],
                        'speaker_id': [speaker_id],
                        'input_string': [input_words],
                        'scores': [scor_list]
                        }
                new_row = pd.DataFrame(new_row, index=[0])
                concatenated_audio_information = pd.concat([concatenated_audio_information, new_row], ignore_index=True)

# ----- Save the concatenated audio information to CSV ----- #
concatenated_info_csv = os.path.join(file_directory, 'concatenated_audio_information.csv')
concatenated_audio_information.to_csv(concatenated_info_csv, index=False)

Directory 3x10_Concatenations/3x10_id_sorted already exists


### <span style="color:#00E6DB"><b> 3. Random shuffling of data before transcription.</span></b >

Persons not sorted, and results not grouped by score.

In [26]:
# Group by file name, sort by speaker ID
df_no_group = df_trail.sample(frac=1).reset_index(drop=True)

# -------------- Make or Find directory -------------- #
file_directory = f'3x10_Concatenations/3x10_no_group_mixed'
new_dir(file_directory)

# --------- Concatenate and save audio files --------- #
concatenated_audio_information = pd.DataFrame(columns=["audio_name", "speaker_id", "input_string" "scores"])

# person = person.sort_values(by=['Score'], ascending=False)  # Sorted by ID
for i in range(0, len(df_no_group)//10, 10):
        # Concatenate the 10 audio files together
        start, end = i, i + 10  # get start and end index
        audio_name, ids, input_words = concat_audio(df_no_group, file_directory, start, end)
        
        # Create audio name with speaker ID prefix
        scor_list = df_no_group.iloc[start:end]['Score'].values

        # Add to DataFrame
        new_row = {
                'audio_name': [audio_name],
                'speaker_id': [ids],
                'input_string': [input_words],
                'scores': [scor_list]
                }
        new_row = pd.DataFrame(new_row, index=[0])
        concatenated_audio_information = pd.concat([concatenated_audio_information, new_row], ignore_index=True)

# ----- Save the concatenated audio information to CSV ----- #
concatenated_info_csv = os.path.join(file_directory, 'concatenated_audio_information.csv')
concatenated_audio_information.to_csv(concatenated_info_csv, index=False)

Directory 3x10_Concatenations/3x10_no_group_mixed already exists


## <span style="color:#58F2B3"> <b>Results:</b>  for the different tests </span>

## <span style="color:#F9F871"> Old Code</span>

Only tried for global score 5, but for multiple models

In [None]:
s, e = 20, 30
new_dir = './10x3sec_audio_files'
new_name = '10x3sec_5_rand_word.wav'
dire = os.path.join(new_dir, new_name)

load_audio = whisper.load_audio(dire)


# Only test for the ones with score 5
df = df_fin.sort_values(by='Score', ascending=False)
audio, ids = concat_audio(df, s, e)
save_audio(audio, new_name, new_dir)

print(ids)
print(df.iloc[s:e])

In [None]:
import self_made_functions as smf
# Tested this for only 5 score files to see if it could work. 
# If it does not work for the score 5 files, it is unlikely it will work for the lower score files

over_view = PrettyTable()
over_view.field_names = ['Model','Input length', 'Translated length', 'Word Match', 
                        'Unique input words', 'Unique translated words',
                        'Input words', 'Translated words']

test_for_models = ['tiny', 'nb-whisper-tiny', 'nb-whisper-tiny-verbatim',
                    'base', 'nb-whisper-base', 'nb-whisper-base-verbatim',
                    'medium', 'nb-whisper-medium', 'nb-whisper-medium-verbatim']


for model_name in test_for_models:
    model_path = smf.get_whisper_path(model_name)
    model = pipeline("automatic-speech-recognition", model_path)
    words = model(load_audio, generate_kwargs={'task': 'transcribe', 'language': 'no'}) 
    
    trans_words = get_transcribed_words(words['text'])
    unique_words = [unique_word for word, unique_word in enumerate(trans_words) if word == trans_words.index(unique_word)]

    input_string = get_input_string(df, s , e)
    unique_input_words = df['Word'].iloc[s:e].unique()
    
    # print(f'Length of translated words : {len(trans_words)}\n{trans_words}\n')
    # print(f'Length of input words : {len(input_string)}\n{input_string}\n')

    # print(f'Unique translated words : {unique_words}\n')
    # print(f'Unique input words : {unique_input_words}\n')
    if len(input_string)  == len(trans_words):
        # Compare elements and create a new list with 1s and 0s # Phind.com
        result = [int(a == b) for a, b in zip(input_string, trans_words)] 
        # The zip function stops creating pairs as soon as one of the input iterables is exhausted,
        
        over_view.add_row([model_name, len(input_string), len(trans_words), result, unique_input_words, unique_words, input_string, trans_words])
    else: 
        over_view.add_row([model_name, len(input_string), len(trans_words), '', unique_input_words, unique_words, input_string, trans_words])

print(over_view)   

In [None]:
# Save the table as a CSV file
with open('./10x3sec_audio_files/3x10_word_5_random_20_30result_v2.csv', 'w', newline='') as f_output:
    f_output.write(over_view.get_string())

The first test show good results for one example for the base,  nb-whisper-medium and  nb-whisper-medium-verbatim.

Considering the  nb-whisper-medium-verbatim showed promising results for the other metrics I will try concatenating the rest of teh 5 results for this model.

Then calculate the success rate, and deciding if the rest of the scores should be found from this.


# Continuing only with the nb-whisper-medium-verbatim model

In [None]:
df_5 = df_fin[df_fin['Score'] == 5] # new df with only score 5
len(df_5)//10 # number of new audio files to make

new_dir = './10x3sec_audio_files_5_rand'
model_name = 'nb-whisper-medium-verbatim'

# Store the information in a data frame
over_view_df = pd.DataFrame(columns=['Input length', 'Translated length', 'Word Match',
                                    'Unique input words', 'Unique translated words',
                                    'Input words', 'Translated words', 'ID'])

# Load Model
model_path = smf.get_whisper_path(model_name)
model = pipeline("automatic-speech-recognition", model_path)

for i in range(0, len(df_5)//10-10,10):
    # Concatenate the 10 audio files together
    s, e = i, i+10 # get start and end index
    audio, ids = concat_audio(df_5, s, e)
    
    new_name = f'words_{s}_{e}.wav'
    dire = os.path.join(new_dir, new_name)
    
    # Save the audio and upload using whisper
    save_audio(audio, new_name, new_dir)
    load_audio = whisper.load_audio(dire)

    # Transcribe audio file
    words = model(load_audio, generate_kwargs={'task': 'transcribe', 'language': 'no'})
    
    # Get the words from the transcription adn original data frame
    trans_words = get_transcribed_words(words['text'])
    unique_words = [unique_word for word, unique_word in enumerate(trans_words) 
                    if word == trans_words.index(unique_word)]

    input_string = get_input_string(df_5, s , e)
    unique_input_words = df_5['Word'].iloc[s:e].unique()
    
    if len(input_string)  == len(trans_words):
        # Compare elements and create a new list with 1s and 0s # Phind.com
        result = [int(a == b) for a, b in zip(input_string, trans_words)] 
    else: 
      result = ''
    
    new_df_row = pd.DataFrame(columns=over_view_df.columns)
    new_df_row.loc[0] = [len(input_string), len(trans_words), result, unique_input_words, 
                        unique_words, input_string, trans_words, ids]
    over_view_df = pd.concat([over_view_df, new_df_row], ignore_index=True)

# save the data frame as a csv in the same directory
file_name, version = smf.get_new_csv_name(new_dir, model_name)
over_view_df.to_csv(file_name)

Her ser vi at den kan i noen tilfeller transkribere rikgit antall ord, og riktig transcribering uten om noen.

Men i fleteparten av tilfellene er det feil antall ord, og feil transcribering.

NB! clip 22 -> 210_220 har en lengre setting 

Clip 20 med 23 ulike transcriberinger har ikke noen ekstra ord.

whisper base vireker helt vilt dårlig, så ser at nb sine modeller er bedre her i alle fall

In [None]:
# test_for_models = ['tiny', 'nb-whisper-tiny', 'nb-whisper-tiny-verbatim',
#                     'base', 'nb-whisper-base', 'nb-whisper-base-verbatim',
#                     'medium', 'nb-whisper-medium', 'nb-whisper-medium-verbatim']

new_dir = './10x3sec_audio_files_5_rand'
model_name = 'base'


list_dir = os.listdir(new_dir)
wav_files = [file for file in list_dir if file.endswith('.wav')]

over_view_df = pd.DataFrame(columns=['Input length', 'Translated length', 'Word Match',
                                    'Unique input words', 'Unique translated words',
                                    'Input words', 'Translated words', 'ID'])

# Load Model
model_path = smf.get_whisper_path(model_name)
model = pipeline("automatic-speech-recognition", model_path)

for file in wav_files:
    load_audio = whisper.load_audio(os.path.join(new_dir, file))
    words = model(load_audio, generate_kwargs={'task': 'transcribe', 'language': 'no'})
    s = int(file.split('_')[1])
    e = int(file.split('_')[2].split('.')[0])
    
    # Get the words from the transcription adn original data frame
    trans_words = get_transcribed_words(words['text'])
    unique_words = [unique_word for word, unique_word in enumerate(trans_words) 
                    if word == trans_words.index(unique_word)]

    input_string = get_input_string(df_5, s , e)
    unique_input_words = df_5['Word'].iloc[s:e].unique()
    
    if len(input_string)  == len(trans_words):
        # Compare elements and create a new list with 1s and 0s # Phind.com
        result = [int(a == b) for a, b in zip(input_string, trans_words)] 
    else: 
        result = ''
    
    new_df_row = pd.DataFrame(columns=over_view_df.columns)
    new_df_row.loc[0] = [len(input_string), len(trans_words), result, unique_input_words, 
                        unique_words, input_string, trans_words, '']
    over_view_df = pd.concat([over_view_df, new_df_row], ignore_index=True)

cvs_file_name, version = smf.get_new_csv_name(new_dir, f'{model_name}')
over_view_df.to_csv(cvs_file_name)

In [14]:

csv_df = pd.read_csv('/home/ajtruyen/language_master/child_d09_medium_verbatim.csv')

combined_audio = AudioSegment.empty()

for row in csv_df.itertuples():
    audio_path = os.path.join(wv_path, row[1])
    print(audio_path)
    audio = AudioSegment.from_file(audio_path)
    combined_audio += audio
               
combined_audio.export('./child_d09_MT', format="wav")



/talebase/data/speech_raw/teflon_no/speech16khz/d09_glorie.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_loepe.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_krykke.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_skjorte.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_oere.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_oedelagt.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_bryter.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_kvart.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_internett.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_klo.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_krakk.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_brun.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_fjorten.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_port.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_skjerf.wav
/talebase/data/speech_raw/teflon_no/speech16khz/d09_trapp.wa

<_io.BufferedRandom name='./child_d09_MT'>

In [15]:
import os
import pandas as pd
from pydub import AudioSegment

# Load CSV file into DataFrame
csv_df = pd.read_csv('/home/ajtruyen/language_master/child_d09_medium_verbatim.csv')

# Initialize an empty AudioSegment for combining audio
combined_audio = AudioSegment.empty()

# Path to the directory containing the audio files
wv_path = '/path/to/audio/files'  # Update this path to the actual location

# Iterate over each row in the DataFrame
for row in csv_df.itertuples(index=False):
    audio_path = os.path.join(wv_path, row[1])
    print(audio_path)  # Print the path of the audio file being processed
    audio = AudioSegment.from_file(audio_path)
    combined_audio += audio  # Concatenate the audio

# Export the combined audio as a WAV file
combined_audio.export('./child_d09_MT.wav', format="wav")

print("Combined audio file has been created successfully!")


/path/to/audio/files/glorie


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


FileNotFoundError: [Errno 2] No such file or directory: '/path/to/audio/files/glorie'