In [1]:
!pip install --q git+https://github.com/m-bain/whisperx.git

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.7/208.7 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.7/36.7 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m98.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m89.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m11.8 MB/s[0m eta 

In [4]:
def process_audio_to_openai_training_format(original_list):
    """
    Processes a list of dictionaries into a format suitable for training with OpenAI.
    Each dictionary in the input list is expected to have 'text' and 'speaker' keys.
    The function unifies consecutive dictionaries with the same 'speaker' value,
    merging their 'text' values into a single string, separated by spaces.
    It also transforms 'speaker' into 'role', determining the most frequent speaker
    as 'assistant' and the less frequent as 'user'.

    Parameters:
    original_list (list): A list of dictionaries, each containing 'text' and 'speaker' keys.

    Returns:
    list: A list of unified dictionaries with 'content' and 'role' keys.
    """

    # Filter the list to retain only 'text' and 'speaker' keys from each dictionary
    filtered_list = [
    {
        'text': dic.get('text', ''),
        'speaker': dic.get('speaker', 'user')
    }
    for dic in original_list
    ]


    # Count the frequency of each 'speaker' value
    speaker_frequency = {}
    for dic in filtered_list:
        if dic['speaker'] in speaker_frequency:
            speaker_frequency[dic['speaker']] += 1
        else:
            speaker_frequency[dic['speaker']] = 1

    # Determine the most and least frequent 'speaker'
    most_frequent_speaker = max(speaker_frequency, key=speaker_frequency.get)
    least_frequent_speaker = min(speaker_frequency, key=speaker_frequency.get)

    # Format the list with new keys and values
    formatted_list = [
        {'content': dic['text'], 'role': 'assistant' if dic['speaker'] == most_frequent_speaker else 'user'}
        for dic in filtered_list
    ]

    # List to store the unified result
    unified_list = []

    # Temporary variable to hold the current dictionary while iterating
    temp_dic = None

    # Iterate over each dictionary in the formatted list
    for dic in formatted_list:
        # If there's no temporary dictionary, it's the start of a potential sequence
        if not temp_dic:
            temp_dic = dic
        else:
            # If the 'role' of the current dictionary matches the temporary one,
            # their contents are unified.
            if dic['role'] == temp_dic['role']:
                temp_dic['content'] += " " + dic['content']
            else:
                # If the 'role' is different, add the temporary dictionary to the unified list
                # and start a new temporary dictionary with the current one
                unified_list.append(temp_dic)
                temp_dic = dic

    # Add the last temporary dictionary to the unified list
    if temp_dic:
        unified_list.append(temp_dic)

    # Return the result
    return unified_list

In [5]:
import os
print(os.getenv("HF_TOKEN"))
from google.colab import userdata
userdata.get('HF_TOKEN')

None


'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'

In [5]:
import os
import json
from google.colab import userdata

import whisperx
import gc

# Environment variables
HF_TOKEN = userdata.get('HF_TOKEN')


def  speaker_transcription_and_identify(audio_file):
    """
    This function takes an audio file and returns the transcription of the audio
    and the speaker identification of the audio.

    Parameters:
    audio_file (str): The path to the audio file.

    Returns:
    dict: A dictionary containing the transcription and speaker identification of the audio.
    """
    # Configuration parameters
    device = "cuda"
    batch_size = 3 # reduce if low on GPU mem
    compute_type = "float32" # change to "int8" if low on GPU mem (may reduce accuracy)

    # Whisper procesing
    audio = whisperx.load_audio(audio_file)
    model = whisperx.load_model("large-v2", device, compute_type=compute_type)
    result = model.transcribe(audio, batch_size=batch_size)

    # 2. Align whisper output
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)


    ## Diarization of the text
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=HF_TOKEN,
                                                device=device)

    diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=2)
    result = whisperx.assign_word_speakers(diarize_segments, result)
    # Process the audio to OpenAI training format
    result = process_audio_to_openai_training_format(result["segments"])

    final_format = {"messages": result}
    print(final_format)
    return final_format

def process_directory(audio_directory, output_file):
    """
    Processes all audio files in the given directory and writes the results
    to a JSONL file, one line per audio file.

    Parameters:
    audio_directory (str): The path to the directory containing audio files.
    output_file (str): The path to the JSONL file where results will be saved.
    """
    # Filtrar para obtener solo archivos .wav y .mp3
    audio_files = [f for f in os.listdir(audio_directory)
                   if os.path.isfile(os.path.join(audio_directory, f)) and f.endswith(('.wav', '.mp3'))]

def process_audio_directory(directory_path, output_jsonl_path):
    """
    Process all .mp3 and .wav files in the given directory and append the results
    to a .jsonl file, one entry per audio file.

    Parameters:
    directory_path (str): Path to the directory containing the audio files.
    output_jsonl_path (str): Path to the output .jsonl file where results will be stored.
    """
    # List all files in the directory
    audio_files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f)) and (f.endswith('.mp3') or f.endswith('.wav'))]

    for audio_file in audio_files:
        audio_path = os.path.join(directory_path, audio_file)
        try:
            # Process the audio file
            result_dict = speaker_transcription_and_identify(audio_path)
            # Append the result to the .jsonl file
            with open(output_jsonl_path, 'a') as outfile:
                json.dump(result_dict, outfile)
                outfile.write('\n')  # Add newline to separate entries
            print(f"Processed: {audio_file}")
        except Exception as e:
            print(f"Error processing {audio_file}: {e}")

if __name__ == "__main__":
    directory_path = "/content/error_files"
    output_jsonl_path = "/content/training_dialer.jsonl"
    process_audio_directory(directory_path, output_jsonl_path)


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.98) in first 30s of audio...
{'messages': [{'content': ' Yes, may I speak with the business owner, sir, of Yes Chinese Cuisine?', 'role': 'assistant'}, {'content': 'Yeah.', 'role': 'user'}, {'content': "Are you the owner, sir? Yeah. You're the owner of Chinese Cuisine, right?", 'role': 'assistant'}, {'content': 'Yes.', 'role': 'user'}, {'content': "Yes, my name is Nats. Thank you for taking my call. This would only take less than 40 seconds, okay? So firstly, I'm not here to change any of what you have. I respect what you have in the business, okay? Don't get me wrong.  This is regarding about new guidelines, sir, 

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.98) in first 30s of audio...
{'messages': [{'content': ' Hello? Hello? May I speak? Yes, sir. May I speak with the business owner of 3MJV Management?', 'role': 'assistant'}, {'content': 'Yeah, this is the business owner. I may have to... Oh!', 'role': 'user'}, {'content': "Pleasure speaking to you, sir. And apologies, by the way. I know you're busy. This is just a quick, what, less than a minute call. Okay? So, firstly, sir, my name here is Nat. I'm from Adventist Bay. And definitely, sir, I'm not here to change any of what you have in the business. I respect what you have here, okay? So, what's this?  This is only

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.99) in first 30s of audio...
{'messages': [{'content': ' Linda Blessing, how may I help you?', 'role': 'user'}, {'content': "Yes, ma'am. I speak with the business owner of Abundant Blessings Thrift Store.", 'role': 'assistant'}, {'content': 'Okay. Hold on, please.', 'role': 'user'}, {'content': "Thank you, ma'am.", 'role': 'assistant'}, {'content': 'Hello. Can I help you?', 'role': 'user'}, {'content': "Yes, ma'am. My name is Nat. Are you the business owner, ma'am, of Abundant Blessings Thrift Store?", 'role': 'assistant'}, {'content': 'Yes, I am.', 'role': 'user'}, {'content': "Yes. Pleasure speaking to you, ma'am

In [None]:
import whisperx
import gc

  torchaudio.set_audio_backend("soundfile")


In [None]:
device = "cuda"
batch_size = 4 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

In [None]:
audio_file = "/content/z20240312-233027_8123616141-all.mp3"

In [None]:
audio = whisperx.load_audio(audio_file)

In [None]:
model = whisperx.load_model("large-v2", device, compute_type=compute_type)


vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

No language specified, language will be first be detected for each audio file (increases inference time).


100%|█████████████████████████████████████| 16.9M/16.9M [00:01<00:00, 10.9MiB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.2.1+cu121. Bad things might happen unless you revert torch to 1.x.


In [None]:
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)


Detected language: en (1.00) in first 30s of audio...
[{'text': " Hello? Hello? Yes, who am I speaking with? I'm sorry. It's Dhruvin Patel. Come again? I didn't hear you. You're correct, sir. Mr. Patel? Oh, Patel. Mr. Patel, I believe this number called you. That's why you called us back. Is that correct, sir? Right, right. Okay, so apologies, sir. My name is Nance. I'm from Aventis Bay.", 'start': 0.196, 'end': 25.094}, {'text': " And basically, sir, firstly, I respect what you have, okay? We're not here to change any of what you have, okay, for starters. And this is regarding about new guidelines, because you're a business owner, obviously, sir, what we do is we give information that you guys are no longer required to pay any processing fees in terms of the credit or debit cards, it's called zero processing.", 'start': 25.435, 'end': 45.674}, {'text': " Okay? So that's why we share this information so you guys have an idea that you can save money in terms of the credit and debit card

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth
100%|██████████| 360M/360M [00:01<00:00, 194MB/s]


In [None]:
result

{'segments': [{'start': 0.236,
   'end': 0.496,
   'text': ' Hello?',
   'words': [{'word': 'Hello?',
     'start': 0.236,
     'end': 0.496,
     'score': 0.699}]},
  {'start': 1.817,
   'end': 2.117,
   'text': 'Hello?',
   'words': [{'word': 'Hello?',
     'start': 1.817,
     'end': 2.117,
     'score': 0.744}]},
  {'start': 3.058,
   'end': 4.319,
   'text': 'Yes, who am I speaking with?',
   'words': [{'word': 'Yes,', 'start': 3.058, 'end': 3.258, 'score': 0.818},
    {'word': 'who', 'start': 3.398, 'end': 3.518, 'score': 0.416},
    {'word': 'am', 'start': 3.578, 'end': 3.638, 'score': 0.224},
    {'word': 'I', 'start': 3.679, 'end': 3.779, 'score': 0.433},
    {'word': 'speaking', 'start': 3.799, 'end': 4.139, 'score': 0.866},
    {'word': 'with?', 'start': 4.159, 'end': 4.319, 'score': 0.773}]},
  {'start': 4.359,
   'end': 4.759,
   'text': "I'm sorry.",
   'words': [{'word': "I'm", 'start': 4.359, 'end': 4.459, 'score': 0.475},
    {'word': 'sorry.', 'start': 4.499, 'end': 4

In [None]:
diarize_model = whisperx.DiarizationPipeline(use_auth_token="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",
                                             device=device)

config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

In [None]:
diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=2)

In [None]:
diarize_segments

In [None]:
diarize_segments.speaker.unique()

In [None]:
result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

                               segment label     speaker       start  \
0    [ 00:00:00.195 -->  00:00:00.551]     A  SPEAKER_01    0.195246   
1    [ 00:00:01.740 -->  00:00:02.181]     B  SPEAKER_00    1.740238   
2    [ 00:00:03.030 -->  00:00:04.813]     C  SPEAKER_01    3.030560   
3    [ 00:00:06.052 -->  00:00:07.139]     D  SPEAKER_00    6.052632   
4    [ 00:00:07.376 -->  00:00:07.512]     E  SPEAKER_00    7.376910   
..                                 ...   ...         ...         ...   
225  [ 00:09:55.967 -->  00:10:04.983]    HR  SPEAKER_01  595.967742   
226  [ 00:10:05.967 -->  00:10:07.903]    HS  SPEAKER_00  605.967742   
227  [ 00:10:06.833 -->  00:10:06.850]    HT  SPEAKER_01  606.833616   
228  [ 00:10:06.952 -->  00:10:08.752]    HU  SPEAKER_01  606.952462   
229  [ 00:10:08.480 -->  00:10:08.887]    HV  SPEAKER_00  608.480475   

            end  intersection       union  
0      0.551783   -607.977217  608.594754  
1      2.181664   -606.347336  607.049762  
2  

In [None]:
result

{'segments': [{'start': 0.236,
   'end': 0.496,
   'text': ' Hello?',
   'words': [{'word': 'Hello?',
     'start': 0.236,
     'end': 0.496,
     'score': 0.699,
     'speaker': 'SPEAKER_01'}],
   'speaker': 'SPEAKER_01'},
  {'start': 1.817,
   'end': 2.117,
   'text': 'Hello?',
   'words': [{'word': 'Hello?',
     'start': 1.817,
     'end': 2.117,
     'score': 0.744,
     'speaker': 'SPEAKER_00'}],
   'speaker': 'SPEAKER_00'},
  {'start': 3.058,
   'end': 4.319,
   'text': 'Yes, who am I speaking with?',
   'words': [{'word': 'Yes,',
     'start': 3.058,
     'end': 3.258,
     'score': 0.818,
     'speaker': 'SPEAKER_01'},
    {'word': 'who',
     'start': 3.398,
     'end': 3.518,
     'score': 0.416,
     'speaker': 'SPEAKER_01'},
    {'word': 'am',
     'start': 3.578,
     'end': 3.638,
     'score': 0.224,
     'speaker': 'SPEAKER_01'},
    {'word': 'I',
     'start': 3.679,
     'end': 3.779,
     'score': 0.433,
     'speaker': 'SPEAKER_01'},
    {'word': 'speaking',
     '