### Install necessary libraries

In [1]:
!pip install pydub pyannote.audio openai-whisper



### Import the libraries

In [2]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
from pyannote.audio import Pipeline
import whisper
import tempfile
import os
import json
from tqdm import tqdm
import torch

### Initialize Diarization and ASR Models

In [3]:
pipeline = Pipeline.from_pretrained('pyannote/speaker-diarization-3.1',
                                    use_auth_token='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')
asr_model = whisper.load_model("base")

config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 65.3MiB/s]
  checkpoint = torch.load(fp, map_location=device)


##### Use GPU

In [4]:
pipeline.to(torch.device("cuda"))

<pyannote.audio.pipelines.speaker_diarization.SpeakerDiarization at 0x7bab88509390>

### Preprocess the audio file

Preprocess the audio by converting it to mono channel, resampling to the target rate, and splitting on silence for long audio files.

**Sample Rate = 16kHz**

**Chunk Size = 5 mins or 300000 milliseconds**

In [5]:
def preprocess_audio(file_path, target_sample_rate=16000, chunk_duration=300000):
    audio = AudioSegment.from_file(file_path)
    print('File loaded.')

    audio = audio.set_channels(1).set_frame_rate(target_sample_rate)
    print('Preprocessing done.')

    print('Breaking the file into chunks.')
    # Split on silence
    chunks = [audio[i:i + chunk_duration] for i in range(0, len(audio), chunk_duration)]
    print('Chunking done.')

    return chunks

### Diarization and Transcription

Each audio file will be made of chunks and each chunk will be diarized and transcribed individually.

In [6]:
def diarize_transcribe_chunk(chunk, chunk_index, base_timestamp=0):
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        chunk.export(temp_file.name, format="wav")
        temp_file_path = temp_file.name

    # Diarization
    diarization = pipeline({"audio": temp_file_path})
    diarization_segments = [
        {"speaker": turn[2], "start": turn[0].start, "end": turn[0].end}
        for turn in diarization.itertracks(yield_label=True)
    ]

    # Transcription
    transcription_data = []
    for segment in diarization_segments:
        start, end = segment["start"], segment["end"]

        # converting seconds to milliseconds
        segment_audio = chunk[start * 1000: end * 1000]

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_segment_file:
            segment_audio.export(temp_segment_file.name, format="wav")

            segment_text = asr_model.transcribe(temp_segment_file.name, language='en')['text']

            transcription_data.append({
                "speaker": segment["speaker"],
                "start_time": start + base_timestamp,
                "end_time": end + base_timestamp,
                "text": segment_text
            })

        # clean up the temporary files
        os.remove(temp_segment_file.name)

    os.remove(temp_file_path)

    return transcription_data

### Process the entire file

Process the entire audio file:

1. Chunking
2. Preprocessing
3. Diarizing
4. Transcribing each chunk
5. Combining the results into JSON format

In [7]:
def process_audio(file_path):
    # Chunking and preprocessing
    chunks = preprocess_audio(file_path)

    final_transcription = []

    print('Proceeding with Speaker Diarization and Transcription...')

    # Diarization and Transcription
    for index, chunk in tqdm(enumerate(chunks)):
        # base timestamp of each chunk
        base_timestamp = index * chunk.duration_seconds
        chunk_transcription = diarize_transcribe_chunk(chunk, index, base_timestamp)
        final_transcription.extend(chunk_transcription)

    # JSON output
    transcription = {
        "transcription": [
            {
                "start_time": entry["start_time"],
                "end_time": entry["end_time"],
                "speaker": entry["speaker"],
                "text": entry["text"]
            }
            for entry in tqdm(final_transcription)
        ]
    }

    return transcription

## Get the results

In [8]:
file_path = '/content/drive/MyDrive/Proj/IN1001.Mix-Headset.wav'
transcription_output = process_audio(file_path)

# Print JSON output
print(json.dumps(transcription_output, indent=2))

File loaded.
Preprocessing done.
Breaking the file into chunks.
Chunking done.
Proceeding with Speaker Diarization and Transcription...


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)
12it [07:54, 39.52s/it]
100%|██████████| 866/866 [00:00<00:00, 758143.87it/s]

{
  "transcription": [
    {
      "start_time": 4.4353437499999995,
      "end_time": 4.53659375,
      "speaker": "SPEAKER_00",
      "text": ""
    },
    {
      "start_time": 6.59534375,
      "end_time": 7.000343750000001,
      "speaker": "SPEAKER_00",
      "text": " What?"
    },
    {
      "start_time": 7.21971875,
      "end_time": 7.27034375,
      "speaker": "SPEAKER_01",
      "text": ""
    },
    {
      "start_time": 7.27034375,
      "end_time": 8.957843750000002,
      "speaker": "SPEAKER_00",
      "text": " I think you don't. I think you're right."
    },
    {
      "start_time": 8.957843750000002,
      "end_time": 8.99159375,
      "speaker": "SPEAKER_01",
      "text": ""
    },
    {
      "start_time": 13.88534375,
      "end_time": 14.087843750000001,
      "speaker": "SPEAKER_01",
      "text": ""
    },
    {
      "start_time": 14.222843750000003,
      "end_time": 14.239718750000002,
      "speaker": "SPEAKER_01",
      "text": ""
    },
    {
      "st

### Save the JSON file

In [9]:
output_file = '/content/drive/MyDrive/Proj/output_1.json'

with open(output_file, 'w') as json_file:
    json.dump(transcription_output, json_file, indent=2)

### Another file

In [10]:
file_path = '/content/drive/MyDrive/Proj/IB4001.Mix-Headset.wav'
transcription_output = process_audio(file_path)

output_file = '/content/drive/MyDrive/Proj/output_2.json'

with open(output_file, 'w') as json_file:
    json.dump(transcription_output, json_file, indent=2)

File loaded.
Preprocessing done.
Breaking the file into chunks.
Chunking done.
Proceeding with Speaker Diarization and Transcription...


6it [04:29, 44.91s/it]
100%|██████████| 577/577 [00:00<00:00, 622457.15it/s]


### Check out the accuracy

In [14]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [15]:
!pip install jiwer nltk

Collecting jiwer
  Downloading jiwer-3.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading jiwer-3.0.4-py3-none-any.whl (21 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.4 rapidfuzz-3.10.1


#### Import necessary libraries

In [32]:
import json
import re
from jiwer import wer
from nltk.translate.bleu_score import sentence_bleu

#### IN1001 audio file of AMI corpus

In [39]:
ref_path = '/content/drive/MyDrive/Proj/IB4001.txt'
gen_path = '/content/drive/MyDrive/Proj/output_2.json'

##### Generated transcript

In [40]:
with open(gen_path, "r") as f:
    generated_transcript = json.load(f)["transcription"]

In [41]:
generated_transcript[:5]

[{'start_time': 6.392843750000001,
  'end_time': 7.11846875,
  'speaker': 'SPEAKER_01',
  'text': ''},
 {'start_time': 7.810343750000001,
  'end_time': 38.30346875,
  'speaker': 'SPEAKER_01',
  'text': " Okay, so as you guys know, ISCO is too small. Well, the building is too small for our group. So we're moving to a new building. It's going to be the building across the street and we're actually moving in three weeks. But the problem is that the administration wants to know who's going to be in what office in the new building and where we're going to put things like the printer, the fax machine, all the big equipment. And the catch is that they have the final approval of where we put people and equipment."},
 {'start_time': 38.69159375,
  'end_time': 75.07409375,
  'speaker': 'SPEAKER_01',
  'text': " and they want to know where we're going to do all of this by November 2nd. So that means basically next Tuesday. And since we don't really have any other time, we have to make these decis

##### Reference text

In [42]:
with open(ref_path, "r") as f:
    reference_text = f.read()

In [43]:
reference_text

"SPEAKER_02: Okay So as you guys know ISSCO is too small Well the building is too small for our group So we're moving to a new building Uh it's gonna be the building across the street And we're actually moving in three weeks But the problem is that the administration wants to know who is going to be in what office in the new building and where we're we're gonna put things like the printer the fax machine all the big equipment And the catch is that they have the final approval of where we put people and equipment And they want to know where we're gonna do all this by November second So that means basically next Tuesday And since we don't really have any other time we have to make these decisions today What we're getting is nine rooms for a maximum of eighteen people which is okay 'cause there's only fifteen of us And all the rooms have windows luckily so there's no fights about who gets a window and who doesn't Um there's two views either of the old town or the mountains and the rooms h

##### Clean reference text

In [44]:
reference_transcript = []

for line in reference_text.splitlines():
    if line.strip():
        # Split the line into speaker and text
        speaker, text = line.split(":", 1)
        reference_transcript.append({
            "speaker": speaker.strip(),
            "text": text.strip()
        })

In [45]:
reference_transcript[:5]

[{'speaker': 'SPEAKER_02',
  'text': "Okay So as you guys know ISSCO is too small Well the building is too small for our group So we're moving to a new building Uh it's gonna be the building across the street And we're actually moving in three weeks But the problem is that the administration wants to know who is going to be in what office in the new building and where we're we're gonna put things like the printer the fax machine all the big equipment And the catch is that they have the final approval of where we put people and equipment And they want to know where we're gonna do all this by November second So that means basically next Tuesday And since we don't really have any other time we have to make these decisions today What we're getting is nine rooms for a maximum of eighteen people which is okay 'cause there's only fifteen of us And all the rooms have windows luckily so there's no fights about who gets a window and who doesn't Um there's two views either of the old town or the 

##### Prepare reference and hypothesis texts

In [46]:
reference_text_combined = " ".join([entry["text"] for entry in reference_transcript])
generated_text_combined = " ".join([entry["text"] for entry in generated_transcript if entry["text"].strip()])

In [47]:
reference_text_combined

"Okay So as you guys know ISSCO is too small Well the building is too small for our group So we're moving to a new building Uh it's gonna be the building across the street And we're actually moving in three weeks But the problem is that the administration wants to know who is going to be in what office in the new building and where we're we're gonna put things like the printer the fax machine all the big equipment And the catch is that they have the final approval of where we put people and equipment And they want to know where we're gonna do all this by November second So that means basically next Tuesday And since we don't really have any other time we have to make these decisions today What we're getting is nine rooms for a maximum of eighteen people which is okay 'cause there's only fifteen of us And all the rooms have windows luckily so there's no fights about who gets a window and who doesn't Um there's two views either of the old town or the mountains and the rooms have various 

In [48]:
generated_text_combined

" Okay, so as you guys know, ISCO is too small. Well, the building is too small for our group. So we're moving to a new building. It's going to be the building across the street and we're actually moving in three weeks. But the problem is that the administration wants to know who's going to be in what office in the new building and where we're going to put things like the printer, the fax machine, all the big equipment. And the catch is that they have the final approval of where we put people and equipment.  and they want to know where we're going to do all of this by November 2nd. So that means basically next Tuesday. And since we don't really have any other time, we have to make these decisions today. What we're getting is nine rooms for a maximum of 18 people, which is okay, because there's only 15 of us. And all the rooms have windows, luckily, so there's no fights about who gets a window and who doesn't. There's two views either of the old town or the mountains. And the rooms are 

#### Compute WER Score

In [49]:
wer_score = wer(reference_text_combined, generated_text_combined)

#### Compute BLEU Score

In [50]:
reference_tokens = [reference_text_combined.split()]
generated_tokens = generated_text_combined.split()
bleu_score = sentence_bleu(reference_tokens, generated_tokens)

In [51]:
print(f"Word Error Rate (WER): {wer_score * 100:.4f}")
print(f"BLEU Score: {bleu_score * 100:.4f}")

Word Error Rate (WER): 57.1203
BLEU Score: 38.6883


BLEU Score - Higher the better

Word Error Rate - Lower the better


I think the inaccuracy is due to the punctuations and the spellings of different words along with the fact that the model picks up even the talking at the back which is not required.