# Match Diarization-Transcription in batch

## Preparation

### Import

In [1]:
import pandas as pd
import numpy as np
import datetime
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Function Definition

### Match Diarization-Transcription

In [2]:
def match_transcription_diarization(file_trans,file_dia,file_out):
    
    # create dataframes
    df_dia = pd.read_csv(file_dia)
    df_trans = pd.read_csv(file_trans)

    # extract columns of interest from dataframes
    trans = df_trans[["Start time (s)", "End time (s)", "Subtitle",]].values.tolist()
    dia = df_dia[["Start time (s)", "End time (s)", "Speaker"]].values.tolist()

    # create empty list for each speaker
    speaker_lists = [[] for _ in range(len(dia))]

    # iterate over each subtitle and find the speaker with the highest difference value
    for subtitle in trans:
        max_difference = 0
        best_speaker = 0
        for i, speaker in enumerate(dia):
            difference = (min(speaker[1],subtitle[1])-max(speaker[0],subtitle[0]))
            if difference > max_difference:
                max_difference = difference
                best_speaker = i
        if max_difference >0:
            speaker_lists[best_speaker].append(subtitle)
    
    # combine subtitles for each speaker into a single string
    speaker_data = []
    for num_speaker,speaker in enumerate(speaker_lists):
        full_subtitle = ["",100000,-1,""]
        for subtitle in speaker:
            full_subtitle[1]=min(full_subtitle[1],subtitle[0])
            full_subtitle[2]=max(full_subtitle[2],subtitle[1])
            full_subtitle[3]=full_subtitle[3]+ " " + subtitle[2]
        full_subtitle[0]=dia[num_speaker][2]
        speaker_data.append(full_subtitle)

    # convert data to dataframe and save to CSV
    df = pd.DataFrame(speaker_data, columns=["Speaker", "Start time (s)", "End time (s)", "Subtitle"])
    df = df[df["End time (s)"] > -1]
    df["Start time"] = df["Start time (s)"].apply(lambda x: "{:0>8}".format(str(datetime.timedelta(seconds=int(x))), int((x - int(x)) * 1000)))
    df["End time"] = df["End time (s)"].apply(lambda x: "{:0>8}".format(str(datetime.timedelta(seconds=int(x))), int((x - int(x)) * 1000)))
    df = df[["Speaker", "Start time", "End time", "Start time (s)", "End time (s)", "Subtitle"]]
    df.to_csv(file_out, encoding="utf-8-sig", index=False)
    

### Match Diarization-Transcription Batch

In [3]:
def match_transcription_diarization_batch(transcription_dir, diarization_dir, output_dir):
  # Moves to the directory and execute the function for every rttm file
  os.chdir(diarization_dir)
  n=1    
  for file in os.listdir():
      if file.endswith("_diarization.csv"):
          file_dia = f"{diarization_dir}/{file}"
          file_trans = os.path.join(transcription_dir, file[:-16] + "_transcription.csv")
          file_out = os.path.join(output_dir, file[:-16] + "_dia_trans.csv")
          if os.path.isfile(file_trans)==True:
              match_transcription_diarization(file_trans,file_dia,file_out)
              print(str(n)+") "+ str(file_out))
              n=n+1

## Use of Function

In [4]:
transcription_dir= r'/content/drive/MyDrive/Projects/tps/data/6. transcription (csv)'
diarization_dir= r'/content/drive/MyDrive/Projects/tps/data/4. diarization (csv)'
output_dir= r'/content/drive/MyDrive/Projects/tps/data/7. diarization_transcription'

In [5]:
match_transcription_diarization_batch(transcription_dir, diarization_dir, output_dir)

1) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/1.1_dia_trans.csv
2) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/12.1_dia_trans.csv
3) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/2.2_dia_trans.csv
4) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/10.2_dia_trans.csv
5) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/1.2_dia_trans.csv
6) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/4.1_dia_trans.csv
7) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/2.1_dia_trans.csv
8) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/11.1_dia_trans.csv
9) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/10.1_dia_trans.csv
10) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/11.2_dia_trans.csv
11) /content/drive/MyDrive/Projects/tps/data/7. diarization_transcription/12.2_dia_t