# Batch Conversion of Diarization Files to Tables (RTTM to CSV)

## Preparation

### Import

In [1]:
import pandas as pd
import datetime
import os
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be accessed by path
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Function Definition

### Rttm to Csv

In [2]:
def _seconds_to_hms(seconds):
    """Format a number of seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated (not rounded).
    NOTE: ``str(timedelta)`` switches to an ``"N day(s), H:MM:SS"`` form from
    24 hours upwards, so this is only a pure ``HH:MM:SS`` value below 24 h.
    """
    return "{:0>8}".format(str(datetime.timedelta(seconds=int(seconds))))


def rttm_to_csv(file_in, file_out):
    """Convert one RTTM diarization file into a CSV table of speaker turns.

    Columns 3, 4 and 7 (0-based) of every RTTM line are read as start time,
    length and speaker label. Consecutive lines that belong to the same
    speaker are merged into a single row. No rows are filtered out by length.

    Parameters
    ----------
    file_in : str or path-like
        Path of the RTTM file to read (UTF-8, whitespace-separated fields).
    file_out : str or path-like
        Path of the CSV file to write.

    Returns
    -------
    None
        The result is written to ``file_out``.
    """
    # RTTM fields are whitespace-separated; splitting on any run of
    # whitespace (instead of exactly one space) keeps the positional column
    # indices correct when a file uses tabs or multiple spaces.
    df = pd.read_csv(
        file_in,
        encoding="utf8",
        header=None,
        sep=r"\s+",
        usecols=[3, 4, 7],
        names=["Start time (s)", "Length (s)", "Speaker"],
    )

    # End time of every segment, in seconds
    df["End time (s)"] = df["Start time (s)"] + df["Length (s)"]

    # Human-readable versions of the start and end times
    df["Start time"] = df["Start time (s)"].apply(_seconds_to_hms)
    df["End time"] = df["End time (s)"].apply(_seconds_to_hms)

    # A new turn starts whenever the speaker differs from the previous line;
    # the running sum of those change flags gives one id per turn.
    turn_id = (df["Speaker"] != df["Speaker"].shift()).cumsum().rename("turn")

    # Collapse each turn to one row: it starts where its first segment starts
    # and ends where its last segment ends. "Length (s)" is the summed segment
    # length, so it excludes any gaps between the merged segments.
    df = df.groupby(turn_id).agg({
        "Speaker": "first",
        "Start time": "first",
        "End time": "last",
        "Start time (s)": "first",
        "End time (s)": "last",
        "Length (s)": "sum"
    }).reset_index(drop=True)

    # Reorder columns and save to a CSV file
    df = df[["Start time", "End time", "Start time (s)", "End time (s)", "Length (s)", "Speaker"]]
    df.to_csv(file_out, index=False)

### Rttm to Csv Batch

In [3]:
def rttm_to_csv_batch_directories(input_dir, output_dir):
    """Convert every ``.rttm`` file in ``input_dir`` to a CSV in ``output_dir``.

    Each ``<name>.rttm`` is converted with ``rttm_to_csv`` and written as
    ``<name>_diarization.csv``. Files are processed in sorted name order and
    the numbered output path is printed after each conversion.

    Parameters
    ----------
    input_dir : str
        Directory that is scanned (non-recursively) for ``.rttm`` files.
    output_dir : str
        Directory that receives the CSV files; created if it does not exist.

    Returns
    -------
    None
    """
    suffix = ".rttm"

    # List the input directory directly instead of os.chdir()-ing into it:
    # changing the working directory leaks out of this function and makes
    # relative input/output paths resolve against the wrong location.
    # Sorting keeps the processing order (and the printed numbering) stable.
    rttm_files = sorted(name for name in os.listdir(input_dir) if name.endswith(suffix))

    # Make sure the destination exists before the first CSV is written
    os.makedirs(output_dir, exist_ok=True)

    for n, file in enumerate(rttm_files, start=1):
        file_in = os.path.join(input_dir, file)
        file_out = os.path.join(output_dir, file[:-len(suffix)] + "_diarization.csv")
        rttm_to_csv(file_in, file_out)
        print(str(n)+") "+ str(file_out))

## Use of Function

In [4]:
# Folder that is scanned for the input .rttm diarization files
directory_in = r'/content/drive/MyDrive/Projects/tps/data/3. diarization (rttm)'
# Folder that receives the generated *_diarization.csv files
directory_out = r'/content/drive/MyDrive/Projects/tps/data/4. diarization (csv)'

In [6]:
# Convert every .rttm file found in directory_in; one CSV per file is written to directory_out
rttm_to_csv_batch_directories(directory_in,directory_out)

1) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/1.1_diarization.csv
2) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/1.2_diarization.csv
3) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/10.1_diarization.csv
4) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/10.2_diarization.csv
5) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/11.1_diarization.csv
6) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/11.2_diarization.csv
7) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/12.1_diarization.csv
8) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/12.2_diarization.csv
9) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/2.1_diarization.csv
10) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/2.2_diarization.csv
11) /content/drive/MyDrive/Projects/tps/data/4. diarization (csv)/4.1_diarization.csv
12) /content/drive/MyDrive/Projects/tps/data/4. diarizati