# Batch Convert Subtitles to Tables (SRT to CSV)

## Preparation

### Import

In [None]:
# pandas builds the subtitle table and writes the CSV; os lists directories and joins paths.
import pandas as pd
import os
# Colab-only helper: mount Google Drive at /content/drive so files stored there
# can be read and written through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Function Definition

### Srt to Csv

In [None]:
def _srt_timestamp_to_seconds(stamp):
    """Convert an ``HH:MM:SS,mmm`` timestamp string into seconds (float).

    A ``.`` is accepted in place of the ``,`` before the milliseconds.
    Raises ValueError if the string does not have that shape.
    """
    hours, minutes, seconds = stamp.split(":")
    seconds, _, millis = seconds.replace(".", ",").partition(",")
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds) + float(millis or 0) / 1000


def _iter_srt_cues(lines):
    """Yield one list of consecutive non-blank lines per subtitle cue."""
    cue = []
    for line in lines:
        if line:
            cue.append(line)
        elif cue:
            yield cue
            cue = []
    if cue:
        yield cue


def srt_to_csv(file_in,file_out):
    """Convert one SRT subtitle file into a CSV table.

    The CSV has the columns "Start time", "End time", "Start time (s)",
    "End time (s)" and "Subtitle", one row per cue, and is written as
    UTF-8 with a BOM.

    Cues are recognised as runs of non-blank lines separated by blank
    lines, so multi-line subtitle text (joined with a single space) and
    extra blank lines are handled instead of shifting every following row.

    Args:
        file_in: Path of the UTF-8 encoded .srt file to read.
        file_out: Path of the .csv file to write.

    Raises:
        ValueError: If a cue has no "start --> end" line in its first two
            lines, or a timestamp cannot be parsed.
    """
    # "utf-8-sig" transparently drops a leading BOM if the file has one
    with open(file_in, "r", encoding="utf-8-sig") as file:
        lines = [line.strip() for line in file]

    rows = []
    for cue in _iter_srt_cues(lines):
        # The timing line normally follows the cue number; also accept a cue
        # whose number line is missing.
        timing_pos = next((i for i, line in enumerate(cue[:2]) if "-->" in line), None)
        if timing_pos is None:
            raise ValueError(f"SRT file is not properly formatted: {file_in}")

        # Split only the timing line, so "-->" inside the subtitle text is harmless
        start, _, end = cue[timing_pos].partition("-->")
        start = start.replace(" ", "")
        end = end.replace(" ", "")
        try:
            start_seconds = _srt_timestamp_to_seconds(start)
            end_seconds = _srt_timestamp_to_seconds(end)
        except ValueError as err:
            raise ValueError(
                f"SRT file is not properly formatted: {file_in} "
                f"(bad timing line {cue[timing_pos]!r})"
            ) from err

        text = " ".join(cue[timing_pos + 1:])
        rows.append((start, end, start_seconds, end_seconds, text))

    # Build the table with the final column order and save it to a CSV file
    columns = ["Start time", "End time", "Start time (s)", "End time (s)", "Subtitle"]
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(file_out,encoding='utf-8-sig', index=False)

### Srt to Csv Batch

In [None]:
def srt_to_csv_batch(input_dir,output_dir):
    """Convert every ``.srt`` file in ``input_dir`` to a CSV in ``output_dir``.

    Each ``name.srt`` becomes ``name_transcription.csv``. Files are processed
    in sorted name order, and a numbered line with the output path is printed
    after each conversion.

    Args:
        input_dir: Directory that contains the .srt files.
        output_dir: Directory that receives the .csv files; it is created
            if it does not exist yet.
    """
    # List the directory directly instead of os.chdir(): changing the working
    # directory leaks out of the function and makes relative input/output
    # paths resolve against the wrong folder.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # os.listdir order is arbitrary; sort so the numbering is reproducible
    srt_files = sorted(name for name in os.listdir(input_dir) if name.endswith(".srt"))
    for n, file in enumerate(srt_files, start=1):
        file_in = os.path.join(input_dir, file)
        file_out = os.path.join(output_dir, file[:-4] + "_transcription.csv")
        srt_to_csv(file_in, file_out)
        print(str(n) + ") " + str(file_out))

## Use of Function

In [None]:
# Folder holding the input .srt files and folder receiving the generated .csv
# files; both paths are under /content/drive/MyDrive.
directory_in = r'/content/drive/MyDrive/tps/data/5. transcription (srt)'
directory_out = r'/content/drive/MyDrive/tps/data/6. transcription (csv)'

In [None]:
# Convert every .srt file in directory_in; one numbered line per written CSV is printed.
srt_to_csv_batch(directory_in,directory_out)

1) /content/drive/MyDrive/tps/data/6. transcription (csv)/10.1_transcription.csv
2) /content/drive/MyDrive/tps/data/6. transcription (csv)/1.1_transcription.csv
3) /content/drive/MyDrive/tps/data/6. transcription (csv)/5.1_transcription.csv
4) /content/drive/MyDrive/tps/data/6. transcription (csv)/3.2_transcription.csv
5) /content/drive/MyDrive/tps/data/6. transcription (csv)/12.2_transcription.csv
6) /content/drive/MyDrive/tps/data/6. transcription (csv)/3.1_transcription.csv
7) /content/drive/MyDrive/tps/data/6. transcription (csv)/4.2_transcription.csv
8) /content/drive/MyDrive/tps/data/6. transcription (csv)/2.2_transcription.csv
9) /content/drive/MyDrive/tps/data/6. transcription (csv)/10.2_transcription.csv
10) /content/drive/MyDrive/tps/data/6. transcription (csv)/1.2_transcription.csv
11) /content/drive/MyDrive/tps/data/6. transcription (csv)/4.3_transcription.csv
12) /content/drive/MyDrive/tps/data/6. transcription (csv)/12.1_transcription.csv
13) /content/drive/MyDrive/tps/da