# Utility functions

In [None]:
import json
from utils.names import get_file_names, get_model_names

# Names of the audio files and of the transcription models to process. Both
# are interpolated into the ../data/{model}/.../{file}.* paths used by the
# cells below.
# NOTE(review): get_file_names / get_model_names are project helpers not shown
# here; they presumably return iterables of strings -- confirm.
files = get_file_names()
models = get_model_names()

In [3]:
class Subtitle:
    """A single timed caption.

    Attributes:
        start: Start time of the caption. The preprocessing functions in this
            notebook store it as integer milliseconds.
        end: End time of the caption, in the same unit as ``start``.
        text: Caption text.
    """

    def __init__(self, start, end, text):
        # Plain value holder: the three fields are stored exactly as given.
        self.start, self.end, self.text = start, end, text

In [3]:
def save_json(predictions, json_path):
    """Write *predictions* to *json_path* as pretty-printed UTF-8 JSON.

    Args:
        predictions: Any JSON-serializable object.
        json_path: Destination file path; an existing file is overwritten.
    """
    # ensure_ascii=False keeps accented characters readable in the file
    # instead of escaping them as \uXXXX sequences.
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(json_path, mode='w', encoding='utf-8') as out_file:
        json.dump(predictions, out_file, **dump_options)

def save_text_file(transcriptions, path):
    """Write all subtitle texts to *path* as one space-separated line.

    Each subtitle's ``text`` is followed by a single space, so the file ends
    with a trailing space when there is at least one subtitle; an empty input
    produces an empty file.

    Args:
        transcriptions: Iterable of objects exposing a ``text`` attribute.
        path: Destination file path; an existing file is overwritten.
    """
    # Build the whole text with a single join rather than repeated `+=`,
    # which may copy the growing string on every iteration.
    hp_text = "".join(f"{subtitle.text} " for subtitle in transcriptions)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(hp_text)

def save_text_shift_file(transcriptions, path):
    """Write all subtitle texts to *path*, one subtitle per line.

    Every subtitle's ``text`` is followed by a newline; an empty input
    produces an empty file.

    Args:
        transcriptions: Iterable of objects exposing a ``text`` attribute.
        path: Destination file path; an existing file is overwritten.
    """
    # Build the whole text with a single join rather than repeated `+=`,
    # which may copy the growing string on every iteration.
    hp_text = "".join(f"{subtitle.text}\n" for subtitle in transcriptions)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(hp_text)

def ms_to_srt_time(ms):
    """Format a duration in milliseconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        ms: Time offset in milliseconds. Non-integer values are truncated to
            an integer first. The value is expected to be non-negative; no
            validation is performed.

    Returns:
        The timestamp string, e.g. ``"01:02:03,004"`` for ``3723004``. The
        hours field is zero-padded to at least two digits and grows as needed.
    """
    # Coerce to int so that a float input (e.g. 1500.0) does not leak ".0"
    # into every zero-padded field of the formatted timestamp.
    total_ms = int(ms)
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"


def save_srt_file(transcriptions, path):
    """Write *transcriptions* to *path* in SubRip (.srt) format.

    Each cue is written as a sequence number, a ``start --> end`` timestamp
    line and the cue text, followed by a blank line.

    Args:
        transcriptions: Iterable of objects exposing ``start``, ``end`` and
            ``text`` attributes; ``start`` and ``end`` are passed to
            ``ms_to_srt_time``.
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as f:
        # SubRip cue numbers conventionally start at 1, not 0. The loop
        # variable is named `index` to avoid shadowing the builtin `id`.
        for index, subtitle in enumerate(transcriptions, start=1):
            f.write(f"{index}\n")
            f.write(f"{ms_to_srt_time(subtitle.start)} --> {ms_to_srt_time(subtitle.end)}\n")
            f.write(f"{subtitle.text}\n\n")

# Preprocess Whisper predictions

In [4]:
def process_whisper_json(predictions):
    """Flatten a Whisper prediction payload into a list of ``Subtitle`` objects.

    Args:
        predictions: Mapping with a ``"predictions"`` list; every item holds a
            ``"result"`` list of segments, and every segment has ``"start"``,
            ``"end"`` (treated as seconds) and ``"text"`` entries.

    Returns:
        One ``Subtitle`` per segment, in input order, with the text stripped
        of surrounding whitespace and the times expressed in integer
        milliseconds.
    """
    def to_subtitle(segment):
        start_sec = segment["start"]
        end_sec = segment["end"]
        # Round instead of truncating: binary floating point makes e.g.
        # 1.001 * 1000 evaluate to 1000.9999999999999, which int() alone
        # would turn into 1000 ms instead of 1001 ms.
        return Subtitle(
            text=segment["text"].strip(),
            start=int(round(start_sec * 1000)),
            end=int(round(end_sec * 1000)),
        )

    # Iterate over all segments of all predictions.
    return [
        to_subtitle(segment)
        for prediction in predictions['predictions']
        for segment in prediction["result"]
    ]

In [None]:
# Convert every raw Whisper-style prediction file into a plain-text transcript,
# an SRT file and a flattened JSON that holds only the segment list.
for model in models:
    for file in files:
        json_path = f"../data/{model}/json/{file}.json"
        text_path = f"../data/{model}/text/{file}.txt"
        srt_path = f"../data/{model}/srt/{file}.srt"
        with open(json_path, 'r', encoding='utf-8') as f:
            predictions = json.load(f)
        try:
            # Only payloads shaped like {"predictions": [{"result": [...]}, ...]}
            # are converted; anything else (for instance a file that already
            # contains just a segment list) is left untouched.
            if (
                "predictions" in predictions and
                isinstance(predictions["predictions"], list) and
                len(predictions["predictions"]) > 0 and
                "result" in predictions["predictions"][0]
            ):
                result_segments = predictions["predictions"][0]['result']
                predictions = process_whisper_json(predictions)
                save_text_file(predictions, text_path)
                save_srt_file(predictions, srt_path)
                # Overwrite the source JSON last: if writing the text/SRT files
                # fails, the raw payload is still on disk and a rerun can
                # retry, instead of skipping the file because it no longer
                # matches the guard above.
                save_json(result_segments, json_path)
        except Exception as e:
            # Best-effort batch: report the failure together with its cause
            # and move on to the next file.
            print(f"Check {json_path} because i didn't success in saving: {e}")
            

# Preprocess AssemblyAI predictions

AssemblyAI reports times in milliseconds; they need to be converted to seconds to be consistent with Whisper and Parakeet.

In [None]:
# Rewrite every AssemblyAI prediction file in place, dividing the segment-level
# and word-level "start"/"end" values by 1000.
# NOTE: the division is unconditional and the result is written back to the
# same path, so running this cell again on already-converted files divides
# the values a second time.
for file in files:
    assemblyai_path = f"../data/assemblyai/json/{file}.json"

    with open(assemblyai_path, 'r', encoding='utf-8') as f:
        predictions = json.load(f)

    for segment in predictions:
        segment["start"] /= 1000
        segment["end"] /= 1000
        for word in segment["words"]:
            word["start"] /= 1000
            word["end"] /= 1000

    with open(assemblyai_path, 'w', encoding='utf-8') as f:
        json.dump(predictions, f, ensure_ascii=False, indent=2)


# Preprocess Parakeet predictions

In [11]:
def process_parakeet_json(predictions):
    """Convert a list of Parakeet segments into a list of ``Subtitle`` objects.

    Args:
        predictions: Iterable of segments, each with ``"start"``, ``"end"``
            (treated as seconds) and ``"text"`` entries.

    Returns:
        One ``Subtitle`` per segment, in input order, with the text stripped
        of surrounding whitespace and the times expressed in integer
        milliseconds.
    """
    def to_subtitle(segment):
        start_sec = segment["start"]
        end_sec = segment["end"]
        # Round instead of truncating: binary floating point makes e.g.
        # 1.001 * 1000 evaluate to 1000.9999999999999, which int() alone
        # would turn into 1000 ms instead of 1001 ms.
        return Subtitle(
            text=segment["text"].strip(),
            start=int(round(start_sec * 1000)),
            end=int(round(end_sec * 1000)),
        )

    # Iterate over all segments.
    return [to_subtitle(segment) for segment in predictions]


In [None]:
# Convert every Parakeet prediction file into a plain-text transcript and an
# SRT file. Conversion failures are reported and the loop continues.
for file in files:
    json_path, text_path, srt_path = (
        f"../data/parakeet/json/{file}.json",
        f"../data/parakeet/text/{file}.txt",
        f"../data/parakeet/srt/{file}.srt",
    )
    with open(json_path, mode='r', encoding='utf-8') as f:
        predictions = json.loads(f.read())
    try:
        # From here on `predictions` holds Subtitle objects, not raw JSON.
        predictions = process_parakeet_json(predictions)
        save_text_file(predictions, text_path)
        save_srt_file(predictions, srt_path)
    except Exception as e:
        print(f"Check {json_path} because i didn't success in saving: {e}")

# Exploration of the predictions

In [None]:
from standardization.standardization_utils import load_all_subtitles

# One SRT directory per model, following the ../data/{model}/srt layout.
srt_directories = [f"../data/{model}/srt" for model in models]


# NOTE(review): load_all_subtitles is a project helper not shown here; judging
# from the indexing below, each entry looks like (model, file name, segments)
# -- confirm against its definition.
all_subtitles = load_all_subtitles(srt_directories)


# Sanity check: report how many SRT files were loaded and inspect the first entry.
print(f"Caricati {len(all_subtitles)} file SRT")
print(f"Esempio: primo modello = {all_subtitles[0][0]} primo file = {all_subtitles[0][1]}, numero segmenti = {len(all_subtitles[0][2])}")

Caricati 120 file SRT
Esempio: primo modello = parakeet primo file = MEZZORAINPIU_10_10_21.srt, numero segmenti = 317
