# 04 - Pré-Processamento

    O objetivo deste pré-processamento é gerar a base de consulta do Bot.

## Importação das bibliotecas

In [None]:
import pickle as pkl
import whisper as wr
from collections import Counter
from math import ceil, floor
import re

## Criação de métodos

In [None]:
def load_diarization():
    """Load the pickled diarization object from ``./bot/data/diarization.pkl``.

    The path is relative to the current working directory.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # Read-only binary mode: the file is never written here, so requesting
    # update access ('+') would only make the load fail on read-only files.
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # produced by a trusted source.
    with open('./bot/data/diarization.pkl', 'rb') as file:
        return pkl.load(file)

def load_transcription():
    """Load the pickled transcription object from ``./bot/data/transcription.pkl``.

    The path is relative to the current working directory.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # Read-only binary mode: the file is never written here, so requesting
    # update access ('+') would only make the load fail on read-only files.
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # produced by a trusted source.
    with open('./bot/data/transcription.pkl', 'rb') as file:
        return pkl.load(file)

def format_time(seconds) -> str:
    """Render an integer number of seconds as a zero-padded ``HH:MM:SS`` string.

    The hour field is not wrapped at 24, so large inputs simply produce a
    wider hour component. ``seconds`` must be an integer because the ``d``
    format code rejects floats.
    """
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f'{hours:02d}:{minutes:02d}:{secs:02d}'

def format_text(text : str) -> str:
    """Normalize a piece of text for the dataset.

    The text is lower-cased, every bracketed run of digits (including an
    empty ``[]``) is replaced by a space, and each run of whitespace is
    collapsed into a single space. Leading and trailing whitespace is
    collapsed but not removed.
    """
    without_markers = re.sub(r'\[[0-9]*\]', ' ', text.lower())
    return re.sub(r'\s+', ' ', without_markers)

## Carregar os dados

In [None]:
# Load the two pickled inputs through the helper functions defined in this
# notebook. Later cells read the "diarization" and "labels" keys of the first
# object and the "segments" key of the second.
diarization = load_diarization()
transcription = load_transcription()

## Gerar o arquivo de dados

In [None]:
# Label used when a segment cannot be attributed to a known speaker, either
# because no diarization samples cover it or because its label has no name.
UNKNOWN_SPEAKER = "unknown"

# Lines are collected in lists and joined once after the loop instead of
# growing two strings with repeated concatenation.
original_lines = []
formatted_lines = []
text_data = []

for segment in transcription["segments"]:
    # Snap the segment to whole numbers inwards (start rounded up, end rounded
    # down) and guarantee a window of at least one unit.
    start, end = ceil(segment["start"]), floor(segment["end"])
    if end <= start:
        end = start + 1

    # The diarization sequence is sliced with the snapped offsets, so it is
    # presumably one speaker label per second -- TODO confirm.
    speakers = diarization["diarization"][start:end]
    # len() rather than truthiness, so this also works if the sequence is a
    # NumPy array.
    if len(speakers) > 0:
        # Attribute the segment to the most frequent label in its window.
        dominant_label = Counter(speakers).most_common(1)[0][0]
        speaker = diarization["labels"].get(dominant_label)
    else:
        # The segment lies outside the diarization sequence.
        speaker = None
    if speaker is None:
        speaker = UNKNOWN_SPEAKER

    text_segment_original = format_text(segment["text"].strip())
    text_segment_formatted = " ".join([
        "[", format_time(start), "-", format_time(end), "]",
        speaker.strip(), ":", text_segment_original.strip(),
    ])

    original_lines.append(text_segment_original.strip())
    formatted_lines.append(text_segment_formatted.strip())
    text_data.append({
        "start": start,
        "end": end,
        "speaker": speaker,
        "text_original": text_segment_original,
        "text_formatted": text_segment_formatted,
    })

# Every line, including the last one, is terminated by a newline.
text_original = "".join(f"{line}\n" for line in original_lines)
text_formatted = "".join(f"{line}\n" for line in formatted_lines)

# Final structure persisted for the bot: the plain text, the text annotated
# with time range and speaker, and the per-segment records.
dataset = {
    "text_original": text_original,
    "text_formatted": text_formatted,
    "segment_data": text_data
}


## Persistir os dados para consumo

In [None]:
import pickle as pkl

# Serialize the assembled dataset to disk. Plain write-binary mode is enough:
# the file is only written, so the read access requested by '+' is not needed.
with open('./bot/data/dataset.pkl', 'wb') as file:
    pkl.dump(dataset, file)