In [None]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# from datasets import load_dataset  # ①

# Use the first CUDA device in half precision when a GPU is visible;
# otherwise fall back to the CPU in full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3-turbo"

# Load the speech-to-text checkpoint, then move it onto the selected device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# The processor supplies both the tokenizer and the feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

asr_options = dict(
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    # Request timestamps; a later cell reads result["chunks"][i]["timestamp"].
    return_timestamps=True,
    # Chunk length and stride, both in seconds (per the parameter names).
    chunk_length_s=10,
    stride_length_s=2,
)
pipe = pipeline("automatic-speech-recognition", **asr_options)

# Alternative input (disabled): a sample from a Hugging Face dataset.
# dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
# sample = dataset[0]["audio"]
sample = "lsy_audio_2023_58s.mp3"

# Transcribe the local audio file and show the raw pipeline output.
result = pipe(sample)
# print(result["text"])  # text-only view of the same result

print(result)


In [None]:
import os

# Directory holding the ffmpeg executables on this machine.
# NOTE(review): machine-specific absolute path; adjust it per workstation.
# Presumably required so the audio files used elsewhere in this notebook can
# be decoded, in which case this cell must run before those cells (confirm).
FFMPEG_BIN_DIR = r"C:\Users\Owner\Downloads\ffmpeg-2025-10-27-git-68152978b5-full_build\ffmpeg-2025-10-27-git-68152978b5-full_build\bin"

# Append only when the directory is not already listed, so re-running the cell
# does not keep growing PATH; .get() also tolerates an unset PATH.
_current_path = os.environ.get("PATH", "")
if FFMPEG_BIN_DIR not in _current_path.split(os.pathsep):
    os.environ["PATH"] = _current_path + os.pathsep + FFMPEG_BIN_DIR


In [None]:
import torch
# Sanity check: displays whether PyTorch reports CUDA as available.
# The Whisper cell uses the same call to choose between GPU and CPU.
torch.cuda.is_available()

In [None]:
import pandas as pd

# Flatten each transcription chunk into a (start, end, text) record.
start_end_text = [
    (segment["timestamp"][0], segment["timestamp"][1], segment["text"])
    for segment in result["chunks"]
]

# Tabulate the records and save them as a ':'-delimited file without the index.
df = pd.DataFrame(start_end_text, columns=["start", "end", "text"])
df.to_csv("lsy_audio_2023_58.csv", sep=":", index=False)
df

In [None]:
# instantiate the pipeline
from pyannote.audio import Pipeline
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the environment, then read the
# Hugging Face token from it (None when the variable is not defined).
load_dotenv()
huggingface_access_token = os.environ.get("huggingface_access_token")

# NOTE: this rebinds the global name `pipeline`, which the Whisper cell
# imported from transformers. Re-run that cell's imports before building
# another transformers pipeline.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=huggingface_access_token,
)

In [None]:
# Run the diarization pipeline on the recording.
audio_path = "싼기타_비싼기타.mp3"
sourse = pipeline(audio_path)

# Save the diarization result in RTTM format (UTF-8 text file).
with open("싼기타_비싼기타.rttm", mode="w", encoding="utf-8") as rttm_file:
    sourse.write_rttm(rttm_file)

In [None]:
import pandas as pd
rttm_path = "싼기타_비싼기타.rttm"

# Column labels for the ten space-separated fields of each RTTM line.
rttm_columns = [
    "type", "file", "chnl", "start", "duration",
    "C1", "C2", "speaker_id", "C3", "C4",
]

# Read the header-less RTTM file and derive each segment's end time.
df_rttm = pd.read_csv(
    rttm_path,
    sep=" ",
    header=None,
    names=rttm_columns,
).assign(end=lambda frame: frame["start"] + frame["duration"])
df_rttm


In [None]:
# Number consecutive runs of the same speaker: rows sharing a `number` form one
# uninterrupted turn, and the counter increases each time the speaker changes.
#
# A row starts a new run when its speaker_id differs from the previous row's.
# The first row always compares unequal to the shifted-in missing value, so
# numbering starts at 0 after subtracting 1 from the cumulative count.
# Being vectorized, this needs neither a 0..n-1 integer index nor at least one
# row, and yields an integer column instead of an object column.
speaker_changed = df_rttm["speaker_id"].ne(df_rttm["speaker_id"].shift())
df_rttm["number"] = speaker_changed.cumsum() - 1

df_rttm.head(10)

In [None]:
# Collapse every run of rows sharing a `number` into a single row: earliest
# start, latest end, and the speaker_id of the run's first row.
df_rttm_group = df_rttm.groupby("number").agg(
    start=("start", "min"),
    end=("end", "max"),
    speaker_id=("speaker_id", "first"),
)
df_rttm_group

In [1]:
from glob import glob
from openai import OpenAI
from dotenv import load_dotenv
import json
import os
import base64

# Pull variables from a local .env file into the environment and build the
# OpenAI client from OPENAI_API_KEY.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)


# Voice preset to synthesize with; the output file name embeds it.
voice = "ash"
mp3_file = f"Hello_world_{voice}.mp3"

response = client.audio.speech.create(
    model="tts-1",
    voice=voice,
    # Must be an f-string: without the prefix the literal text "{voice}" is
    # sent to the model instead of the selected voice name.
    input=f"Hello, world! I'm {voice}. This is a TTS test",
)

# Save the synthesized speech to the mp3 file.
response.write_to_file(mp3_file)

import IPython.display as ipd

# Render an inline audio player for the generated file.
ipd.Audio(mp3_file)