In [None]:
# !pip install datasets==3.6.0
from huggingface_hub import login
login("")

In [92]:
# Elevenlabs dependencies
################################
from elevenlabs import ElevenLabs
from io import BytesIO
import os
import requests
################################

from datasets import Dataset, DatasetDict, Audio, load_dataset
from sklearn.model_selection import train_test_split

In [93]:
elevenlabs_client = ElevenLabs(api_key="sk_d4c8588361de924135e18ff917a18903a234dcfd5849da0e")

In [94]:
from datasets import load_dataset

ds = load_dataset("Elormiden/RIK-Cypriot-news")

README.md: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

data/train-00000-of-00016.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/train-00001-of-00016.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00002-of-00016.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/train-00003-of-00016.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/train-00004-of-00016.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/train-00005-of-00016.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00006-of-00016.parquet:   0%|          | 0.00/480M [00:00<?, ?B/s]

data/train-00007-of-00016.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00008-of-00016.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00009-of-00016.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/train-00010-of-00016.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/train-00011-of-00016.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00012-of-00016.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00013-of-00016.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00014-of-00016.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00015-of-00016.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/validation-00000-of-00002.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/validation-00001-of-00002.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/test-00000-of-00002.parquet:   0%|          | 0.00/486M [00:00<?, ?B/s]

data/test-00001-of-00002.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34701 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4337 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4339 [00:00<?, ? examples/s]

In [95]:
import tempfile
from tqdm import tqdm
import librosa
import soundfile as sf

In [96]:
def eleven_process_logic(audio_path: str) -> str:
    try:
        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")
        with open(audio_path, 'rb') as f:
            audio_data = BytesIO(f.read())
        transcription = elevenlabs_client.speech_to_text.convert(
            file=audio_data,
            model_id="scribe_v1",
            tag_audio_events=True,
            language_code="ell",
            diarize=True,
        )
        return transcription.text
    except Exception as e:
        print(f"Error transcribing {audio_path}: {str(e)}")
        return ""

In [97]:
import os
import tempfile
import soundfile as sf
from tqdm import tqdm
import concurrent.futures

# Предполагаем, что у вас есть эта функция, которая принимает путь к файлу
# и возвращает транскрипцию.
# def eleven_process_logic(file_path: str) -> str:
#     # ... ваш код для вызова API Eleven Labs ...
#     return "транскрипция"

def process_dataset_parallel(dataset, batch_size=10, max_workers=10):
    """
    Обрабатывает датасет, отправляя запросы параллельно в рамках одного батча.
    """
    transcriptions = []
    train_data = dataset["train"]
    total_samples = len(train_data)

    for i in tqdm(range(0, total_samples, batch_size), desc="Processing batches"):
        batch_indices = range(i, min(i + batch_size, total_samples))
        temp_files = []

        try:
            # 1. Создаем все временные файлы для батча
            for j in batch_indices:
                sample = train_data[j]
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
                    sf.write(temp_file.name, sample["audio"]["array"], sample["audio"]["sampling_rate"])
                    temp_files.append(temp_file.name)

            # 2. Обрабатываем все файлы в батче параллельно
            batch_results = []
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                # executor.map применяет функцию eleven_process_logic к каждому файлу в temp_files
                # и возвращает результаты в том же порядке.
                future_to_file = {executor.submit(eleven_process_logic, f): f for f in temp_files}
                for future in concurrent.futures.as_completed(future_to_file):
                    try:
                        result = future.result()
                        batch_results.append(result)
                    except Exception as exc:
                        print(f'Файл {future_to_file[future]} сгенерировал исключение: {exc}')

            transcriptions.extend(batch_results)

        finally:
            # 3. Удаляем временные файлы
            for temp_file in temp_files:
                if os.path.exists(temp_file):
                    os.unlink(temp_file)

    return transcriptions

In [None]:
# transcriptions = process_dataset_parallel(ds)
news_transcriptions = process_dataset_parallel(ds)

Processing batches:   3%|▎         | 94/3471 [06:41<68:39:17, 73.19s/it]

Error transcribing /tmp/tmpmwbpnb8h.wav: The read operation timed out


Processing batches:  12%|█▏        | 400/3471 [16:00<1:30:33,  1.77s/it]

In [84]:
type(transcriptions)

list

In [88]:
transcriptions_list_of_dicts = [{"text": t} for t in transcriptions]
trans_hf = Dataset.from_list(transcriptions_list_of_dicts)

In [89]:
trans_hf

Dataset({
    features: ['text'],
    num_rows: 17133
})

In [90]:
trans_hf.push_to_hub("Elormiden/transcriptions")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Elormiden/transcriptions/commit/cb3d5aaab71a8e8b8dddaa10f15d1aac486ea0d4', commit_message='Upload dataset', commit_description='', oid='cb3d5aaab71a8e8b8dddaa10f15d1aac486ea0d4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Elormiden/transcriptions', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Elormiden/transcriptions'), pr_revision=None, pr_num=None)