In [None]:
import pandas as pd
import re
import os

In [None]:
# Directory under which the per-segment .wav paths are built later in the notebook.
full_audio_segments = '/kaggle/input/full-audio-segments/segments'
# Dataset JSON; later cells use its text, start, end and audio_filepath columns.
dataset = pd.read_json(
    '/kaggle/input/output/output_cy_data.json'
)

In [None]:
# `pd.read_json` already returns a DataFrame. Re-wrapping it with
# `pd.DataFrame(...)` does not guarantee independent data on every pandas
# version, so column edits made to `df` could leak into `dataset`.
# Take an explicit copy instead.
df = dataset.copy()

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Swap the relative path prefix in the column for the Kaggle input location
# (literal, non-regex substring replacement).
old_prefix = './Audio_Files/'
new_prefix = '/kaggle/input/audio-files/Audio_Files/'
df['audio_filepath'] = df['audio_filepath'].str.replace(old_prefix, new_prefix, regex=False)

In [None]:
# Show the frame after the audio_filepath rewrite.
df

In [None]:
from datasets import Dataset, Audio
from pathlib import Path

In [None]:
def _segment_path(row):
    """Build the segment .wav path for one row from its file stem and index label."""
    stem = Path(row['audio_filepath']).stem
    # row.name is the row's index label; it is zero-padded to six digits.
    return f"{full_audio_segments}/{stem}_segment_{row.name:06d}.wav"


dataset['audio_segments'] = dataset.apply(_segment_path, axis=1)

In [None]:
# Show the frame with the new audio_segments column.
dataset

In [None]:
# Drop the `start` and `end` columns, one at a time.
dataset = dataset.drop('start', axis='columns')
dataset = dataset.drop('end', axis='columns')

In [None]:
# Show the frame without the start/end columns.
dataset

In [None]:
# Drop audio_filepath (audio_segments was derived from it in an earlier cell).
dataset = dataset.drop('audio_filepath', axis='columns')

In [None]:
# Show the frame after audio_filepath has been removed.
dataset

In [None]:
# NOTE: `combined_phonetics` is created further down, in the cell that
# concatenates the updated_chunk_*.csv files; that cell has to run before this one.
# The assignment aligns on index labels, so a row-count mismatch would silently
# produce NaN or dropped values. Fail loudly instead.
if len(combined_phonetics) != len(dataset):
    raise ValueError(
        f"pronunciation count ({len(combined_phonetics)}) does not match "
        f"dataset rows ({len(dataset)})"
    )
dataset['pronunciation'] = combined_phonetics

In [None]:
# Write the assembled table to the current working directory. The index is
# written as well, since index=False is not passed.
dataset.to_csv('phonetcs_base_full.csv')

In [None]:
## Phonetic transcriptions

# !pip install phonemizer
# !apt-get install -y espeak espeak-data libespeak1 libespeak-dev

In [None]:
from phonemizer import phonemize
from phonemizer.backend import EspeakBackend
from tqdm import tqdm

In [None]:
# Module-level espeak backend for Greek ('el').
# NOTE(review): add_pronunciation below goes through phonemize() and does not
# reference this instance.
backend = EspeakBackend('el')


def add_pronunciation(df, batch_size=50):
    """Attach a 'pronunciation' column holding the phonemized form of each 'text' value.

    Args:
        df: DataFrame with a 'text' column; it is modified in place.
        batch_size: Number of rows walked per progress-bar step.

    Returns:
        The same DataFrame, now carrying the 'pronunciation' column.

    Texts whose phonemization raises are reported with print() and stored as "".
    """
    import gc

    phonemize_options = dict(
        language='el',
        backend='espeak',
        strip=True,
        preserve_punctuation=True,
        with_stress=True,
    )
    collected = []

    for offset in tqdm(range(0, len(df), batch_size)):
        for text in df.iloc[offset:offset + batch_size]['text']:
            try:
                spoken = phonemize(text, **phonemize_options)
            except Exception as e:
                print(f"Ошибка для текста '{text}': {e}")
                spoken = ""
            collected.append(spoken)

        # Force a garbage-collection pass whenever the offset is a multiple of 1000.
        if offset % 1000 == 0:
            gc.collect()

    df['pronunciation'] = collected
    return df

In [None]:
def process_chunk(start, end, chunk_num):
    """Phonemize rows [start, end) of the global `df` and save them to a CSV.

    Args:
        start: First row position (inclusive) of the slice.
        end: Row position at which the slice stops (exclusive).
        chunk_num: Number used in the output file name and in the log messages.

    Returns:
        True when the chunk was written, False when any exception occurred.
    """
    rows = df.iloc[start:end].copy()

    print(f"Обрабатываем кусок {chunk_num}: строки {start}-{end}")

    succeeded = False
    try:
        # add_pronunciation returns the frame it was given, so the call can be chained.
        add_pronunciation(rows).to_csv(f'updated_chunk_{chunk_num}.csv', index=False)
        print(f"✅ Кусок {chunk_num} готов!")
        succeeded = True
    except Exception as e:
        print(f"❌ Кусок {chunk_num} упал! Ошибка: {e}")
    return succeeded

In [None]:
def make_readable_pronunciation(ipa_text):
    """Make an IPA transcription easier to read using plain Latin letters.

    The text is lower-cased, stress marks are removed and a handful of IPA
    symbols are swapped for ASCII letters or digraphs.

    Args:
        ipa_text: IPA string. ``None`` and float NaN (what ``pd.read_csv``
            yields for an empty cell) are treated as missing.

    Returns:
        The simplified transcription, or "" when the input is missing.
    """
    # Missing values have no .lower(); NaN is the only float unequal to itself.
    if ipa_text is None or (isinstance(ipa_text, float) and ipa_text != ipa_text):
        return ""

    replacements = {
        'ˈ': '',      # drop primary stress
        'ˌ': '',      # drop secondary stress
        'ð': 'th',    # Greek δ → th
        'θ': 'th',    # Greek θ → th
        'ɣ': 'gh',    # Greek γ → gh
        'x': 'ch',    # Greek χ → ch
        'ɲ': 'ny',    # soft n
        'ʝ': 'y',     # soft y
    }

    result = ipa_text.lower()
    for ipa, readable in replacements.items():
        result = result.replace(ipa, readable)

    return result

In [None]:
# Final chunk: run through the end of `df` rather than a hard-coded row count,
# so no trailing rows are skipped if the dataset size differs from 27304.
process_chunk(26000, len(df), 27)

In [None]:
# Load the per-chunk CSVs numbered 1..27 and stack them into one frame.
all_chunks = [
    pd.read_csv(f'/kaggle/working/updated_chunk_{chunk_no}.csv')
    for chunk_no in range(1, 28)
]

combined_df = pd.concat(all_chunks, ignore_index=True)
# Keep just the pronunciation column as a Series.
combined_phonetics = combined_df['pronunciation']

In [None]:
# Print the summary of the combined pronunciation Series.
combined_phonetics.info()

In [None]:
# Post-process chunk 27: simplify its pronunciation column and overwrite the file.
# index=False keeps the CSV layout identical to what process_chunk wrote; without
# it, every run of this cell would add another unnamed index column to the file.
fonetic_filter = pd.read_csv('/kaggle/working/updated_chunk_27.csv')
fonetic_filter['pronunciation'] = fonetic_filter['pronunciation'].apply(make_readable_pronunciation)
fonetic_filter.to_csv('/kaggle/working/updated_chunk_27.csv', index=False)

In [None]:
# # Chunked processing (copy and paste one line at a time):
# process_chunk(0, 1000, 1)
# process_chunk(1000, 2000, 2)
# process_chunk(2000, 3000, 3)
# process_chunk(3000, 4000, 4)
# process_chunk(4000, 5000, 5)
# process_chunk(5000, 6000, 6)
# process_chunk(6000, 7000, 7)
# process_chunk(7000, 8000, 8)
# process_chunk(8000, 9000, 9)
# process_chunk(9000, 10000, 10)
# process_chunk(10000, 11000, 11)
# process_chunk(11000, 12000, 12)
# process_chunk(12000, 13000, 13)
# process_chunk(13000, 14000, 14)
# process_chunk(14000, 15000, 15)
# process_chunk(15000, 16000, 16)
# process_chunk(16000, 17000, 17)
# process_chunk(17000, 18000, 18)
# process_chunk(18000, 19000, 19)
# process_chunk(19000, 20000, 20)
# process_chunk(20000, 21000, 21)
# process_chunk(21000, 22000, 22)
# process_chunk(22000, 23000, 23)
# process_chunk(23000, 24000, 24)
# process_chunk(24000, 25000, 25)
# process_chunk(25000, 26000, 26)
# process_chunk(26000, 27000, 27)
# process_chunk(27000, len(df), 28)  # Last chunk, through the end

In [None]:
# Convert the pandas frame into a Hugging Face `datasets.Dataset`.
hf_dataset = Dataset.from_pandas(dataset)

In [None]:
# Cast the audio_segments path column to the Audio feature type.
hf_dataset = hf_dataset.cast_column("audio_segments", Audio())

In [None]:
# Rename columns: audio_segments -> audio, text -> sentence.
hf_dataset = hf_dataset.rename_column("audio_segments", "audio")
hf_dataset = hf_dataset.rename_column("text", "sentence")

In [None]:
# Authenticate with the Hugging Face Hub. No token is passed in code, so
# login() is expected to ask for it interactively — TODO confirm in this environment.
from huggingface_hub import login
login()