In [None]:
!apt-get install espeak-ng -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
The following NEW packages will be installed:
  espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 41 not upgraded.
Need to get 4,526 kB of archives.
After this operation, 11.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libpcaudio0 amd64 1.1-6build2 [8,956 B]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsonic0 amd64 0.2.0-11build1 [10.3 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 espeak-ng-data amd64 1.50+dfsg-10ubuntu0.1 [3,956 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libespeak-ng1 amd64 1.50+dfsg-10ubuntu0.1 [207 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 espeak-ng amd64 1.50+dfsg-1

In [None]:
!pip install phonemizer pandas

Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting segments (from phonemizer)
  Downloading segments-2.3.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-2.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting csvw>=1.5.6 (from segments->phonemizer)
  Downloading csvw-3.7.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting isodate (from csvw>=1.5.6->segments->phonemizer)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting rfc3986<2 (from csvw>=1.5.6->segments->phonemizer)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting language-tags (from csvw>=1.5.6->segments->phonemizer)
  Downloading language_tags-1.2.0-py3-none-a

In [None]:
import pandas as pd
from phonemizer import phonemize
import gc
import os

# ---------------------------------------------------------------------------
# Batch phonemization of a TSV file (German, espeak backend).
#
# Reads TSV_PATH, keeps the rows whose 'sentence' column contains text,
# converts every sentence to phonemes and appends each finished batch to
# OUTPUT_TSV, so batches that completed are already on disk if a later batch
# fails. A failing batch is reported and skipped (best effort); the summary at
# the end lists every batch that was lost instead of claiming full success.
# ---------------------------------------------------------------------------
TSV_PATH = r"/content/data/common_voice_de_test_phoneme.tsv"
OUTPUT_TSV = r"/content/common_voice_de_test_phoneme.tsv"
BATCH_SIZE = 1000

print("Loading TSV file...")
df = pd.read_csv(TSV_PATH, sep='\t')

# Fail early with a readable message rather than a bare KeyError further down.
if 'sentence' not in df.columns:
    raise KeyError(
        f"'sentence' column not found in {TSV_PATH}; "
        f"available columns: {list(df.columns)}"
    )

# Keep only rows that actually carry text. Whitespace-only sentences have to be
# removed together with NaN: phonemize() ignores blank input lines by default,
# so its result would be shorter than the batch and the column assignment
# below would fail, losing the whole batch.
has_text = df['sentence'].notna() & (df['sentence'].astype(str).str.strip() != "")
df = df[has_text].reset_index(drop=True)
print(f"Processing {len(df)} rows")

total_batches = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE

# Process and save incrementally
first_batch = True      # True until a batch has been written during this run
total_saved = 0         # rows written to OUTPUT_TSV during this run
failed_batches = []     # 1-based numbers of the batches that raised

for batch_idx in range(0, len(df), BATCH_SIZE):
    batch_num = batch_idx // BATCH_SIZE + 1
    batch_end = min(batch_idx + BATCH_SIZE, len(df))
    batch_df = df.iloc[batch_idx:batch_end].copy()

    print(f"\nBatch {batch_num}/{total_batches} "
          f"(rows {batch_idx}-{batch_end})")

    try:
        # phonemize() expects strings; a purely numeric sentence may have been
        # parsed as a number by read_csv, so convert explicitly.
        sentences = batch_df['sentence'].astype(str).tolist()

        # Phonemize batch
        batch_phonemes = phonemize(
            sentences,
            language='de',
            backend='espeak',
            strip=True,
            preserve_punctuation=False,
            with_stress=False,
            njobs=4
        )

        # Guard the 1:1 alignment between sentences and phonemes explicitly so
        # a mismatch is reported clearly instead of as a pandas length error.
        if len(batch_phonemes) != len(batch_df):
            raise ValueError(
                f"phonemize returned {len(batch_phonemes)} results "
                f"for {len(batch_df)} sentences"
            )

        batch_df['phonemes'] = batch_phonemes
        # Sentences that produced no phonemes are dropped.
        batch_df = batch_df[batch_df['phonemes'] != ""]

        # The first successful batch creates/overwrites the file and writes the
        # header; every later batch is appended without a header.
        batch_df.to_csv(
            OUTPUT_TSV,
            sep='\t',
            index=False,
            mode='w' if first_batch else 'a',
            header=first_batch,
            encoding='utf-8'
        )

        first_batch = False
        total_saved += len(batch_df)
        print(f"  ✓ Saved {len(batch_df)} samples")

        # Clear memory
        del batch_df, batch_phonemes, sentences
        gc.collect()

    except Exception as e:
        # Best effort: keep going with the next batch, but remember the loss.
        failed_batches.append(batch_num)
        print(f"  ⚠ Error: {e}")

if first_batch:
    # Nothing was written in this run. Do not read OUTPUT_TSV: it either does
    # not exist or is left over from an earlier run.
    final_df = pd.DataFrame()
    print(f"\n⚠ No batch was saved; {OUTPUT_TSV} was not written by this run")
else:
    if failed_batches:
        print(f"\n⚠ Finished with {len(failed_batches)} failed batch(es): "
              f"{failed_batches}. Partial output saved to: {OUTPUT_TSV}")
    else:
        print(f"\n✓ Complete! Saved to: {OUTPUT_TSV}")

    # Show final count (read back from disk as a sanity check of the file)
    final_df = pd.read_csv(OUTPUT_TSV, sep='\t')
    print(f"Total phonemized samples: {len(final_df)}")
    if len(final_df) != total_saved:
        print(f"⚠ Row count on disk ({len(final_df)}) differs from rows "
              f"written in this run ({total_saved})")

Loading TSV file...
Processing 9342 rows

Batch 1/10 (rows 0-1000)




  ✓ Saved 1000 samples

Batch 2/10 (rows 1000-2000)




  ✓ Saved 1000 samples

Batch 3/10 (rows 2000-3000)




  ✓ Saved 1000 samples

Batch 4/10 (rows 3000-4000)




  ✓ Saved 1000 samples

Batch 5/10 (rows 4000-5000)




  ✓ Saved 1000 samples

Batch 6/10 (rows 5000-6000)




  ✓ Saved 1000 samples

Batch 7/10 (rows 6000-7000)




  ✓ Saved 1000 samples

Batch 8/10 (rows 7000-8000)




  ✓ Saved 1000 samples

Batch 9/10 (rows 8000-9000)




  ✓ Saved 1000 samples

Batch 10/10 (rows 9000-9342)




  ✓ Saved 342 samples

✓ Complete! Saved to: /content/common_voice_de_test_phoneme.tsv
Total phonemized samples: 9342


In [None]:
# !apt-get install espeak-ng -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
The following NEW packages will be installed:
  espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0
0 upgraded, 5 newly installed, 0 to remove and 41 not upgraded.
Need to get 4,526 kB of archives.
After this operation, 11.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/main amd64 libpcaudio0 amd64 1.1-6build2 [8,956 B]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsonic0 amd64 0.2.0-11build1 [10.3 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 espeak-ng-data amd64 1.50+dfsg-10ubuntu0.1 [3,956 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libespeak-ng1 amd64 1.50+dfsg-10ubuntu0.1 [207 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 espeak-ng amd64 1.50+dfsg-1

In [None]:
# !pip install phonemizer pandas

Collecting phonemizer
  Downloading phonemizer-3.3.0-py3-none-any.whl.metadata (48 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/48.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting segments (from phonemizer)
  Downloading segments-2.3.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-2.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting csvw>=1.5.6 (from segments->phonemizer)
  Downloading csvw-3.7.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting isodate (from csvw>=1.5.6->segments->phonemizer)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Collecting rfc3986<2 (from csvw>=1.5.6->segments->phonemizer)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting language-tags (from csvw>=1.5.6->segments->phonemizer)
  Downloading language_tags-1.2.0-py3-none-a

In [None]:
# Disabled reference cell (Hugging Face data): per-sentence phonemization of every Polish TSV file in a folder

# import pandas as pd
# from phonemizer import phonemize
# import os
# from tqdm import tqdm

# # Paths
# TSV_FOLDER = r"/home/abhinav.pm/ABHI/SAL/v4/common_voice_pl_tsv_files"
# OUTPUT_FOLDER = r"/home/abhinav.pm/ABHI/SAL/v4/common_voice_pl_tsv_phonemes"

# # Create output folder
# os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# # Process each TSV file
# tsv_files = [f for f in os.listdir(TSV_FOLDER) if f.endswith('.tsv')]

# print("="*60)
# print("PHONEME CONVERSION FOR POLISH DATASET")
# print("="*60)

# for tsv_file in tsv_files:
#     print(f"\n--- Processing: {tsv_file} ---")

#     # Paths
#     input_path = os.path.join(TSV_FOLDER, tsv_file)
#     output_path = os.path.join(OUTPUT_FOLDER, tsv_file.replace('.tsv', '_phonemes.tsv'))

#     # Load TSV
#     print("Loading TSV file...")
#     df = pd.read_csv(input_path, sep='\t')
#     print(f"Loaded {len(df)} rows")

#     # Check if 'sentence' column exists
#     if 'sentence' not in df.columns:
#         print(f"⚠ Warning: No 'sentence' column found. Skipping {tsv_file}")
#         print(f"Available columns: {list(df.columns)}")
#         continue

#     print("\nConverting sentences to phonemes (Polish - pl)...")
#     print("="*60)

#     # Convert to phonemes with progress bar
#     phoneme_list = []
#     error_count = 0
#     empty_count = 0

#     for idx, sentence in tqdm(enumerate(df['sentence']), total=len(df), desc="Converting"):
#         # Handle empty/NaN sentences
#         if pd.isna(sentence) or str(sentence).strip() == "":
#             phoneme_list.append("")
#             empty_count += 1
#             continue

#         try:
#             phonemes = phonemize(
#                 str(sentence),
#                 language='pl',  # FIXED: Changed from 'de' to 'pl' for Polish
#                 backend='espeak',
#                 strip=True,
#                 preserve_punctuation=False,
#                 with_stress=False
#             )
#             phoneme_list.append(phonemes)

#             # Show first 3 examples
#             if idx < 3:
#                 print(f"\nExample {idx + 1}:")
#                 print(f"  Text:    {sentence}")
#                 print(f"  Phoneme: {phonemes}")

#         except Exception as e:
#             if error_count == 0:  # Print first error for debugging
#                 print(f"\n⚠ Error at row {idx}: {e}")
#             phoneme_list.append("")
#             error_count += 1

#     # Add phonemes column
#     df['phonemes'] = phoneme_list

#     # Remove rows with empty phonemes
#     original_len = len(df)
#     df = df[df['phonemes'] != ""]
#     removed_count = original_len - len(df)

#     # Save to TSV
#     df.to_csv(output_path, sep='\t', index=False)

#     # Summary
#     print(f"\n{'='*60}")
#     print(f"Summary for {tsv_file}:")
#     print(f"  Original rows: {original_len:,}")
#     print(f"  Empty sentences: {empty_count:,}")
#     print(f"  Conversion errors: {error_count:,}")
#     print(f"  Rows removed: {removed_count:,}")
#     print(f"  Final rows: {len(df):,}")
#     print(f"  Output: {os.path.basename(output_path)}")
#     print(f"  Size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")
#     print(f"{'='*60}")

# print("\n" + "="*60)
# print("ALL FILES PROCESSED")
# print("="*60)
# print(f"Output folder: {OUTPUT_FOLDER}\n")

# # Show final summary
# print("Created files:")
# for file in sorted(os.listdir(OUTPUT_FOLDER)):
#     if file.endswith('.tsv'):
#         file_path = os.path.join(OUTPUT_FOLDER, file)
#         file_size = os.path.getsize(file_path)

#         # Count lines
#         with open(file_path, 'r', encoding='utf-8') as f:
#             line_count = sum(1 for _ in f) - 1  # Subtract header

#         print(f"  {file}")
#         print(f"    Size: {file_size / (1024*1024):.2f} MB")
#         print(f"    Rows: {line_count:,}")
#         print()

# print("="*60)

In [None]:
# Disabled reference cell (Hugging Face data): batched phonemization of a single Polish TSV file

# import pandas as pd
# from phonemizer import phonemize
# import gc
# import os
# from tqdm import tqdm

# # Paths
# TSV_PATH = r"/content/data1/invalidated.tsv"
# OUTPUT_TSV = r"/content/common_voice_pl_invalidated_phoneme.tsv"

# print("="*60)
# print("BATCH PHONEME CONVERSION - POLISH")
# print("="*60)

# print("\nLoading TSV file...")
# df = pd.read_csv(TSV_PATH, sep='\t')

# # Check if sentence column exists
# if 'sentence' not in df.columns:
#     print(f"⚠ Error: 'sentence' column not found!")
#     print(f"Available columns: {list(df.columns)}")
#     exit(1)

# # Filter and clean
# initial_count = len(df)
# df = df[df['sentence'].notna()].reset_index(drop=True)
# # Also filter out empty strings
# df = df[df['sentence'].astype(str).str.strip() != ""].reset_index(drop=True)

# print(f"Initial rows: {initial_count:,}")
# print(f"After filtering empty/NaN: {len(df):,}")
# print(f"Removed: {initial_count - len(df):,}")

# # Configuration
# BATCH_SIZE = 1000
# total_batches = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE

# print(f"\nBatch size: {BATCH_SIZE}")
# print(f"Total batches: {total_batches}")
# print(f"Language: Polish (pl)")
# print("="*60)

# # Process and save incrementally
# first_batch = True
# total_saved = 0
# total_errors = 0
# total_empty_phonemes = 0

# for batch_idx in range(0, len(df), BATCH_SIZE):
#     batch_num = batch_idx // BATCH_SIZE + 1
#     batch_end = min(batch_idx + BATCH_SIZE, len(df))
#     batch_df = df.iloc[batch_idx:batch_end].copy()

#     print(f"\n[Batch {batch_num}/{total_batches}] Processing rows {batch_idx:,} to {batch_end:,}")

#     try:
#         # Convert sentences to list and ensure they're strings
#         sentences = batch_df['sentence'].astype(str).tolist()

#         # Phonemize batch
#         batch_phonemes = phonemize(
#             sentences,
#             language='pl',  # Polish language
#             backend='espeak',
#             strip=True,
#             preserve_punctuation=False,
#             with_stress=False,
#             njobs=4  # Parallel processing
#         )

#         # Handle case where phonemize returns a single string instead of list
#         if isinstance(batch_phonemes, str):
#             batch_phonemes = [batch_phonemes]

#         batch_df['phonemes'] = batch_phonemes

#         # Count empty phonemes before filtering
#         empty_in_batch = (batch_df['phonemes'] == "").sum()
#         total_empty_phonemes += empty_in_batch

#         # Filter out empty phonemes
#         batch_df = batch_df[batch_df['phonemes'] != ""]

#         # Show first example from first batch
#         if batch_num == 1 and len(batch_df) > 0:
#             print(f"\n  Example from first batch:")
#             print(f"    Text:    {batch_df.iloc[0]['sentence']}")
#             print(f"    Phoneme: {batch_df.iloc[0]['phonemes']}")

#         # Append to output file
#         batch_df.to_csv(
#             OUTPUT_TSV,
#             sep='\t',
#             index=False,
#             mode='w' if first_batch else 'a',
#             header=first_batch,
#             encoding='utf-8'  # Ensure proper encoding
#         )

#         first_batch = False
#         total_saved += len(batch_df)

#         print(f"  ✓ Processed: {len(batch_df):,} samples saved")
#         if empty_in_batch > 0:
#             print(f"  ⚠ Empty phonemes: {empty_in_batch:,} (filtered out)")

#         # Clear memory
#         del batch_df, batch_phonemes, sentences
#         gc.collect()

#     except Exception as e:
#         print(f"  ✗ Error in batch {batch_num}: {str(e)[:200]}")
#         total_errors += 1

#         # Try to save partial results if error occurs
#         try:
#             if 'batch_df' in locals() and len(batch_df) > 0:
#                 batch_df.to_csv(
#                     OUTPUT_TSV,
#                     sep='\t',
#                     index=False,
#                     mode='w' if first_batch else 'a',
#                     header=first_batch
#                 )
#                 first_batch = False
#                 print(f"  ⚠ Saved partial batch before error")
#         except:
#             pass

# print("\n" + "="*60)
# print("PROCESSING COMPLETE")
# print("="*60)

# # Show final statistics
# if os.path.exists(OUTPUT_TSV):
#     try:
#         final_df = pd.read_csv(OUTPUT_TSV, sep='\t')
#         file_size = os.path.getsize(OUTPUT_TSV)

#         print(f"\nOutput file: {OUTPUT_TSV}")
#         print(f"File size: {file_size / (1024*1024):.2f} MB")
#         print(f"\nStatistics:")
#         print(f"  Input rows: {initial_count:,}")
#         print(f"  Valid sentences: {len(df):,}")
#         print(f"  Successfully phonemized: {len(final_df):,}")
#         print(f"  Empty phonemes (filtered): {total_empty_phonemes:,}")
#         print(f"  Batch errors: {total_errors}")
#         print(f"  Success rate: {len(final_df)/len(df)*100:.2f}%")

#         # Show column names
#         print(f"\nColumns in output: {list(final_df.columns)}")

#         # Show sample
#         if len(final_df) > 0:
#             print(f"\n--- Sample Output (first row) ---")
#             print(f"Sentence: {final_df.iloc[0]['sentence']}")
#             print(f"Phonemes: {final_df.iloc[0]['phonemes']}")

#         del final_df

#     except Exception as e:
#         print(f"\n⚠ Could not read final file: {e}")
#         print(f"But file exists at: {OUTPUT_TSV}")
# else:
#     print(f"\n⚠ Warning: Output file not created!")

# print("\n" + "="*60)

BATCH PHONEME CONVERSION - POLISH

Loading TSV file...
Initial rows: 6,906
After filtering empty/NaN: 6,906
Removed: 0

Batch size: 1000
Total batches: 7
Language: Polish (pl)

[Batch 1/7] Processing rows 0 to 1,000





  Example from first batch:
    Text:    Jakiś stół, ława, dwa zydle i łóżko.
    Phoneme: jakiɕ stuw wava dva zɨdlɛ i wuʃkɔ
  ✓ Processed: 1,000 samples saved

[Batch 2/7] Processing rows 1,000 to 2,000




  ✓ Processed: 1,000 samples saved

[Batch 3/7] Processing rows 2,000 to 3,000
  ✓ Processed: 1,000 samples saved





[Batch 4/7] Processing rows 3,000 to 4,000
  ✓ Processed: 1,000 samples saved

[Batch 5/7] Processing rows 4,000 to 5,000




  ✓ Processed: 1,000 samples saved

[Batch 6/7] Processing rows 5,000 to 6,000




  ✓ Processed: 1,000 samples saved

[Batch 7/7] Processing rows 6,000 to 6,906
  ✓ Processed: 906 samples saved

PROCESSING COMPLETE

Output file: /content/common_voice_pl_invalidated_phoneme.tsv
File size: 4.04 MB

Statistics:
  Input rows: 6,906
  Valid sentences: 6,906
  Successfully phonemized: 6,906
  Empty phonemes (filtered): 0
  Batch errors: 0
  Success rate: 100.00%

Columns in output: ['client_id', 'path', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant', 'audio_path', 'phonemes']

--- Sample Output (first row) ---
Sentence: Jakiś stół, ława, dwa zydle i łóżko.
Phonemes: jakiɕ stuw wava dva zɨdlɛ i wuʃkɔ

