# Load data

In [None]:
# Mount Google Drive in this Colab runtime; the zipped audio datasets extracted
# below are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import zipfile
import os
import shutil
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

def unzip_files(zip_file_path, destination_folder):
    """Extract every member of a zip archive into a folder, with a progress bar.

    Args:
        zip_file_path: Path of the ``.zip`` archive to read.
        destination_folder: Folder that receives the extracted members. It is
            created (including missing parents) when it does not exist yet.
    """
    # exist_ok avoids the check-then-create race and keeps re-runs harmless.
    os.makedirs(destination_folder, exist_ok=True)

    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        # Read the member list once; it drives both the bar total and the loop.
        members = zip_ref.namelist()

        # Extract member by member (rather than extractall) so tqdm can report progress.
        for member in tqdm(members, total=len(members), desc=f'Extracting {zip_file_path}', unit='files'):
            zip_ref.extract(member, destination_folder)

In [None]:
# To start from a clean extraction, remove the previous folders first:
# shutil.rmtree('OriginalAudios')
# shutil.rmtree('ClonedAudiosout')

# Each archive on Drive is unpacked into a local folder named after the archive.
for dataset_name in ('OriginalAudios', 'ClonedAudiosout'):
    unzip_files(f'/content/drive/MyDrive/Colab Notebooks/TFM/Exp2/{dataset_name}.zip', dataset_name)

Extracting /content/drive/MyDrive/Colab Notebooks/TFM/Exp2/OriginalAudios.zip: 100%|██████████| 7171/7171 [00:39<00:00, 181.04files/s]
Extracting /content/drive/MyDrive/Colab Notebooks/TFM/Exp2/ClonedAudiosout.zip: 100%|██████████| 13355/13355 [01:54<00:00, 116.53files/s]


# Obtain audio durations

In [None]:
# Load the metadata of the converted clips.
metadata_df = pd.read_csv('metadata_so_vits.csv')

# The reference clips were extracted into 'OriginalAudios/', so rewrite the
# 'ClonedAudios/' prefix stored in the CSV to point at that folder.
reference_paths = metadata_df['Reference audio path']
metadata_df['Reference audio path'] = reference_paths.str.replace('ClonedAudios/', 'OriginalAudios/')
metadata_df

Unnamed: 0,Reference audio path,Synthesized audio path,Text,Speaker ID,Original speaker ID,RTF GPU,RTF CPU,SECS
0,OriginalAudios/3922/3922_715_000006_000003.wav,ClonedAudiosout/3922/locutor4/audio_22492_2249...,—Le prepararemos un baño y le proporcionaremo...,3922,locutor4,1.242560,1.923357,0.612966
1,OriginalAudios/3922/3922_715_000023_000005.wav,ClonedAudiosout/3922/locutor4/audio_16440.out.wav,Siempre han existido personas que han gustado...,3922,locutor4,1.019879,1.592835,0.735934
2,OriginalAudios/3922/3922_715_000118_000003.wav,ClonedAudiosout/3922/locutor4/audio_5732.out.wav,Pero es evidente que los viejos relatos bíbli...,3922,locutor4,0.665137,1.212811,0.676058
3,OriginalAudios/3922/3922_715_000071_000001.wav,ClonedAudiosout/3922/locutor4/audio_17514_1751...,"Así, se dijo en mil novecientos treinta y cua...",3922,locutor4,0.760370,1.359842,0.675240
4,OriginalAudios/3922/3922_723_000015_000000.wav,ClonedAudiosout/3922/locutor4/audio_24488_2448...,Clotilde dio un manotazo a una pila de libros...,3922,locutor4,0.908556,1.298693,0.643677
...,...,...,...,...,...,...,...,...
13205,OriginalAudios/6209/6209_34601_000068_000032.wav,ClonedAudiosout/6209/locutor3/audio_1512.out.wav,"La verdad es que Pedro se preocupaba por mí, a...",6209,locutor3,0.979441,1.641814,0.838969
13206,OriginalAudios/6209/6209_34601_000091_000003.wav,ClonedAudiosout/6209/locutor3/audio_154.out.wav,"Da risa, pero todavía me acuerdo de todo eso y...",6209,locutor3,0.739581,0.978915,0.814811
13207,OriginalAudios/6209/6209_34601_000163_000012.wav,ClonedAudiosout/6209/locutor3/audio_3020.out.wav,"—Farfullaba, una y otra vez: «No quiero morir ...",6209,locutor3,0.741628,0.995618,0.729366
13208,OriginalAudios/6209/6209_34601_000096_000060.wav,ClonedAudiosout/6209/locutor3/audio_1263.out.wav,"Cuando coincidía, hablaba sin parar, yo creo q...",6209,locutor3,1.011268,1.795860,0.829923


In [None]:
import wave
def get_audio_length(file_path):
    """Return the duration of a WAV file in seconds.

    The duration is the number of audio frames divided by the frame rate
    stored in the file header.

    Args:
        file_path: Path of the WAV file to inspect.

    Returns:
        Duration in seconds as a float.
    """
    with wave.open(file_path, 'rb') as wav_reader:
        header = wav_reader.getparams()
        return header.nframes / header.framerate

In [None]:
# Use the column names shared with the ground-truth rows appended further down.
metadata_df = metadata_df.rename(columns={'Synthesized audio path': 'Audio path',
                                          'Original speaker ID': 'Source speaker ID'})

# Map every converted clip back to the utterance it was converted from, e.g.
# 'ClonedAudiosout/3922/locutor4/audio_16440.out.wav' -> 'OriginalAudios/locutor4/audio_16440.wav'.
metadata_df['Source audio path'] = (
    metadata_df['Audio path']
    # Drop the first two path components (output folder and target speaker).
    .str.replace(r'^[^/]+/[^/]+/', 'OriginalAudios/', regex=True)
    # regex=False: '.out' is a literal marker. Interpreted as a regex the dot
    # would match any character, and the default differs between pandas versions.
    .str.replace('.out', '', regex=False)
)

# Duration in seconds of every converted clip; .map does not rely on the frame
# having a 0..n-1 index the way a positional .loc loop does.
metadata_df['Duration audio'] = metadata_df['Audio path'].map(get_audio_length)
metadata_df

Unnamed: 0,Reference audio path,Audio path,Text,Speaker ID,Source speaker ID,RTF GPU,RTF CPU,SECS,Source audio path,Duration audio
0,OriginalAudios/3922/3922_715_000006_000003.wav,ClonedAudiosout/3922/locutor4/audio_22492_2249...,—Le prepararemos un baño y le proporcionaremo...,3922,locutor4,1.242560,1.923357,0.612966,OriginalAudios/locutor4/audio_22492_22493_2249...,5.270930
1,OriginalAudios/3922/3922_715_000023_000005.wav,ClonedAudiosout/3922/locutor4/audio_16440.out.wav,Siempre han existido personas que han gustado...,3922,locutor4,1.019879,1.592835,0.735934,OriginalAudios/locutor4/audio_16440.wav,6.339048
2,OriginalAudios/3922/3922_715_000118_000003.wav,ClonedAudiosout/3922/locutor4/audio_5732.out.wav,Pero es evidente que los viejos relatos bíbli...,3922,locutor4,0.665137,1.212811,0.676058,OriginalAudios/locutor4/audio_5732.wav,9.996190
3,OriginalAudios/3922/3922_715_000071_000001.wav,ClonedAudiosout/3922/locutor4/audio_17514_1751...,"Así, se dijo en mil novecientos treinta y cua...",3922,locutor4,0.760370,1.359842,0.675240,OriginalAudios/locutor4/audio_17514_17515.wav,8.463673
4,OriginalAudios/3922/3922_723_000015_000000.wav,ClonedAudiosout/3922/locutor4/audio_24488_2448...,Clotilde dio un manotazo a una pila de libros...,3922,locutor4,0.908556,1.298693,0.643677,OriginalAudios/locutor4/audio_24488_24489.wav,7.430385
...,...,...,...,...,...,...,...,...,...,...
13205,OriginalAudios/6209/6209_34601_000068_000032.wav,ClonedAudiosout/6209/locutor3/audio_1512.out.wav,"La verdad es que Pedro se preocupaba por mí, a...",6209,locutor3,0.979441,1.641814,0.838969,OriginalAudios/locutor3/audio_1512.wav,6.740363
13206,OriginalAudios/6209/6209_34601_000091_000003.wav,ClonedAudiosout/6209/locutor3/audio_154.out.wav,"Da risa, pero todavía me acuerdo de todo eso y...",6209,locutor3,0.739581,0.978915,0.814811,OriginalAudios/locutor3/audio_154.wav,9.080181
13207,OriginalAudios/6209/6209_34601_000163_000012.wav,ClonedAudiosout/6209/locutor3/audio_3020.out.wav,"—Farfullaba, una y otra vez: «No quiero morir ...",6209,locutor3,0.741628,0.995618,0.729366,OriginalAudios/locutor3/audio_3020.wav,9.080499
13208,OriginalAudios/6209/6209_34601_000096_000060.wav,ClonedAudiosout/6209/locutor3/audio_1263.out.wav,"Cuando coincidía, hablaba sin parar, yo creo q...",6209,locutor3,1.011268,1.795860,0.829923,OriginalAudios/locutor3/audio_1263.wav,6.420181


# Merge converted audios metadata and original audios metadata

In [None]:
# Every row loaded so far is a converted clip; tag it with the model name so it
# can be told apart from the 'Ground truth' rows appended below.
metadata_df['Model'] = 'so-vits-svc'
metadata_df

Unnamed: 0,Reference audio path,Audio path,Text,Speaker ID,Source speaker ID,RTF GPU,RTF CPU,SECS,Source audio path,Duration audio,Model
0,OriginalAudios/3922/3922_715_000006_000003.wav,ClonedAudiosout/3922/locutor4/audio_22492_2249...,—Le prepararemos un baño y le proporcionaremo...,3922,locutor4,1.242560,1.923357,0.612966,OriginalAudios/locutor4/audio_22492_22493_2249...,5.270930,so-vits-svc
1,OriginalAudios/3922/3922_715_000023_000005.wav,ClonedAudiosout/3922/locutor4/audio_16440.out.wav,Siempre han existido personas que han gustado...,3922,locutor4,1.019879,1.592835,0.735934,OriginalAudios/locutor4/audio_16440.wav,6.339048,so-vits-svc
2,OriginalAudios/3922/3922_715_000118_000003.wav,ClonedAudiosout/3922/locutor4/audio_5732.out.wav,Pero es evidente que los viejos relatos bíbli...,3922,locutor4,0.665137,1.212811,0.676058,OriginalAudios/locutor4/audio_5732.wav,9.996190,so-vits-svc
3,OriginalAudios/3922/3922_715_000071_000001.wav,ClonedAudiosout/3922/locutor4/audio_17514_1751...,"Así, se dijo en mil novecientos treinta y cua...",3922,locutor4,0.760370,1.359842,0.675240,OriginalAudios/locutor4/audio_17514_17515.wav,8.463673,so-vits-svc
4,OriginalAudios/3922/3922_723_000015_000000.wav,ClonedAudiosout/3922/locutor4/audio_24488_2448...,Clotilde dio un manotazo a una pila de libros...,3922,locutor4,0.908556,1.298693,0.643677,OriginalAudios/locutor4/audio_24488_24489.wav,7.430385,so-vits-svc
...,...,...,...,...,...,...,...,...,...,...,...
13205,OriginalAudios/6209/6209_34601_000068_000032.wav,ClonedAudiosout/6209/locutor3/audio_1512.out.wav,"La verdad es que Pedro se preocupaba por mí, a...",6209,locutor3,0.979441,1.641814,0.838969,OriginalAudios/locutor3/audio_1512.wav,6.740363,so-vits-svc
13206,OriginalAudios/6209/6209_34601_000091_000003.wav,ClonedAudiosout/6209/locutor3/audio_154.out.wav,"Da risa, pero todavía me acuerdo de todo eso y...",6209,locutor3,0.739581,0.978915,0.814811,OriginalAudios/locutor3/audio_154.wav,9.080181,so-vits-svc
13207,OriginalAudios/6209/6209_34601_000163_000012.wav,ClonedAudiosout/6209/locutor3/audio_3020.out.wav,"—Farfullaba, una y otra vez: «No quiero morir ...",6209,locutor3,0.741628,0.995618,0.729366,OriginalAudios/locutor3/audio_3020.wav,9.080499,so-vits-svc
13208,OriginalAudios/6209/6209_34601_000096_000060.wav,ClonedAudiosout/6209/locutor3/audio_1263.out.wav,"Cuando coincidía, hablaba sin parar, yo creo q...",6209,locutor3,1.011268,1.795860,0.829923,OriginalAudios/locutor3/audio_1263.wav,6.420181,so-vits-svc


In [None]:
!pip install resemblyzer
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path

encoder = VoiceEncoder()

Collecting resemblyzer
  Downloading Resemblyzer-0.1.4-py3-none-any.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
Collecting webrtcvad>=2.0.10 (from resemblyzer)
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting typing (from resemblyzer)
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.1->resemblyzer)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.1->resemblyzer)
  Using cached nvidi

Loaded the voice encoder model on cpu in 0.06 seconds.


In [None]:
import csv
import random

audios_dir = 'OriginalAudios'

# Fixed seed so the randomly drawn reference utterances (and therefore the
# ground-truth SECS values) are the same on every run.
REFERENCE_SEED = 0
reference_rng = np.random.default_rng(REFERENCE_SEED)

# One transcript file per speaker ('metadata_<speaker>.txt'). Sorted because
# os.listdir order is arbitrary and the seeded draws depend on iteration order.
metadata_txts = sorted(os.path.join(audios_dir, file) for file in os.listdir(audios_dir) if file.endswith('.txt'))

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating inside the loop copies the whole table on every iteration.
ground_truth_rows = []

for txt in tqdm(metadata_txts):

    # QUOTE_NONE: transcripts contain bare double quotes. With default quoting a
    # stray '"' opens a quoted field that swallows the following lines, which
    # merges several utterances into one 'Text' value and drops their rows.
    metadata_speaker_df = pd.read_csv(txt, sep='|', header=None, names=['Audio ID', 'Text'],
                                      quoting=csv.QUOTE_NONE)
    metadata_speaker_df['Text'] = metadata_speaker_df['Text'].str.replace('"', '', regex=False)

    # 'OriginalAudios/metadata_<speaker>.txt' -> '<speaker>'
    speaker = os.path.basename(txt).split('_')[-1].split('.txt')[0]

    audio_ids = metadata_speaker_df['Audio ID'].tolist()
    texts = metadata_speaker_df['Text'].tolist()

    for audio_id, text in zip(audio_ids, texts):

        # Reference utterance: a random clip of the same speaker.
        # NOTE(review): the draw can return the clip itself, which presumably
        # yields a SECS close to 1 - confirm whether that is intended.
        reference_audio_id = audio_ids[reference_rng.integers(len(audio_ids))]
        reference_audio_path = f'{audios_dir}/{speaker}/{reference_audio_id}.wav'

        audio_path = f'{audios_dir}/{speaker}/{audio_id}.wav'

        audio_embed = encoder.embed_utterance(preprocess_wav(Path(audio_path)))
        reference_audio_embed = encoder.embed_utterance(preprocess_wav(Path(reference_audio_path)))

        # Similarity score: dot product of the two speaker embeddings.
        secs = float(audio_embed @ reference_audio_embed)

        # Fields that only exist for converted clips (source speaker/path, RTF)
        # are left as NaN for ground-truth recordings.
        ground_truth_rows.append({'Reference audio path': reference_audio_path,
                                  'Audio path': audio_path,
                                  'Text': text,
                                  'Speaker ID': speaker,
                                  'Source speaker ID': np.nan,
                                  'RTF GPU': np.nan,
                                  'RTF CPU': np.nan,
                                  'SECS': secs,
                                  'Source audio path': np.nan,
                                  'Duration audio': get_audio_length(audio_path),
                                  'Model': 'Ground truth'})

metadata_df = pd.concat([metadata_df, pd.DataFrame(ground_truth_rows)], ignore_index=True)

100%|██████████| 12/12 [38:42<00:00, 193.58s/it]


In [None]:
# Inspect the table after the 'Ground truth' rows have been appended.
metadata_df

Unnamed: 0,Reference audio path,Audio path,Text,Speaker ID,Source speaker ID,RTF GPU,RTF CPU,SECS,Source audio path,Duration audio,Model,Source spaker ID
0,OriginalAudios/3922/3922_715_000006_000003.wav,ClonedAudiosout/3922/locutor4/audio_22492_2249...,—Le prepararemos un baño y le proporcionaremo...,3922,locutor4,1.242560,1.923357,0.612966,OriginalAudios/locutor4/audio_22492_22493_2249...,5.270930,so-vits-svc,
1,OriginalAudios/3922/3922_715_000023_000005.wav,ClonedAudiosout/3922/locutor4/audio_16440.out.wav,Siempre han existido personas que han gustado...,3922,locutor4,1.019879,1.592835,0.735934,OriginalAudios/locutor4/audio_16440.wav,6.339048,so-vits-svc,
2,OriginalAudios/3922/3922_715_000118_000003.wav,ClonedAudiosout/3922/locutor4/audio_5732.out.wav,Pero es evidente que los viejos relatos bíbli...,3922,locutor4,0.665137,1.212811,0.676058,OriginalAudios/locutor4/audio_5732.wav,9.996190,so-vits-svc,
3,OriginalAudios/3922/3922_715_000071_000001.wav,ClonedAudiosout/3922/locutor4/audio_17514_1751...,"Así, se dijo en mil novecientos treinta y cua...",3922,locutor4,0.760370,1.359842,0.675240,OriginalAudios/locutor4/audio_17514_17515.wav,8.463673,so-vits-svc,
4,OriginalAudios/3922/3922_723_000015_000000.wav,ClonedAudiosout/3922/locutor4/audio_24488_2448...,Clotilde dio un manotazo a una pila de libros...,3922,locutor4,0.908556,1.298693,0.643677,OriginalAudios/locutor4/audio_24488_24489.wav,7.430385,so-vits-svc,
...,...,...,...,...,...,...,...,...,...,...,...,...
20027,OriginalAudios/3638/3638_758_000003_000004.wav,OriginalAudios/3638/3638_696_000060_000007.wav,And what do they talk about in that momentary ...,3638,,,,0.830264,,3.089958,Ground truth,
20028,OriginalAudios/3638/3638_696_000020_000001.wav,OriginalAudios/3638/3638_696_000068_000000.wav,Joking?\n3638_696_000026_000003|Is it true tha...,3638,,,,0.712442,,0.730000,Ground truth,
20029,OriginalAudios/3638/3638_758_000022_000000.wav,OriginalAudios/3638/3638_696_000068_000024.wav,Even if parallel lines do meet and I see it my...,3638,,,,0.921110,,6.520042,Ground truth,
20030,OriginalAudios/3638/3638_758_000015_000008.wav,OriginalAudios/3638/3638_696_000038_000010.wav,"It's been going on nearly six months, and all ...",3638,,,,0.863674,,3.940000,Ground truth,


In [None]:
# Columns kept for the analysis, in display order; any other column is dropped.
column_order = ['Model', 'Reference audio path', 'Source audio path', 'Audio path',
                'Source speaker ID', 'Speaker ID', 'Duration audio',
                'RTF GPU', 'RTF CPU', 'SECS', 'Text']

# .copy(): a column-list selection is flagged by pandas as a possible copy of
# its parent frame. Taking an explicit copy keeps the later in-place drop and
# .loc assignments from triggering SettingWithCopyWarning.
metadata_df = metadata_df[column_order].copy()
metadata_df

Unnamed: 0,Model,Reference audio path,Source audio path,Audio path,Source speaker ID,Speaker ID,Duration audio,RTF GPU,RTF CPU,SECS,Text
0,so-vits-svc,OriginalAudios/3922/3922_715_000006_000003.wav,OriginalAudios/locutor4/audio_22492_22493_2249...,ClonedAudiosout/3922/locutor4/audio_22492_2249...,locutor4,3922,5.270930,1.242560,1.923357,0.612966,—Le prepararemos un baño y le proporcionaremo...
1,so-vits-svc,OriginalAudios/3922/3922_715_000023_000005.wav,OriginalAudios/locutor4/audio_16440.wav,ClonedAudiosout/3922/locutor4/audio_16440.out.wav,locutor4,3922,6.339048,1.019879,1.592835,0.735934,Siempre han existido personas que han gustado...
2,so-vits-svc,OriginalAudios/3922/3922_715_000118_000003.wav,OriginalAudios/locutor4/audio_5732.wav,ClonedAudiosout/3922/locutor4/audio_5732.out.wav,locutor4,3922,9.996190,0.665137,1.212811,0.676058,Pero es evidente que los viejos relatos bíbli...
3,so-vits-svc,OriginalAudios/3922/3922_715_000071_000001.wav,OriginalAudios/locutor4/audio_17514_17515.wav,ClonedAudiosout/3922/locutor4/audio_17514_1751...,locutor4,3922,8.463673,0.760370,1.359842,0.675240,"Así, se dijo en mil novecientos treinta y cua..."
4,so-vits-svc,OriginalAudios/3922/3922_723_000015_000000.wav,OriginalAudios/locutor4/audio_24488_24489.wav,ClonedAudiosout/3922/locutor4/audio_24488_2448...,locutor4,3922,7.430385,0.908556,1.298693,0.643677,Clotilde dio un manotazo a una pila de libros...
...,...,...,...,...,...,...,...,...,...,...,...
20027,Ground truth,OriginalAudios/3638/3638_758_000003_000004.wav,,OriginalAudios/3638/3638_696_000060_000007.wav,,3638,3.089958,,,0.830264,And what do they talk about in that momentary ...
20028,Ground truth,OriginalAudios/3638/3638_696_000020_000001.wav,,OriginalAudios/3638/3638_696_000068_000000.wav,,3638,0.730000,,,0.712442,Joking?\n3638_696_000026_000003|Is it true tha...
20029,Ground truth,OriginalAudios/3638/3638_758_000022_000000.wav,,OriginalAudios/3638/3638_696_000068_000024.wav,,3638,6.520042,,,0.921110,Even if parallel lines do meet and I see it my...
20030,Ground truth,OriginalAudios/3638/3638_758_000015_000008.wav,,OriginalAudios/3638/3638_696_000038_000010.wav,,3638,3.940000,,,0.863674,"It's been going on nearly six months, and all ..."


# Remove repeated audio

In [None]:
# Row labels 11991..12000 (inclusive) hold the repeated audio.
# WARNING: run this cell only once per kernel session - the index is reset
# afterwards, so running it again would remove ten different rows.
repeated_rows = list(range(11991, 12001))

metadata_df.drop(index=repeated_rows, inplace=True)
metadata_df.reset_index(drop=True, inplace=True)
metadata_df

Unnamed: 0,Model,Reference audio path,Source audio path,Audio path,Source speaker ID,Speaker ID,Duration audio,RTF GPU,RTF CPU,SECS,Text
0,so-vits-svc,OriginalAudios/3922/3922_715_000006_000003.wav,OriginalAudios/locutor4/audio_22492_22493_2249...,ClonedAudiosout/3922/locutor4/audio_22492_2249...,locutor4,3922,5.270930,1.242560,1.923357,0.612966,—Le prepararemos un baño y le proporcionaremo...
1,so-vits-svc,OriginalAudios/3922/3922_715_000023_000005.wav,OriginalAudios/locutor4/audio_16440.wav,ClonedAudiosout/3922/locutor4/audio_16440.out.wav,locutor4,3922,6.339048,1.019879,1.592835,0.735934,Siempre han existido personas que han gustado...
2,so-vits-svc,OriginalAudios/3922/3922_715_000118_000003.wav,OriginalAudios/locutor4/audio_5732.wav,ClonedAudiosout/3922/locutor4/audio_5732.out.wav,locutor4,3922,9.996190,0.665137,1.212811,0.676058,Pero es evidente que los viejos relatos bíbli...
3,so-vits-svc,OriginalAudios/3922/3922_715_000071_000001.wav,OriginalAudios/locutor4/audio_17514_17515.wav,ClonedAudiosout/3922/locutor4/audio_17514_1751...,locutor4,3922,8.463673,0.760370,1.359842,0.675240,"Así, se dijo en mil novecientos treinta y cua..."
4,so-vits-svc,OriginalAudios/3922/3922_723_000015_000000.wav,OriginalAudios/locutor4/audio_24488_24489.wav,ClonedAudiosout/3922/locutor4/audio_24488_2448...,locutor4,3922,7.430385,0.908556,1.298693,0.643677,Clotilde dio un manotazo a una pila de libros...
...,...,...,...,...,...,...,...,...,...,...,...
20017,Ground truth,OriginalAudios/3638/3638_758_000003_000004.wav,,OriginalAudios/3638/3638_696_000060_000007.wav,,3638,3.089958,,,0.830264,And what do they talk about in that momentary ...
20018,Ground truth,OriginalAudios/3638/3638_696_000020_000001.wav,,OriginalAudios/3638/3638_696_000068_000000.wav,,3638,0.730000,,,0.712442,Joking?\n3638_696_000026_000003|Is it true tha...
20019,Ground truth,OriginalAudios/3638/3638_758_000022_000000.wav,,OriginalAudios/3638/3638_696_000068_000024.wav,,3638,6.520042,,,0.921110,Even if parallel lines do meet and I see it my...
20020,Ground truth,OriginalAudios/3638/3638_758_000015_000008.wav,,OriginalAudios/3638/3638_696_000038_000010.wav,,3638,3.940000,,,0.863674,"It's been going on nearly six months, and all ..."


# Some texts are incorrect; correct them

In [None]:
import io

# Transcripts are parsed once per speaker and reused for all of that speaker's
# rows, instead of re-reading and re-writing the file for every single row.
transcripts_by_speaker = {}


def _load_speaker_transcripts(speaker_id):
    """Return ``{audio_id: text}`` read from ``OriginalAudios/metadata_<speaker_id>.txt``.

    All double quotes are removed before parsing so that a stray quote cannot
    make the CSV parser merge several lines into one field.
    """
    # utf-8 is stated explicitly because pandas parses the content as UTF-8.
    with open(f'OriginalAudios/metadata_{speaker_id}.txt', 'r', encoding='utf-8') as file:
        text_without_quotes = file.read().replace('"', '')

    # Parse from memory; no temporary file on disk is needed.
    speaker_df = pd.read_csv(io.StringIO(text_without_quotes), sep='|', header=None,
                             names=['Audio ID', 'Text'])

    # When an ID is listed more than once, keep its first transcript.
    speaker_df = speaker_df.drop_duplicates(subset='Audio ID', keep='first')
    return dict(zip(speaker_df['Audio ID'], speaker_df['Text']))


corrected_texts = []

row_fields = zip(metadata_df['Model'], metadata_df['Source speaker ID'],
                 metadata_df['Speaker ID'], metadata_df['Audio path'])

for model, source_speaker_id, speaker_id, audio_path in tqdm(row_fields, total=len(metadata_df)):

    # A converted clip is looked up in its source speaker's transcript file;
    # every other row is looked up in the file of its own speaker.
    transcript_owner = source_speaker_id if model == 'so-vits-svc' else speaker_id

    if transcript_owner not in transcripts_by_speaker:
        transcripts_by_speaker[transcript_owner] = _load_speaker_transcripts(transcript_owner)
    transcripts = transcripts_by_speaker[transcript_owner]

    # 'dir/.../audio_16440.out.wav' -> 'audio_16440'
    audio_id = audio_path.split('/')[-1]
    audio_id = audio_id.replace('.wav', '')
    audio_id = audio_id.replace('.out', '')

    if audio_id not in transcripts:
        raise KeyError(f'No transcript for audio {audio_id!r} in OriginalAudios/metadata_{transcript_owner}.txt')

    corrected_texts.append(transcripts[audio_id])

# Positional assignment: one corrected text per row, in row order.
metadata_df['Text'] = corrected_texts

metadata_df

100%|██████████| 20022/20022 [02:15<00:00, 147.32it/s]


Unnamed: 0,Model,Reference audio path,Source audio path,Audio path,Source speaker ID,Speaker ID,Duration audio,RTF GPU,RTF CPU,SECS,Text
0,so-vits-svc,OriginalAudios/3922/3922_715_000006_000003.wav,OriginalAudios/locutor4/audio_22492_22493_2249...,ClonedAudiosout/3922/locutor4/audio_22492_2249...,locutor4,3922,5.270930,1.242560,1.923357,0.612966,—Le prepararemos un baño y le proporcionaremo...
1,so-vits-svc,OriginalAudios/3922/3922_715_000023_000005.wav,OriginalAudios/locutor4/audio_16440.wav,ClonedAudiosout/3922/locutor4/audio_16440.out.wav,locutor4,3922,6.339048,1.019879,1.592835,0.735934,Siempre han existido personas que han gustado...
2,so-vits-svc,OriginalAudios/3922/3922_715_000118_000003.wav,OriginalAudios/locutor4/audio_5732.wav,ClonedAudiosout/3922/locutor4/audio_5732.out.wav,locutor4,3922,9.996190,0.665137,1.212811,0.676058,Pero es evidente que los viejos relatos bíbli...
3,so-vits-svc,OriginalAudios/3922/3922_715_000071_000001.wav,OriginalAudios/locutor4/audio_17514_17515.wav,ClonedAudiosout/3922/locutor4/audio_17514_1751...,locutor4,3922,8.463673,0.760370,1.359842,0.675240,"Así, se dijo en mil novecientos treinta y cua..."
4,so-vits-svc,OriginalAudios/3922/3922_723_000015_000000.wav,OriginalAudios/locutor4/audio_24488_24489.wav,ClonedAudiosout/3922/locutor4/audio_24488_2448...,locutor4,3922,7.430385,0.908556,1.298693,0.643677,Clotilde dio un manotazo a una pila de libros...
...,...,...,...,...,...,...,...,...,...,...,...
20017,Ground truth,OriginalAudios/3638/3638_758_000003_000004.wav,,OriginalAudios/3638/3638_696_000060_000007.wav,,3638,3.089958,,,0.830264,And what do they talk about in that momentary ...
20018,Ground truth,OriginalAudios/3638/3638_696_000020_000001.wav,,OriginalAudios/3638/3638_696_000068_000000.wav,,3638,0.730000,,,0.712442,Joking?
20019,Ground truth,OriginalAudios/3638/3638_758_000022_000000.wav,,OriginalAudios/3638/3638_696_000068_000024.wav,,3638,6.520042,,,0.921110,Even if parallel lines do meet and I see it my...
20020,Ground truth,OriginalAudios/3638/3638_758_000015_000008.wav,,OriginalAudios/3638/3638_696_000038_000010.wav,,3638,3.940000,,,0.863674,"It's been going on nearly six months, and all ..."


In [None]:
# Persist the final table; index=False keeps the row index out of the CSV.
metadata_df.to_csv('metadata_with_secs.csv', index=False)

In [None]:
# Download the CSV written by the previous cell from the Colab runtime.
from google.colab import files
files.download('metadata_with_secs.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>