In [1]:
import os
import shutil
from time import time

import pandas as pd
import torch
import torchaudio
from TTS.api import TTS

from text_split_and import split_text_into_chunks


%run text_split_and.py

Chunk: Your input text here This is an example text to split How does it handle, longer sentences, you may ask? Let's find out!
Length: 120

Chunk: Your input text here This is an example text to split How does it handle, longer sentences, you may ask? Let's find out!
Length: 120



In [2]:
def load_replacements(file_path):
    """Load text replacements from an Excel file.

    The sheet must provide an ``original`` column (text to search for) and a
    ``replace`` column (text to substitute).

    Args:
        file_path: Path of the Excel workbook, passed to ``pd.read_excel``.

    Returns:
        dict[str, str]: Mapping of search text to replacement text, in sheet
        order. If a search text appears more than once, the last row wins.

    Raises:
        KeyError: If the ``original`` or ``replace`` column is missing.
    """
    df = pd.read_excel(file_path)
    replacements = {}
    for original, replace in zip(df['original'], df['replace']):
        # Empty cells are read as NaN. A NaN key would crash str.replace later,
        # and an empty search string would insert the replacement between
        # every character, so such rows are skipped.
        if pd.isna(original) or str(original) == "":
            continue
        # An empty replacement cell means "remove the word"; values are coerced
        # to str so numeric cells remain usable with str.replace.
        replacements[str(original)] = "" if pd.isna(replace) else str(replace)
    return replacements

def replace_text(text, replacements):
    """Apply every replacement in ``replacements`` to ``text``.

    Substitutions run one after another in the mapping's iteration order, so
    a later entry can match text produced by an earlier one.

    Args:
        text: The string to transform.
        replacements: Mapping of search text to replacement text.

    Returns:
        The string obtained after all substitutions were applied.
    """
    result = text
    for needle in replacements:
        result = result.replace(needle, replacements[needle])
    return result

In [3]:
# Pick the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [4]:
# Read the word-replacement table from the Excel workbook
# (the path is resolved relative to the current working directory).
excel_file = "replaces_words.xlsx"
replacements = load_replacements(file_path=excel_file)

In [5]:

# Folder whose contents are wiped before a new run.
output_path = 'output_es_short'


def clear_directory(directory):
    """Delete everything inside ``directory`` while keeping the directory itself.

    Real sub-directories are removed recursively. Anything else (regular
    files, symbolic links, broken links) is unlinked, so a symlink pointing
    at a directory is removed without touching the files it points to.

    Args:
        directory: Path of an existing directory to empty.
    """
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        # isdir() follows symlinks, so links are excluded explicitly:
        # shutil.rmtree refuses symlinks and walking into one would delete
        # data that lives outside ``directory``.
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)


# Empty the output folder if it is already there.
if os.path.exists(output_path):
    clear_directory(output_path)
else:
    print(f"The folder {output_path} does not exist.")

In [6]:
# Make sure the output folder exists again after its contents were removed.
os.makedirs(output_path, exist_ok=True)

# Input folder holding the .txt files to process.
folder_path = '/mnt/D8E84E4DE84E2A58/Env_python/Create_video_news/0_create text/text/es/shorts/'


def _numbered_first(file_name):
    """Sort key that orders purely numeric file names by their numeric value.

    A plain lexicographic sort puts "10.txt" before "2.txt". Names whose stem
    is a number are ordered numerically and come first; all other names
    follow in lexicographic order.
    """
    stem = os.path.splitext(file_name)[0]
    if stem.isdecimal():
        return (0, int(stem), file_name)
    return (1, 0, file_name)


# .txt files of the input folder, in a stable, numeric-aware order.
files = sorted(
    (f for f in os.listdir(folder_path) if f.endswith('.txt')),
    key=_numbered_first,
)

In [7]:
# Load the TTS model once and move it to the selected device.
tts = TTS("xtts").to(device)
# Alternative, using the full model identifier:
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Reference recording passed as ``speaker_wav`` for every chunk.
speaker_wav_path = "/mnt/D8E84E4DE84E2A58/Env_python/Machine_learing_Test/0_Create_audio/data/wavs/complete/sample_1.wav"

for i, file in enumerate(files):
    file_path = os.path.join(folder_path, file)
    # One output sub-folder per input file, numbered from 1.
    new_folder_path = os.path.join(output_path, f'{i+1}')
    os.makedirs(new_folder_path, exist_ok=True)
    print(f"Procesando: {file} en {new_folder_path}")

    # Read the content of the .txt file.
    with open(file_path, 'r', encoding='utf-8') as f:
        text_content = f.read()

    text = replace_text(text_content, replacements)

    # Split the text into chunks that are synthesized one at a time.
    separated_input = split_text_into_chunks(text)

    # Audio clips of this file, in chunk order.
    all_audio_parts = []
    sample_rate = None

    # Synthesize one wav file per chunk.
    for index, chunk_text in enumerate(separated_input):
        audio_file_path = os.path.join(new_folder_path, f"{index}.wav")
        tts.tts_to_file(
            text=chunk_text,
            speaker_wav=speaker_wav_path,
            language="es",
            temperature=0.9,
            file_path=audio_file_path
        )
        print(f"Audio generado: {audio_file_path}")
        if device == "cuda":
            torch.cuda.empty_cache()  # Release cached GPU memory between chunks
        # Keep the sample rate reported by the file instead of assuming one.
        audio_part, sample_rate = torchaudio.load(audio_file_path)
        all_audio_parts.append(audio_part)

    # torch.cat raises on an empty list, so files without any chunk are skipped.
    if not all_audio_parts:
        print(f"Sin fragmentos de texto en {file}; no se genera audio final.")
        continue

    # Concatenate all clips along the last dimension of the loaded tensors.
    concatenated_audio = torch.cat(all_audio_parts, dim=-1)

    # Save the concatenated audio with the sample rate of the generated clips.
    final_audio_path = os.path.join(new_folder_path, f"audio_final_{i+1}.wav")
    torchaudio.save(final_audio_path, concatenated_audio, sample_rate=sample_rate)
    print(f"Audio final guardado en: {final_audio_path}")


print(f"Se procesaron {len(files)} archivos y se generaron carpetas correspondientes en {output_path}.")

  from .autonotebook import tqdm as notebook_tqdm


 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)
GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Procesando: 1.txt en output_es_short/1
Chunk: Resumen de la semana
Length: 20

 > Text splitted to sentences.
['Resumen de la semana']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 > Processing time: 0.9950864315032959
 > Real-time factor: 0.3725491682736973
Audio generado: output_es_short/1/0.wav
Audio final guardado en: output_es_short/1/audio_final_1.wav
Procesando: 2.txt en output_es_short/2
Chunk: La supuesta GPU Arc B580 de Intel ha aparecido en Geekbench, según los rumores Cuenta con 20 núcleos Xe,
Length: 104

Chunk: 12 Gigabyte de BI RAM y un reloj de impulso de 285 GHz Sin embargo, su rendimiento en el benchmark fue decepcionante,
Length: 117

Chunk: obteniendo solo 78,743 puntos, siendo más lenta que la A580
Length: 59

 > Text splitted to sentences.
['La supuesta GPU Arc B580 de Intel ha aparecido en Geekbench, según los rumores Cuenta con 20 núcleos Xe,']
 > Processing time: 2.1685121059417725
 > Real-time factor: 0.19077438531765115
Audio generado: output_es_short/2/0.wav
 > Text splitted to sentences.
['12 Gigabyte de BI RAM y un reloj de impulso de 285 GHz Sin embargo, su rendimiento en el benchmark fue decepcionante,']
 > Processing time: 2.1867