In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from IPython.display import Audio
import numpy as np
import nltk
from bark.generation import generate_text_semantic, preload_models
from bark.api import semantic_to_waveform
from bark import generate_audio, SAMPLE_RATE
import soundfile as sf
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Call Bark's preload_models() once, before any handler that generates audio runs.
# NOTE(review): presumably this loads the Bark model weights so that later
# generate_text_semantic / semantic_to_waveform calls do not pay the load cost —
# confirm against bark.generation.
preload_models()

In [3]:
# Speaker tables for the "Customized Story" tab.
# Each entry is a (language label, Bark voice preset id) pair. The lists are
# handed to the speaker dropdown as its `choices`, and the selected value is
# passed to Bark as `history_prompt` in text_to_speech().
# NOTE(review): Gradio is expected to treat a tuple choice as
# (display name, value) — confirm for the installed Gradio version.
male_speakers = [
    ("English", "v2/en_speaker_6"),
    ("German", "v2/de_speaker_6"),
    ("Italian", "v2/it_speaker_4"),
    ("Korean", "v2/ko_speaker_9"),
    ("Japanese", "v2/ja_speaker_2"),
    ("Chinese", "v2/zh_speaker_8"),
]
female_speakers = [
    ("English", "v2/en_speaker_9"),
    ("German", "v2/de_speaker_3"),
    ("Italian", "v2/it_speaker_9"),
    ("Korean", "v2/ko_speaker_0"),
    ("Japanese", "v2/ja_speaker_3"),
    ("Chinese", "v2/zh_speaker_9"),
]

# Concatenation of both tables. Note that the language labels repeat between
# the two lists; in the visible notebook this name is only referenced from
# commented-out code.
all_speakers = male_speakers + female_speakers

In [4]:
# Preset voices offered in the "Tell Story with Different Voices" tab.
# Each entry is a (display name, checkpoint path) pair; the list is used as the
# `choices` of the star_rail_pt dropdown and the selected path is forwarded to
# main_diff.py as the `-diff` argument by change_voice().
# NOTE(review): these are absolute paths on the author's machine — they must
# exist locally for the presets to work.
StarRail = [
    ("Himeko", r"D:\Workspace\模組\Star-Rail\himeko\himeko.pt"),
    ("Dan Heng", r"D:\Workspace\模組\Star-Rail\dan-heng\dan-heng.pt")
]

## Functions

### Tab 1

In [5]:
def play_audio(story_name):
    """Look up the picture, text and narration for one of the built-in stories.

    Args:
        story_name: Title selected in the story radio group.

    Returns:
        A ``(image_path, story_text, audio_path)`` tuple when ``story_name`` is
        one of the known titles. The story text is read as UTF-8 from
        ``bark/<title>.txt``. For any other value the function returns ``None``.

    Raises:
        OSError: If the story's text file cannot be opened.
    """
    known_titles = (
        "Little-Red-Riding-Hood",
        "The-Three-Little-Pigs",
        "Sleeping-Beauty",
        "The-Little-Match-Girl",
        "The-Ugly-Duckling",
    )
    for title in known_titles:
        if story_name == title:
            # All assets of a story share the title as their file stem.
            with open("bark/" + title + ".txt", 'r', encoding='utf-8') as handle:
                text = handle.read()
            return (
                "picture/" + title + ".png",
                text,
                "bark/English9_[]_" + title + ".wav",
            )
    return None

def update_speaker_dropdown(gender):
    """Build the dropdown update matching the selected gender.

    Args:
        gender: Value of the gender radio group ("Male", "Female", or anything
            else, e.g. ``None`` after a reset).

    Returns:
        ``gr.update(choices=...)`` carrying ``male_speakers`` for "Male",
        ``female_speakers`` for "Female", and ``None`` for any other value.
    """
    if gender == "Male":
        return gr.update(choices=male_speakers)
    if gender == "Female":
        return gr.update(choices=female_speakers)
    return gr.update(choices=None)

def clear_tab1() -> tuple:
    """Return the reset values used by the Tab 1 "Clear" button.

    The three values are applied, in order, to the outputs the button is wired
    to: the story text box (empty string), the gender radio (``None``) and the
    speaker dropdown (empty string).
    """
    return "", None, ""

def text_to_speech(script, speaker):
    """Synthesize ``script`` sentence by sentence with Bark and save it as a WAV.

    Args:
        script: Story text typed by the user. ``None``, empty and
            whitespace-only values are all treated as "no content".
        speaker: Bark voice preset passed as ``history_prompt``.

    Returns:
        ``(audio_path, message)``. ``audio_path`` is
        ``'result/generated_audio.wav'`` on success and ``None`` when the input
        is incomplete; ``message`` is the status text for the result box.
    """
    has_script = bool(script and script.strip())
    if not has_script or not speaker:
        missing = []
        if not has_script:
            missing.append("enter story content")
        if not speaker:
            missing.append("select a speaker")
        return None, "Please " + " and ".join(missing) + "."

    GEN_TEMP = 0.6
    silence = np.zeros(int(0.25 * SAMPLE_RATE))  # quarter second of silence

    pieces = []
    for sentence in nltk.sent_tokenize(script):
        semantic_tokens = generate_text_semantic(
            sentence,
            history_prompt=speaker,
            temp=GEN_TEMP,
            min_eos_p=0.05,
        )
        audio_array = semantic_to_waveform(semantic_tokens, history_prompt=speaker)
        # A short pause after every sentence keeps them from running together.
        pieces += [audio_array, silence.copy()]

    if not pieces:
        # np.concatenate raises on an empty list; report it as missing content.
        return None, "Please enter story content."

    output_path = 'result/generated_audio.wav'
    # sf.write does not create missing directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_audio = np.concatenate(pieces, axis=0)
    sf.write(output_path, combined_audio, SAMPLE_RATE)
    return output_path, "Successfully converted text to speech!"

### Tab 2

In [6]:
import subprocess

def audio_file(input_wav):
    """Return the uploaded file unchanged.

    Used as the upload callback in Tab 2 so the value of the file component is
    forwarded as-is to the audio player component.
    """
    return input_wav

def clear_tab2() -> tuple:
    """Return the reset values used by the Tab 2 "Refresh" button.

    The values map, in order, onto the outputs the button is wired to:
    preset dropdown (""), checkpoint explorer (""), uploaded file (None),
    audio preview (None), pitch slider (0), speedup slider (1) and the output
    file name ("output.wav").
    """
    return "", "", None, None, 0, 1, "output.wav"

def change_voice(input_wav, input_pt, star_rail_pt, keychange, output_wav, speedup):
    """Convert ``input_wav`` to another voice by running DDSP-SVC's ``main_diff.py``.

    Args:
        input_wav: Path of the uploaded audio file; must end in ``.wav``.
        input_pt: Selection from the checkpoint file explorer (a list of paths,
            or a single path). Takes precedence over ``star_rail_pt``.
        star_rail_pt: Checkpoint path of a preset voice, used when ``input_pt``
            is empty.
        keychange: Pitch shift forwarded as ``-k``.
        output_wav: Desired output file name; only its base name is used and an
            empty value falls back to ``"output.wav"``.
        speedup: Value forwarded as ``-speedup``.

    Returns:
        ``(message, audio_path)``. ``audio_path`` is the generated file on
        success and ``None`` whenever validation or the subprocess fails.
    """
    ddsp_root = r"D:\Workspace\模組\DDSP-SVC-master"
    result_dir = r"D:\Workspace\模組\result"

    # Validate first and stop: the original only warned and converted anyway.
    if not input_wav or os.path.splitext(str(input_wav))[1].lower() != ".wav":
        return "Please upload a .wav file.", None

    if input_pt:
        pt = input_pt[0] if isinstance(input_pt, (list, tuple)) else input_pt
        if star_rail_pt:
            gr.Info("You have selected both StarRail and DDSP-SVC. We will use DDSP-SVC to change the voice.")
    elif star_rail_pt:
        pt = star_rail_pt
    else:
        # Without this guard the literal string "None" would be passed as -diff.
        return "Please select a voice checkpoint (.pt) first.", None

    # Keep only the file name so user input cannot escape the result directory.
    output_name = os.path.basename(str(output_wav or "").strip()) or "output.wav"
    output_path = os.path.join(result_dir, output_name)
    os.makedirs(result_dir, exist_ok=True)

    print("input_wav:", input_wav, "input_pt:", input_pt, "star_rail_pt:", star_rail_pt, "keychange:", keychange, "output_wav:", output_wav, "speedup:", speedup)

    # An argument list (no shell) avoids quoting problems and shell injection
    # through user-controlled values. It also passes the method name without
    # the literal single quotes that the Windows shell would not have stripped.
    command = [
        "python", "main_diff.py",
        "-i", str(input_wav),
        "-diff", str(pt),
        "-o", output_path,
        "-k", str(keychange),
        "-speedup", str(speedup),
        "-method", "dpm-solver",
        "-kstep", "100",
    ]

    try:
        process = subprocess.run(
            command,
            text=True,
            errors="replace",
            capture_output=True,
            cwd=ddsp_root,
        )
    except OSError as exc:
        # Raised when the interpreter or the working directory cannot be found.
        error_message = f"Error: could not start the conversion process: {exc}"
        print(error_message)
        return error_message, None

    print("Standard Output:", process.stdout)
    print("Error Output:", process.stderr)

    if process.returncode != 0:
        error_message = f"Error: The subprocess returned a non-zero exit code: {process.returncode}"
        print(error_message)
        return error_message, None

    return "Successfully changed voice!", output_path

### Tab 3

In [7]:
import shutil
import ruamel.yaml
import time
import threading
stop_training_signal = threading.Event()

def move_audio(input_wav):
    """Replace the DDSP-SVC training audio with the uploaded files.

    Every regular file directly inside the train and validation audio
    directories is deleted (train first, then validation); sub-directories are
    left alone. Each path in ``input_wav`` is then moved into the train
    directory under its base name.

    Args:
        input_wav: Iterable of paths to the uploaded audio files.
    """
    train_directory = r"D:\Workspace\模組\DDSP-SVC-master\data\train\audio"
    val_directory = r"D:\Workspace\模組\DDSP-SVC-master\data\val\audio"

    # Purge leftovers from a previous run.
    for directory in (train_directory, val_directory):
        for entry in os.listdir(directory):
            candidate = os.path.join(directory, entry)
            if os.path.isfile(candidate):
                os.remove(candidate)

    # Uploaded files all go to the train directory.
    for audio in input_wav:
        shutil.move(audio, os.path.join(train_directory, os.path.basename(audio)))
    
def update_yaml_env(dir_path,epoch):
    """Point the DDSP-SVC diffusion config at a new experiment directory.

    Rewrites ``configs/diffusion-new.yaml`` so that ``env.expdir`` is
    ``exp/<dir_path>`` and ``train.epochs`` is ``epoch``, creates the
    experiment directory, and copies the base checkpoint ``exp/model_0.pt``
    into it.

    Args:
        dir_path: Name of the experiment sub-directory under ``exp``.
        epoch: Number of training epochs. Numeric values are stored as an
            integer; ``None`` leaves the value already in the config untouched.

    Raises:
        OSError: If the config or the base checkpoint cannot be read or written.
    """
    yaml_path = 'D:\\Workspace\\模組\\DDSP-SVC-master\\configs\\diffusion-new.yaml'

    # ruamel.yaml round-trips the file, keeping the rest of the config intact.
    yaml = ruamel.yaml.YAML()
    with open(yaml_path, 'r', encoding='utf-8') as file:
        yaml_content = yaml.load(file)

    yaml_content['env']['expdir'] = os.path.join('exp', dir_path)
    if epoch is not None:
        # The UI number field may deliver a float (e.g. 2500.0); store a plain
        # integer so the config holds a whole epoch count.
        # NOTE(review): assumes the trainer expects an int here — confirm.
        yaml_content['train']['epochs'] = int(epoch)

    with open(yaml_path, 'w',encoding='utf-8') as file:
        yaml.dump(yaml_content, file)

    # Create the experiment directory if it doesn't exist yet.
    expdir_path = os.path.join('D:\\Workspace\\模組\\DDSP-SVC-master\\exp',dir_path )
    os.makedirs(expdir_path, exist_ok=True)

    # Copy (not move) the base checkpoint so it stays available for later runs.
    model_0_path = 'D:\\Workspace\\模組\\DDSP-SVC-master\\exp\\model_0.pt'
    shutil.copy(model_0_path, os.path.join(expdir_path, 'model_0.pt'))

def learn_voice(input_wav, dir_path, epoch):
    """Train a DDSP-SVC voice model from the uploaded recordings.

    Steps: validate the inputs, move the recordings into the training
    directory, update the YAML config, run ``draw.py`` and then
    ``preprocess.py`` (one after the other), and finally run ``train_diff.py``
    while echoing its output until it exits or ``stop_training_signal`` is set.

    Args:
        input_wav: List of uploaded audio file paths; all must end in ``.wav``.
        dir_path: Name of the experiment directory for the new voice.
        epoch: Number of training epochs written to the config.

    Returns:
        A status message for the result text box. Failures are reported as
        text rather than raised.
    """
    if not input_wav or not dir_path:
        missing = []
        if not input_wav:
            missing.append("upload audio files")
        if not dir_path:
            missing.append("enter a directory name")
        return "Please " + " and ".join(missing) + "."
    # Check every file and stop; the original only warned about the first one
    # and then continued, deleting the existing training data anyway.
    if any(os.path.splitext(str(path))[1].lower() != ".wav" for path in input_wav):
        return "Please upload .wav files."

    ddsp_root = r"D:\Workspace\模組\DDSP-SVC-master"
    # Forget a stop request left over from a previous run, otherwise every
    # later training would be terminated immediately.
    stop_training_signal.clear()

    try:
        move_audio(input_wav)
        update_yaml_env(dir_path,epoch)

        # The two preparation stages run sequentially. subprocess.run drains
        # the pipe itself, so a chatty child cannot block on a full buffer.
        preparation_steps = (
            ("Draw", ["python", "draw.py"]),
            ("Preprocess", ["python", "preprocess.py", "-c", "configs/diffusion-new.yaml"]),
        )
        for label, args in preparation_steps:
            step = subprocess.run(
                args,
                text=True,
                errors="replace",
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                cwd=ddsp_root,
            )
            if step.stdout:
                print(f"Output from {label}: {step.stdout.strip()}")
            if step.returncode != 0:
                return f"{label} failed with exit code: {step.returncode}"
        print("Draw and Preprocess has been done successfully!")

        # stderr is merged into stdout so a single reader drains everything.
        # No shell is used, so terminate() reaches the trainer itself.
        train_process = subprocess.Popen(
            ["python", "train_diff.py", "-c", "configs/diffusion-new.yaml"],
            text=True,
            errors="replace",
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            cwd=ddsp_root,
        )

        print("Train Process stdout:")
        stopped = False
        try:
            while True:
                # readline() blocks, so a stop request is noticed only when
                # the trainer emits its next line.
                if stop_training_signal.is_set():
                    stopped = True
                    break
                output_line = train_process.stdout.readline()
                if not output_line:  # EOF: the trainer closed its output
                    break
                output_line = output_line.strip()
                if output_line:
                    print(output_line)
        finally:
            # Only kill the trainer if it is still running, then reap it.
            if train_process.poll() is None:
                train_process.terminate()
            train_process.wait()
            train_process.stdout.close()

        if stopped:
            return "Training has been stopped."

        return_code = train_process.returncode
        if return_code == 0:
            res = "Your voice has been learned successfully! Please go to \"Tell Story with Different Voices\" to use the voice you learned."
            print(res)
            return res

        failure = f"Train process exited with code: {return_code}"
        print(failure)
        return failure

    except Exception as e:
        # Surface the problem in the UI instead of failing the callback.
        error_message = f"Error: {e}"
        print(error_message)
        return error_message

def stop_training():
    """Request that a running training be stopped and notify the user.

    Sets ``stop_training_signal``, the event that ``learn_voice`` checks in its
    training loop, and then emits a Gradio warning message.
    """
    stop_training_signal.set()
    gr.Warning("Training has been stopped.")

## Main

In [8]:
#
# story.close()
# Build the Gradio UI: a story picker on top, followed by three tabs
# (customized TTS story, voice conversion, voice cloning).
with gr.Blocks(theme = 'xiaobaiyuan/theme_brief', title="StoryTeller") as story:
    gr.Markdown("""
    <div style='text-align: center;'>
        <h1 style='margin-bottom: 0.5rem;'>StoryTeller: Voice Studio - Create and Clone</h1>
        <h5 style='margin-top: 0.5rem;'>Made by Yu-Pu Hsu, Yu-Han Tseng, Hsueh-Fu Shih</h5>
    </div>
    """)

    gr.Markdown("### Let's choose a story!")

    with gr.Row():
        story_radio = gr.Radio(choices=["Little-Red-Riding-Hood", "The-Three-Little-Pigs", "Sleeping-Beauty", "The-Little-Match-Girl", "The-Ugly-Duckling"], container=False)
    with gr.Row():
        story_img = gr.Image("picture/pic.jpg", show_download_button=False, label=" ")
        with gr.Column():
            story_content = gr.Text(label="Story Content", placeholder="Select a story to see contents.")
            story_audio = gr.Audio(autoplay=True, label=" ")
            # Selecting a story swaps the picture, the text and the narration.
            story_radio.change(fn=play_audio, inputs=story_radio, outputs=[story_img, story_content, story_audio])

    gr.Markdown("### Build your own text-to-speech story!")
    # Tab 1: text-to-speech with a Bark speaker preset
    with gr.Tab("Customized Story"):
        with gr.Row():
            gr.Markdown("""
            Here is a list of non-speech sounds you can include in your story: 
            `[laughter]`, `[laughs]`, `[sighs]`, `[music]`, `[gasps]`, and `[clears throat]`. 
            Use an em dash (`—`) or ellipsis (`...`) to indicate hesitations, 
            a musical note (`♪`) to denote song lyrics, 
            and CAPITALIZATION when you want to emphasize a word.
            """)
        with gr.Row():
            text_input = gr.Textbox(label="Enter story content", info="Example: [clears throat] Once upon a time, there was a lovely princess.")
            with gr.Column():
                gender_radio = gr.Radio(label="Select Gender", choices=["Male", "Female"])
                speaker_select = gr.Dropdown(label="Select Speaker", allow_custom_value=True, info="Note: The input language must be consistent with the speaker’s language.")
                # The speaker list is filled in once a gender has been chosen.
                gender_radio.change(fn=update_speaker_dropdown, inputs=gender_radio, outputs=speaker_select)
                with gr.Row():
                    submit_button = gr.Button("Text to speech")
                    clear_button = gr.Button("Clear")
                warning_text = gr.Text(label="Result")
        with gr.Row():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")

        clear_button.click(
            fn=clear_tab1,
            outputs=[text_input, gender_radio, speaker_select]
        )
        submit_button.click(
            fn=text_to_speech,
            inputs=[text_input, speaker_select],
            outputs=[audio_output, warning_text]
        )

    # Tab 2: voice conversion with a preset or a user-trained checkpoint
    with gr.Tab("Tell Story with Different Voices"):
        with gr.Row():
            with gr.Column(scale=1):
                star_rail_pt = gr.Dropdown(label="Speakers from Star Rail !", choices=StarRail)
                # Raw string: the Windows path contains backslash sequences
                # (\W, \D, \e) that are invalid escapes in a normal literal.
                input_pt = gr.FileExplorer(label="Your voice's checkpoint file ", root=r"D:\Workspace\模組\DDSP-SVC-master\exp", glob="**/*.pt*")
            with gr.Column(scale=2):
                with gr.Row():
                    input_wav = gr.File(label="Choose the audio file")
                    audio = gr.Audio(label="Play the audio file")
                input_wav.upload(fn=audio_file, inputs=input_wav, outputs=audio)
                with gr.Row():
                    keychange = gr.Slider(label="Pitch", minimum=-20, maximum=20, step=1, value=0)
                    # NOTE(review): the slider allows zero and negative
                    # speedup values — confirm main_diff.py accepts them.
                    speedup = gr.Slider(label="Speedup", minimum=-10, maximum=10, step=1, value=1)
                output_wav = gr.Textbox(label="Output file name", value="output.wav")
                with gr.Row():
                    run_button = gr.Button("Start converting sounds")
                    clear_button2 = gr.Button("Refresh")
                run_output = gr.Text(label="Result")
                output_audio = gr.Audio()

        clear_button2.click(
            fn=clear_tab2,
            outputs=[star_rail_pt, input_pt, input_wav, audio, keychange, speedup, output_wav]
        )
        run_button.click(
            change_voice,
            inputs=[input_wav, input_pt, star_rail_pt, keychange, output_wav, speedup],
            outputs=[run_output, output_audio]
        )

    # Tab 3: train a new voice from uploaded recordings
    with gr.Tab("Clone Your Voice"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("Each wav file takes more than 2 seconds, and it is recommended to upload 20 files.")
                # `input_wav` and `run_output` are rebound here; the Tab 2
                # handlers above were already wired to the earlier components.
                input_wav = gr.File(label="Upload audio file to train", file_count="multiple")
            with gr.Column():
                dir_path = gr.Text(label="Your voice's name ")
                epoch = gr.Number(label="Epoch", value=2500, interactive=True)
                with gr.Row():
                    learn = gr.Button("Learn your voice")
                    stop_button = gr.Button("Stop learning")
                run_output = gr.Text(label="Result")

        learn.click(
            learn_voice,
            inputs=[input_wav, dir_path,epoch],
            outputs=run_output
        )
        stop_button.click(stop_training)

# NOTE(review): share=True requests a public URL, and several handlers start
# subprocesses and delete or overwrite local files — confirm this exposure is
# intended.
# story.launch(share=True, server_port=1111)
story.launch(share=True)

ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "c:\Users\pairlab\miniconda3\envs\module\lib\site-packages\starlette\responses.py", line 259, in __call__
    await wrap(partial(self.listen_for_disconnect, receive))
  File "c:\Users\pairlab\miniconda3\envs\module\lib\site-packages\starlette\responses.py", line 255, in wrap
    await func()
  File "c:\Users\pairlab\miniconda3\envs\module\lib\site-packages\starlette\responses.py", line 232, in listen_for_disconnect
    message = await receive()
  File "c:\Users\pairlab\miniconda3\envs\module\lib\site-packages\uvicorn\protocols\http\h11_impl.py", line 538, in receive
    await self.message_event.wait()
  File "c:\Users\pairlab\miniconda3\envs\module\lib\asyncio\locks.py", line 309, in wait
    await fut
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\pairlab\miniconda3\envs\module\lib\site-packages\u

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://f71ea094a621cd6882.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




100%|██████████| 514/514 [00:07<00:00, 73.41it/s] 
100%|██████████| 26/26 [00:08<00:00,  3.20it/s]


input_wav: C:\Users\pairlab\AppData\Local\Temp\gradio\a6012185cc15da74c1c1d3b230dabab3abd63663\generated_audio 2.wav input_pt: [] star_rail_pt: D:\Workspace\模組\Star-Rail\himeko\himeko.pt keychange: 4 output_wav: output.wav speedup: 1
Standard Output:  [DDSP Model] Combtooth Subtractive Synthesiser
 [Loading] D:\Workspace\模組\Star-Rail\himeko\himeko.pt
MD5: 8b05437c6819cf41ca8553b26ff88e97
Pitch extractor type: rmvpe
Extracting the pitch curve of the input audio...
Extracting the volume envelope of the input audio...
 [Encoder Model] Content Vec
 [Loading] pretrain/contentvec/checkpoint_best_legacy_500.pt
DDSP Speaker ID: 1
Diffusion Speaker ID: auto
Sampling method: DDPM
Shallow diffusion step: 100
DDSP model is not identified, the built-in DDSP model will be used!
Cut the input audio into 3 slices
| Load HifiGAN:  pretrain/nsf_hifigan/model
Removing weight norm...

Error Output: 2024-01-09 22:13:58 | INFO | fairseq.tasks.hubert_pretraining | current directory is D:\Workspace\模組\DDSP-SV