## Basic Inference (teacher-guided, 8 steps)

In [7]:
import os
import time

import IPython.display as ipd
import torchaudio

from infer import DMOInference

# Checkpoint locations. The directory can be overridden with the
# DMOSPEECH2_CKPT_DIR environment variable so the notebook is not tied to one
# machine; the default is the directory this notebook was originally run with.
CHECKPOINT_DIR = os.environ.get(
    "DMOSPEECH2_CKPT_DIR", "/home/wjeamwat/DMOSppeec2_Backup"
)
STUDENT_CHECKPOINT_PATH = os.path.join(CHECKPOINT_DIR, "student_model_85000.pt")
DURATION_PREDICTOR_PATH = os.path.join(
    CHECKPOINT_DIR, "duration_predictor_model_1500.pt"
)

# Initialize the model.
# NOTE(review): the saved output of this cell reports
# "Unknown model type: F5TTS_v1_Base", which does not match the model_type
# passed here, and the cell's execution count (7) is later than the cells
# below it. The stored output looks stale — restart the kernel and re-run to confirm.
tts = DMOInference(
    student_checkpoint_path=STUDENT_CHECKPOINT_PATH,
    duration_predictor_path=DURATION_PREDICTOR_PATH,
    device="cuda",
    model_type="F5TTS_Base",
)

ValueError: Unknown model type: F5TTS_v1_Base

In [2]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_en.wav"

ref_text = "Some call me nature, others call me mother nature."
gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."

# Sample rate (Hz) used both to convert sample count to seconds and for playback.
SAMPLE_RATE = 24000

# perf_counter() is monotonic and high-resolution, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
# Generate with default settings
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
)
end_time = time.perf_counter()

processing_time = end_time - start_time
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of processing per second of generated audio
# (values below 1 mean faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Converting audio...
Using custom reference text...

ref_text   Some call me nature, others call me mother nature. 

--------

Prompt Audio: 


Generated Audio: 


  RTF: 1.08x (0.93x speed)
  Processing: 14.23s for 13.21s audio


In [3]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_zh.wav"

ref_text = "对，这就是我，万人敬仰的太乙真人。"
gen_text = '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"'

# Sample rate (Hz) used both to convert sample count to seconds and for playback.
SAMPLE_RATE = 24000

# perf_counter() is monotonic and high-resolution, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
# Generate with default settings
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
)
end_time = time.perf_counter()

processing_time = end_time - start_time
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of processing per second of generated audio
# (values below 1 mean faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")


Converting audio...
Using custom reference text...

ref_text   对，这就是我，万人敬仰的太乙真人。

--------

Prompt Audio: 


Generated Audio: 


  RTF: 0.06x (15.93x speed)
  Processing: 1.35s for 21.53s audio


## Comparison between different sampling configurations

#### Student only (4 steps)

Set both `teacher_steps` and `student_start_step` to 0 to enable full student sampling.

In [4]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_zh.wav"

ref_text = "对，这就是我，万人敬仰的太乙真人。"
gen_text = '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"'

# Sample rate (Hz) used both to convert sample count to seconds and for playback.
SAMPLE_RATE = 24000

# perf_counter() is monotonic and high-resolution, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
# Generate with the student only (no teacher guidance)
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
    teacher_steps=0,  # set this to 0 for no teacher sampling
    student_start_step=0,  # set this to 0 for full student sampling
)
end_time = time.perf_counter()

processing_time = end_time - start_time
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of processing per second of generated audio
# (values below 1 mean faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")

Converting audio...
Using cached preprocessed reference audio...
Using custom reference text...

ref_text   对，这就是我，万人敬仰的太乙真人。

--------

Prompt Audio: 


Generated Audio: 


  RTF: 0.03x (31.49x speed)
  Processing: 0.68s for 21.53s audio


#### More teacher steps (16 steps)

Now we use 14 steps from the teacher and 2 steps from the student to have higher diversity (16 steps total).

In [5]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_zh.wav"

ref_text = "对，这就是我，万人敬仰的太乙真人。"
gen_text = '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"'

# Sample rate (Hz) used both to convert sample count to seconds and for playback.
SAMPLE_RATE = 24000

# perf_counter() is monotonic and high-resolution, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
# Generate with more teacher guidance and only the last student steps
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
    teacher_steps=24,
    # NOTE(review): the original inline comment described 0.25 ("students go
    # for the last two steps (0.26ish, 0.6ish)") but the value passed is 0.3 —
    # confirm which one is intended.
    teacher_stopping_time=0.3,
    student_start_step=2,  # only two steps for students
    verbose=True,  # see the number of steps used
)
end_time = time.perf_counter()

processing_time = end_time - start_time
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of processing per second of generated audio
# (values below 1 mean faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")

Converting audio...
Using cached preprocessed reference audio...
Using custom reference text...

ref_text   对，这就是我，万人敬仰的太乙真人。
audio: torch.Size([1, 147790])
text: [[' ', 'dui4', '，', ' ', 'zhe4', ' ', 'jiu4', ' ', 'shi4', ' ', 'wo3', '，', ' ', 'wan4', ' ', 'ren2', ' ', 'jing4', ' ', 'yang3', ' ', 'de', ' ', 'tai4', ' ', 'yi3', ' ', 'zhen1', ' ', 'ren2', '。', ' ', 'tu1', ' ', 'ran2', '，', ' ', 'shen1', ' ', 'bian1', ' ', 'yi1', ' ', 'zhen4', ' ', 'xiao4', ' ', 'sheng1', '。', ' ', 'wo3', ' ', 'kan4', ' ', 'zhe', ' ', 'ta1', ' ', 'men', '，', ' ', 'yi4', ' ', 'qi4', ' ', 'feng1', ' ', 'fa1', ' ', 'di4', ' ', 'ting3', ' ', 'zhi2', ' ', 'le', ' ', 'xiong1', ' ', 'tang2', '，', ' ', 'shuai3', ' ', 'le', ' ', 'shuai3', ' ', 'na4', ' ', 'shao1', ' ', 'xian3', ' ', 'rou4', ' ', 'gan3', ' ', 'de', ' ', 'shuang1', ' ', 'bi4', '，', ' ', 'qing1', ' ', 'xiao4', ' ', 'dao4', '：', '"', ' ', 'wo3', ' ', 'shen1', ' ', 'shang4', ' ', 'de', ' ', 'rou4', '，', ' ', 'shi4', ' ', 'wei4', ' ', 'le', ' ', 'yan3',

Generated Audio: 


  RTF: 0.06x (16.29x speed)
  Processing: 1.32s for 21.53s audio


#### Stochastic duration 

Introduce even more diversity by adding randomness to the duration, controlled by the `temperature` argument.

In [6]:
prompt_audio = "f5_tts/infer/examples/basic/basic_ref_zh.wav"

ref_text = "对，这就是我，万人敬仰的太乙真人。"
gen_text = '突然，身边一阵笑声。我看着他们，意气风发地挺直了胸膛，甩了甩那稍显肉感的双臂，轻笑道："我身上的肉，是为了掩饰我爆棚的魅力，否则，岂不吓坏了你们呢？"'

# Sample rate (Hz) used both to convert sample count to seconds and for playback.
SAMPLE_RATE = 24000

# perf_counter() is monotonic and high-resolution, so the measured interval
# cannot be skewed by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
# Generate with teacher guidance plus a non-zero duration temperature
generated_audio = tts.generate(
    gen_text=gen_text,
    audio_path=prompt_audio,
    prompt_text=ref_text,
    teacher_steps=24,
    teacher_stopping_time=0.25,  # 0.25 means students go for the last two steps (0.26ish, 0.6ish)
    student_start_step=2,  # only two steps for students
    temperature=0.8,  # set some temperature for duration sampling
)
end_time = time.perf_counter()

processing_time = end_time - start_time
audio_duration = generated_audio.shape[-1] / SAMPLE_RATE
# Real-time factor: seconds of processing per second of generated audio
# (values below 1 mean faster than real time).
rtf = processing_time / audio_duration

print('\n--------\n')
print('Prompt Audio: ')
display(ipd.Audio(prompt_audio, rate=SAMPLE_RATE))
print('Generated Audio: ')
display(ipd.Audio(generated_audio, rate=SAMPLE_RATE))

print(f"  RTF: {rtf:.2f}x ({1/rtf:.2f}x speed)")
print(f"  Processing: {processing_time:.2f}s for {audio_duration:.2f}s audio")

Converting audio...
Using cached preprocessed reference audio...
Using custom reference text...

ref_text   对，这就是我，万人敬仰的太乙真人。

--------

Prompt Audio: 


Generated Audio: 


  RTF: 0.09x (11.55x speed)
  Processing: 1.87s for 21.63s audio
