# EdgeTTS

https://github.com/rany2/edge-tts

edge-tts is a Python module that allows you to use Microsoft Edge's online text-to-speech service from within your Python code or using the provided edge-tts or edge-playback command.

In [None]:
%pip install edge-tts pygame

In [1]:
import edge_tts
import os
import pygame
import time

async def generate_edge_tts_audio(text, file_name, voice='en-US-GuyNeural', style='newscast-formal', verbose=False, play=False, overwrite=False):
    """Synthesize ``text`` into ``file_name`` with edge-tts and optionally play it.

    Args:
        text: Text to be spoken.
        file_name: Path of the audio file to write.
        voice: Voice name handed to ``edge_tts.Communicate``.
        style: Accepted for backward compatibility; this function does not
            use it.
        verbose: Print whether the file was skipped, overwritten or created.
        play: Start playback of the freshly written file via ``pygame.mixer``.
        overwrite: Regenerate the file even when it already exists.

    Returns:
        None. When ``file_name`` already exists and ``overwrite`` is false the
        coroutine returns immediately without synthesizing or pausing.
    """
    # Imported locally so the cell stays self-contained.
    import asyncio

    # Decide whether there is any work to do before building the
    # Communicate object, so skipped files cost nothing.
    if os.path.exists(file_name):
        if not overwrite:
            if verbose:
                print(f'{file_name} exists, skipping...')
            return
        if verbose:
            print(f'{file_name} exists, overwriting...')

    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(file_name)
    if play:
        pygame.mixer.init()
        pygame.mixer.music.load(file_name)
        pygame.mixer.music.play()
    if verbose:
        print(f'{file_name} created')

    # Pause after each generated file. Use the awaitable sleep so the event
    # loop is not blocked the way time.sleep() would block it.
    await asyncio.sleep(1.5)

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.
pygame 2.6.0 (SDL 2.28.4, Python 3.12.2)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:

voices = ["en-US-GuyNeural", "en-US-AriaNeural", "en-GB-RyanNeural", "en-GB-LibbyNeural"]
regions = ['us', 'us', 'uk', 'uk']
genders = ['male', 'female', 'male', 'female']

# only_us = False
only_us = True
if only_us:
    voices = voices[:2]
    print(voices)

words = """
applying,
carrying,
crying,
denying,
qualifying,
replying,
satisfying,
specifying,
spying,
"""

for word in words.strip().split(','):
    print(word)
    for i, voice in enumerate(voices):
        w = word.strip().lower()
        if len(w) > 0:
            filename = f'../audios/{w.replace(" ", "-")}-{regions[i]}-{genders[i]}.mp3'
            await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)


['en-US-GuyNeural', 'en-US-AriaNeural']
applying
../audios/applying-us-male.mp3 exists, skipping...
../audios/applying-us-female.mp3 exists, skipping...

carrying
../audios/carrying-us-male.mp3 exists, skipping...
../audios/carrying-us-female.mp3 exists, skipping...

crying
../audios/crying-us-male.mp3 exists, skipping...
../audios/crying-us-female.mp3 exists, skipping...

denying
../audios/denying-us-male.mp3 exists, skipping...
../audios/denying-us-female.mp3 exists, skipping...

qualifying
../audios/qualifying-us-male.mp3 created
../audios/qualifying-us-female.mp3 created

replying
../audios/replying-us-male.mp3 exists, skipping...
../audios/replying-us-female.mp3 exists, skipping...

satisfying
../audios/satisfying-us-male.mp3 exists, skipping...
../audios/satisfying-us-female.mp3 exists, skipping...

specifying
../audios/specifying-us-male.mp3 exists, skipping...
../audios/specifying-us-female.mp3 exists, skipping...

spying
../audios/spying-us-male.mp3 exists, skipping...
../audi

In [21]:
def get_openai_tts_audio(text, path, performer='alloy'):
    """Stream OpenAI ``tts-1`` speech for ``text`` into the file at ``path``.

    Args:
        text: Text to speak; surrounding whitespace is stripped before sending.
        path: Destination file for the streamed audio.
        performer: Voice name passed as the ``voice`` argument.
    """
    from openai import OpenAI
    from dotenv import load_dotenv

    # Load variables from a .env file before the client is created.
    load_dotenv()
    tts_client = OpenAI()

    request = {
        "model": "tts-1",
        "voice": performer,
        "input": text.strip(),
    }
    with tts_client.audio.speech.with_streaming_response.create(**request) as streamed:
        streamed.stream_to_file(path)
        
sentence = "She will cherish those memories and ever hold them close to her heart."

# File-name slug: map , . ? ! and spaces to dashes, collapse "--" once,
# lowercase, then drop any trailing dashes.
punctuation_to_dash = str.maketrans(',.?! ', '-----')
sentence_slug = sentence.strip().translate(punctuation_to_dash).replace("--", "-").lower().rstrip('-')
audio_filename_openai = f'{sentence_slug}_openai.mp3'
audio_filename_msedge = f'{sentence_slug}_msedge.mp3'

print(audio_filename_openai)
# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')
# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice="en-US-GuyNeural", verbose=True, overwrite=True, play=True)

# NOTE: these paths only turn spaces into dashes, so the sentence's
# punctuation and capitalisation are kept in the file name.
for voice in ("alloy", "nova"):
    audio_path = f'../audios/{sentence.replace(" ", "-")}-{voice}.mp3'
    get_openai_tts_audio(sentence, audio_path, performer=voice)


she-will-cherish-those-memories-and-ever-hold-them-close-to-her-heart_openai.mp3


In [3]:
from openai import OpenAI
import os
import IPython
from datetime import datetime
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON
from dotenv import load_dotenv
from pydub import AudioSegment

# Load variables from a local .env file, then build the shared OpenAI client
# (presumably configured through OPENAI_API_KEY in the environment -- confirm).
load_dotenv()
client = OpenAI()

def get_openai_tts_audio(text, filename, performer="alloy"):
    """Render ``text`` to one tagged MP3 using OpenAI text-to-speech.

    The text is passed through ``markdown_to_text`` and split into lines.
    Every non-blank line is synthesized separately with the ``tts-1`` model.
    The clips are concatenated after one second of leading silence, with
    1.5 seconds of silence after each clip, and ``ending.mp3`` is appended.
    The result is exported and handed to ``add_metadata``.

    Args:
        text: Markdown source to speak.
        filename: Output path; ``.mp3`` is appended when it is missing. A
            falsy value selects ``<YYYYmmdd_HHMMSS>_<performer>.mp3``.
        performer: Voice name passed as the ``voice`` argument.

    Returns:
        ``"<path> created successfully."`` on success, or ``None`` when
        ``Artwork.png`` or ``ending.mp3`` is not present in the working
        directory.
    """
    # Imported locally so the cell stays self-contained.
    import tempfile

    # Both assets are needed later (cover art and closing jingle); bail out
    # before spending any API calls.
    if not os.path.isfile('Artwork.png') or not os.path.isfile('ending.mp3'):
        print("Either Artwork.png or ending.mp3 file not found.")
        return

    # Strip first, then filter, so whitespace-only lines are dropped instead
    # of being sent to the API as empty input.
    lines = [ln.strip() for ln in markdown_to_text(text).split("\n")]
    lines = [ln for ln in lines if ln]

    speech = AudioSegment.silent(duration=1000)
    gap = AudioSegment.silent(duration=1500)

    # Clips live in a private temp directory so they cannot collide with
    # files in the working directory and are removed even if a request fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # enumerate() gives every line its own index; list.index() returned
        # the first match and broke on repeated lines.
        for idx, line in enumerate(lines):
            clip_path = os.path.join(tmp_dir, f'temp-{idx}.mp3')
            with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice=performer,
                input=line,
            ) as response:
                response.stream_to_file(clip_path)
            speech += AudioSegment.from_file(clip_path) + gap
            # Progress percentage, rewritten in place on a single line.
            print(f"\rprocessing: {round((idx + 1) / len(lines) * 100)}%", end='...')
    print("\n")

    # Append the jingle directly; exporting to an intermediate MP3 and
    # reloading it would only re-encode the audio a second time.
    combined = speech + AudioSegment.from_file('ending.mp3')

    if filename:
        # if filename has no extension, add .mp3
        if filename.endswith('.mp3'):
            speech_file_path = filename
        else:
            speech_file_path = f'{filename}.mp3'
    else:
        speech_file_path = f'{datetime.now().strftime("%Y%m%d_%H%M%S")}_{performer}.mp3'
    combined.export(speech_file_path, format='mp3')
    print(f"Audio file saved as {speech_file_path}")

    image_file = 'Artwork.png'
    artist = 'tts'
    album = 'Daily Speech Training'
    genre = 'SPEECH'

    add_metadata(speech_file_path, image_file, artist, album, genre)
    # A bare Audio(...) expression inside a function is never rendered;
    # display() shows the player explicitly.
    IPython.display.display(IPython.display.Audio(speech_file_path))

    return f'{speech_file_path} created successfully.'



# English Voices

* voice = "en-US-GuyNeural" (Male) — default voice of `generate_edge_tts_audio`
* voice = "en-US-AnaNeural" (Female)
* voice = "en-US-AndrewNeural" (Male)
* voice = "en-US-AriaNeural" (Female)
* voice = "en-US-AvaNeural" (Female)
* voice = "en-US-BrianNeural" (Male)
* voice = "en-US-ChristopherNeural" (Male)
* voice = "en-US-EmmaNeural" (Female)
* voice = "en-US-EricNeural" (Male)
* voice = "en-US-GuyNeural" (Male)
* voice = "en-US-JennyNeural" (Female)
* voice = "en-US-MichelleNeural" (Female)
* voice = "en-US-RogerNeural" (Male)
* voice = "en-US-SteffanNeural" (Male)
* voice = "en-GB-LibbyNeural" (Female)
* voice = "en-GB-MaisieNeural" (Female)
* voice = "en-GB-RyanNeural" (Male)
* voice = "en-GB-SoniaNeural" (Female)
* voice = "en-GB-ThomasNeural" (Male)
* voice = "en-AU-NatashaNeural" (Female)
* voice = "en-AU-WilliamNeural" (Male)
* voice = "en-CA-ClaraNeural" (Female)
* voice = "en-CA-LiamNeural" (Male)

In [55]:
# Generate one audio file per sentence and voice.
sentences = """
It's our pleasure.
"""

# Gender tag used in the file name for each voice (OpenAI and edge-tts).
# Looking the tag up per voice avoids running str.replace() over the whole
# path, which would also rewrite "alloy"/"nova" inside the sentence itself
# (e.g. "innovation").
voice_genders = {
    "alloy": "male",
    "nova": "female",
    "en-US-GuyNeural": "male",
    "en-US-AriaNeural": "female",
}

# split the sentences into lines
sentences = [s for s in sentences.strip().split("\n") if s]
for sentence in sentences:
    # File-name stem: drop trailing punctuation and apostrophes, and join the
    # words with dashes.
    stem = sentence.rstrip(",.!?").replace("'", "").replace(" ", "-")
    # for voice in ["en-US-GuyNeural", "en-US-AriaNeural"]:
    for voice in ["alloy", "nova"]:
        filename = f'../audios/{stem}-us-{voice_genders[voice]}.mp3'
        print(filename)
        # await generate_edge_tts_audio(sentence, filename, voice=voice, verbose=True, overwrite=True, play=True)
        get_openai_tts_audio(sentence, filename, performer=voice)
    
    

    # get_openai_tts_audio(sentence, f'../audios/{sentence.replace(" ", "-")}-alloy.mp3', performer='alloy')
    # get_openai_tts_audio(sentence, f'../audios/{sentence.replace(" ", "-")}-nova.mp3', performer='nova')


../audios/are-us-male.mp3


APIConnectionError: Connection error.

In [9]:
def read_lines_from_file(file_path):
    """Return the lines of the text file at ``file_path``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding. Line endings are kept, as with
    ``file.readlines()``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

import re
# Collect the first word of every markdown list entry that contains 'span>'
# into a comma-terminated string.
words = ''

# A list entry starts with "<digits>." or with an optionally indented "* ".
numbered_item = re.compile(r'^\d+\.')
bullet_item = re.compile(r'^\s*\*\s')
# NOTE(review): the second alternative is not anchored, so every "*" in the
# line (with surrounding whitespace) is removed, not only a leading bullet.
# Confirm this is intended before anchoring it.
list_marker = re.compile(r'^\d+\.\s*|\s*\*\s*')

# read lines from ../../sounds-of-american-english/6-vocabulary.md
lines = read_lines_from_file('../../sounds-of-american-english/6-vocabulary.md')
collected = []
for line in lines:
    if 'span>' not in line:
        continue
    if not (numbered_item.match(line) or bullet_item.match(line)):
        continue
    # Drop the list marker, then keep the first space-delimited token.
    line = list_marker.sub('', line)
    word = line.split(' ')[0]
    collected.append(word)

# Every word is followed by a comma, including the last one.
words = ''.join(f'{entry},' for entry in collected)
print(words)







airplane,airport,backyard,bedroom,birthday,blackboard,bookstore,brainstorm,breakfast,classroom,cupcake,daydream,dishwasher,doorbell,downtown,earthquake,everyday,eyewitness,firefighter,football,greenhouse,handwriting,headache,highway,homework,iceberg,jellyfish,laptop,lighthouse,mailbox,moonlight,notebook,nobody,pancake,postcard,rainbow,sailboat,sandbox,seashore,skateboard,snowflake,spaceship,sunflower,sunshine,superhero,tablecloth,toothbrush,toothpaste,typewriter,underwater,upstairs,volleyball,waterfall,watermelon,weekend,wheelchair,windmill,workshop,unhappy,unknown,unusual,redo,review,return,incomplete,invisible,inside,disagree,disappear,disconnect,preview,predict,preschool,misunderstand,misplace,mislead,nonstick,nonprofit,nonviolent,overeat,overwork,overheat,submarine,subtitle,subconscious,international,interact,interrupt,transport,transfer,transform,underpaid,underestimate,underground,superhuman,supermarket,supervisor,semicircle,semifinal,semiconscious,antibiotic,antifreeze,antisocia

In [10]:
voices = ["en-US-GuyNeural", "en-US-AriaNeural", "en-GB-RyanNeural", "en-GB-LibbyNeural"]
regions = ['us', 'us', 'uk', 'uk']
genders = ['male', 'female', 'male', 'female']

# only_us = False
only_us = True
if only_us:
    voices = voices[:2]
    print(voices)

for word in words.strip().split(','):
    print(word)
    for i, voice in enumerate(voices):
        w = word.strip().lower()
        if len(w) > 0:
            filename = f'../audios/{w.replace(" ", "-")}-{regions[i]}-{genders[i]}.mp3'
            await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)


['en-US-GuyNeural', 'en-US-AriaNeural']
airplane
../audios/airplane-us-male.mp3 created
../audios/airplane-us-female.mp3 created
airport
../audios/airport-us-male.mp3 created
../audios/airport-us-female.mp3 created
backyard
../audios/backyard-us-male.mp3 created
../audios/backyard-us-female.mp3 created
bedroom
../audios/bedroom-us-male.mp3 created
../audios/bedroom-us-female.mp3 created
birthday
../audios/birthday-us-male.mp3 created
../audios/birthday-us-female.mp3 created
blackboard
../audios/blackboard-us-male.mp3 created
../audios/blackboard-us-female.mp3 created
bookstore
../audios/bookstore-us-male.mp3 created
../audios/bookstore-us-female.mp3 created
brainstorm
../audios/brainstorm-us-male.mp3 created
../audios/brainstorm-us-female.mp3 created
breakfast
../audios/breakfast-us-male.mp3 created
../audios/breakfast-us-female.mp3 created
classroom
../audios/classroom-us-male.mp3 exists, skipping...
../audios/classroom-us-female.mp3 exists, skipping...
cupcake
../audios/cupcake-us-ma