In [None]:
# Install or update openai modules 
%pip install openai --upgrade openai 
%pip install python-dotenv mutagen pydub

In [None]:
from openai import OpenAI
import re
import os
import IPython
from datetime import datetime
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON
from dotenv import load_dotenv
from pydub import AudioSegment

# Load variables from a local .env file into the environment, then create the
# shared OpenAI client with no explicit arguments.
load_dotenv()
client = OpenAI()

def add_metadata(mp3_file, image_file, artist, album, genre):
    """Write artist, album, genre and cover-art ID3 frames into an MP3 file.

    Args:
        mp3_file: Path of the MP3 file to tag; the file is saved in place.
        image_file: Path of the cover image to embed.
        artist: Text stored in the TPE1 (artist) frame.
        album: Text stored in the TALB (album) frame.
        genre: Text stored in the TCON (genre) frame.

    Returns:
        None. Any failure is reported with ``print`` instead of being raised,
        so a tagging problem does not abort the caller.
    """
    import mimetypes  # local import: only needed to label the embedded artwork

    try:
        audio = MP3(mp3_file, ID3=ID3)

        # Create the ID3 tag only when the file has none yet, instead of
        # calling add_tags() unconditionally and swallowing every exception.
        if audio.tags is None:
            audio.add_tags()

        # encoding=3 selects UTF-8 for the text frames.
        audio.tags.add(TPE1(encoding=3, text=artist))
        audio.tags.add(TALB(encoding=3, text=album))
        audio.tags.add(TCON(encoding=3, text=genre))

        # Declare the MIME type that matches the image file (e.g. image/png
        # for Artwork.png); fall back to the previous hard-coded value when
        # the extension is unknown.
        mime_type = mimetypes.guess_type(image_file)[0] or 'image/jpeg'

        with open(image_file, 'rb') as albumart:
            audio.tags.add(APIC(
                encoding=3,          # 3 is for utf-8
                mime=mime_type,
                type=3,              # 3 is for the cover image
                desc=u'Cover',
                data=albumart.read()
            ))

        audio.save()

    except Exception as e:
        print(f"\rAn error occurred: {e}")

def get_response(user_prompt, role_definition, model="gpt-4o-mini"):
    """Send a system + user message pair to the chat model and return its reply.

    Args:
        user_prompt: The user message; a fixed "answer directly" instruction
            is appended before sending.
        role_definition: The system message.
        model: Chat model name passed to the API.

    Returns:
        The text content of the first choice in the completion.
    """
    full_prompt = user_prompt + "\n\n Please provide response directly without further explanation."
    conversation = [
        {"role": "system", "content": role_definition},
        {"role": "user", "content": full_prompt},
    ]
    completion = client.chat.completions.create(model=model, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

def get_openai_tts_audio(text, file_name="", performer="alloy", silence_duration=1500, with_ending=False, ending_file="resources/ending.mp3", progress=True):
    """Convert (Markdown) text to a tagged MP3 file, one TTS request per line.

    The text is stripped of Markdown, split into non-empty lines, and each line
    is synthesized separately. The clips are joined with ``silence_duration``
    milliseconds of silence before, between and after them, optionally followed
    by ``ending_file``. The result is exported as MP3 and tagged with the
    artwork in ``resources/Artwork.png``. The audio is not displayed here;
    callers that want a player must display the returned path themselves.

    Args:
        text: Text (may contain Markdown) to read aloud.
        file_name: Output file name; ``.mp3`` is appended when missing. When
            empty, a timestamp + performer name is used.
        performer: Voice name passed to the TTS API.
        silence_duration: Length in milliseconds of each silent gap.
        with_ending: Whether to append ``ending_file`` to the speech.
        ending_file: Path of the audio clip appended when ``with_ending``.
        progress: Whether to print a single-line progress percentage.

    Returns:
        The path of the exported MP3, or ``None`` when a required resource
        file is missing.
    """
    image_file = 'resources/Artwork.png'

    # Check the files this call actually needs: the artwork always, and the
    # ending clip that was passed in (not a hard-coded path) when requested.
    required_files = [image_file]
    if with_ending:
        required_files.append(ending_file)
    missing_files = [path for path in required_files if not os.path.isfile(path)]
    if missing_files:
        print(f"\rRequired file(s) not found: {', '.join(missing_files)}")
        return

    # Split into lines and drop the ones that are empty after stripping, so no
    # blank input is sent to the TTS API.
    lines = [line.strip() for line in markdown_to_text(text).split("\n")]
    lines = [line for line in lines if line]

    segment_paths = []
    try:
        # Number the temp files by position; looking the position up by value
        # would make repeated lines share (and overwrite) one temp file.
        for position, line in enumerate(lines):
            segment_path = f'temp-{position}.mp3'
            segment_paths.append(segment_path)

            with client.audio.speech.with_streaming_response.create(
                model="tts-1",
                voice=performer,
                input=line
            ) as response:
                response.stream_to_file(segment_path)

            # output a progress percentage, keep updating within a line
            if progress:
                print(f"\rprocessing audio performed by {performer.capitalize()}: {round((position + 1) / len(lines) * 100)}%", end='...')
        if progress:
            print("\n")

        # Leading silence, then every clip followed by the same silence.
        combined = AudioSegment.silent(duration=silence_duration)
        for segment_path in segment_paths:
            combined += AudioSegment.from_file(segment_path) + AudioSegment.silent(duration=silence_duration)
    finally:
        # Remove the per-line temp files even when a request or decode fails.
        for segment_path in segment_paths:
            if os.path.exists(segment_path):
                os.remove(segment_path)

    # The joined audio is used directly; writing it to a scratch MP3 and
    # reading it back would only add an extra lossy encode/decode.
    if with_ending:
        combined = combined + AudioSegment.from_file(ending_file)

    if file_name:
        # if file_name has no extension, add .mp3
        speech_file_path = file_name if file_name.endswith('.mp3') else f'{file_name}.mp3'
    else:
        speech_file_path = f'{datetime.now().strftime("%Y%m%d_%H%M%S")}_{performer}.mp3'
    combined.export(speech_file_path, format='mp3')

    artist = 'tts'
    album = 'Daily Speech Training'
    genre = 'SPEECH'
    add_metadata(speech_file_path, image_file, artist, album, genre)

    return speech_file_path

# convert markdown to plain text
def markdown_to_text(markdown_text):
    """Strip common Markdown markup and return plain text.

    Only markup is removed: hyphens inside words, ``#`` inside a line and
    numbers such as ``3.14`` are preserved, because block-level markers
    (headings, lists, blockquotes, horizontal rules) are matched only at the
    start of a line.

    Args:
        markdown_text: The Markdown source string.

    Returns:
        The text with images removed, links reduced to their label, code
        fences/backticks and emphasis markers dropped, line-leading block
        markers removed, runs of spaces collapsed, and outer whitespace
        stripped. Line breaks are kept.
    """
    # Images go first: once the link rule has run, "![alt](url)" would have
    # been reduced to "!alt" and could no longer be recognised as an image.
    text = re.sub(r'!\[.*?\]\(.*?\)', '', markdown_text)
    # Links keep only their label.
    text = re.sub(r'\[(.*?)\]\(.*?\)', r'\1', text)
    # Fenced code blocks may span several lines, hence DOTALL.
    text = re.sub(r'```(.*?)```', r'\1', text, flags=re.DOTALL)
    # Inline code.
    text = re.sub(r'`(.*?)`', r'\1', text)
    # Horizontal rules: a line made of three or more -, * or _ characters.
    text = re.sub(r'^[ \t]*([-*_])(?:[ \t]*\1){2,}[ \t]*$', '', text, flags=re.MULTILINE)
    # Heading markers at the start of a line.
    text = re.sub(r'^[ \t]*#{1,6}[ \t]+', '', text, flags=re.MULTILINE)
    # Blockquote markers at the start of a line (possibly nested).
    text = re.sub(r'^[ \t]*(?:>[ \t]?)+', '', text, flags=re.MULTILINE)
    # Unordered list markers; handled before emphasis so a leading "* " is not
    # mistaken for the opening of an italic span.
    text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)
    # Ordered list markers ("1. ") at the start of a line only.
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)
    # Bold, then italic.
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    # Collapse runs of spaces left behind by the removals.
    text = re.sub(r' +', ' ', text)
    return text.strip()

def add_zero(number, length=2):
    """Return ``number`` as a string, left-padded with zeros to ``length`` characters."""
    as_text = str(number)
    padded = as_text.zfill(length)
    return padded

# read text from a file
def read_text_from_file(file_path, encoding="utf-8"):
    """Return the whole content of a text file as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            non-ASCII text is read the same way on every platform instead of
            depending on the locale's default encoding.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()
    
# write text to a file
def write_text_to_file(file_path, text, encoding="utf-8"):
    """Write ``text`` to a file, replacing any existing content.

    Args:
        file_path: Path of the file to create or overwrite.
        text: String to write.
        encoding: Text encoding used for the file. Defaults to UTF-8 so
            non-ASCII text can be written regardless of the locale's default
            encoding.
    """
    with open(file_path, "w", encoding=encoding) as file:
        file.write(text)

# read lines from a file
def read_lines_from_file(file_path, encoding="utf-8"):
    """Return the lines of a text file as a list, line endings included.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the locale's default encoding.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.readlines()

# write lines to a file
def write_lines_to_file(file_path, lines, encoding="utf-8"):
    """Write an iterable of strings to a file, replacing any existing content.

    No line separators are added; each string is written as given.

    Args:
        file_path: Path of the file to create or overwrite.
        lines: Iterable of strings to write.
        encoding: Text encoding used for the file. Defaults to UTF-8 so
            non-ASCII text can be written regardless of the locale's default
            encoding.
    """
    with open(file_path, 'w', encoding=encoding) as file:
        file.writelines(lines)


In [3]:
# Generate a passage by ChatGPT, with topic, key words or points you specify.

# System prompt for the essay cell: passed as the system message to
# get_response() below. It asks for a ~300-word essay returned as plain text.
role_definition = """
You're my American English guru, 
Whatever topic or key words or points I give you,
You'll develop them into a 300 words American English essay.
Be creative, do not use any cliche to start your passage, such as "In todays's fast-paced world"...
If I send you "random", you'll write a random essay with recent events or trending topics.
Use as diverse as possible expressions or idioms Americans use in daily conversations.
Return the passage as pure text without any markdown or code block.
"""

# Topic and notes sent as the user message. The first two space-separated
# words are also used to build the output file names below, and the
# conversation cell further down reuses this variable because its own
# user_prompt assignment is commented out.
user_prompt = """
learning and practicing

学本身用处并不大的，练才是真正的关键。学只不过是知道，只有高密度地练才能做到。

学是系统二的工作。练是把知识、技能、流程和习惯压缩进系统一的工作。

如果学的重要性是 1 的话，练的重要性可能是 99  甚至 9999。

学是为了练，练是为了把学变成自己的东西。

学了很多很多，从来不练，才是人们最大的问题。说什么学到的东西用不上，只不过是自己没练所以才用不了而已。哪儿有什么学了没用的东西呢？
"""

# Ask the model for the essay, then pad em dashes with spaces.
rspd = get_response(user_prompt, role_definition, model="gpt-4o-mini")
rspd = rspd.replace("—", " — ")

# File name prefix: the first two words of the prompt plus a timestamp.
# Splitting on any whitespace (not only spaces) keeps line breaks from the
# multi-line prompt out of the file name.
topic_words = user_prompt.strip().rstrip(',.!?")').split()[:2]
audio_filename_prefix = f"Discourse_{'-'.join(topic_words)}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

print(rspd + "\n\nGenerating the audio file...\n")
write_text_to_file(f'{audio_filename_prefix}.md', rspd)

# One recording per voice. The ending clip is taken from resources/, the same
# location get_openai_tts_audio() checks and uses by default.
for p in ["alloy", "nova"]:
    audio_filename = get_openai_tts_audio(rspd, file_name=f'{audio_filename_prefix}-{p}.mp3', performer=p, silence_duration=1500, with_ending=True, ending_file="resources/ending.mp3")
    # get_openai_tts_audio() returns None when a resource file is missing, so
    # only announce a file that was really written.
    if audio_filename:
        print(f"Audio file saved as {audio_filename}!\n")
    else:
        print(f"No audio file was produced for {p}.\n")

print("All done!")

Mastering a skill is much like honing a craft with a chisel: learning provides the foundational knowledge, but it is practice that shapes the masterpiece. In a world where knowledge is at our fingertips, the danger lies not in a lack of information, but in our failure to translate that information into action. Learning is an essential first step, akin to gathering the raw materials needed for a project; however, it is through rigorous practice that we refine those materials into something meaningful.

Think about it this way: learning serves as our intellectual toolbox, while practice is the actual construction work. When we engage in practical application, we’re not merely reviewing what we've learned; we’re embedding those ideas into our very being, making them second nature. This transformation is what separates the novices from the experts. For instance, a musician can study music theory for years, but without consistent practice, those notes on a page remain just that — notes on a

In [4]:
# Generating audio from  conversations with a topic you specify.

import json
import random

def conversation_audio(conversation):
    """Build the audio for one conversation: spoken topic, then every exchange.

    Args:
        conversation: Dict with a ``'topic'`` string and an ``'exchanges'``
            list; each exchange is a dict mapping a speaker name to the line
            they say. The lower-cased speaker name is used as the TTS voice.

    Returns:
        An ``AudioSegment`` starting with half a second of silence, followed
        by the topic read by a randomly chosen voice and all lines in order.
        The lines of the last exchange are followed by a randomly chosen
        notification sound.

    Raises:
        RuntimeError: If ``get_openai_tts_audio`` produces no file (it returns
            ``None`` when required resource files are missing).
    """
    temp_file_name = "temp~~.mp3"
    ending_file_name = "dot-ending.mp3"

    def _synthesize(line, voice, **tts_options):
        # Render one line to the scratch file, load it, and always delete the
        # scratch file afterwards.
        speech_path = get_openai_tts_audio(line, temp_file_name, voice, 1000, progress=False, **tts_options)
        if not speech_path:
            raise RuntimeError(f"No audio was generated for: {line!r}")
        try:
            return AudioSegment.from_file(speech_path)
        finally:
            if os.path.exists(speech_path):
                os.remove(speech_path)

    topic = conversation['topic'].strip()
    random_speaker = random.choice(["alloy", "nova"])

    # half a second of silence before the topic
    audio = AudioSegment.silent(duration=500)
    audio += _synthesize(topic, random_speaker, with_ending=False)

    sounds = ["resources/Basso.aiff", "resources/Blow.aiff", "resources/Bottle.aiff", "resources/Frog.aiff", "resources/Funk.aiff", "resources/Glass.aiff", "resources/Hero.aiff", "resources/Morse.aiff", "resources/Ping.aiff", "resources/Pop.aiff", "resources/Purr.aiff", "resources/Sosumi.aiff", "resources/Submarine.aiff", "resources/Tink.aiff"]
    sound = random.choice(sounds)

    try:
        # Convert the chosen sound to MP3 so it can serve as the ending clip.
        AudioSegment.from_file(sound).export(ending_file_name, format="mp3")

        exchanges = conversation['exchanges']
        last_position = len(exchanges) - 1
        # Track the position with enumerate: looking an exchange up by value
        # returns the first equal one, which is wrong when a line is repeated.
        for position, exchange in enumerate(exchanges):
            with_ending = position == last_position
            for speaker, line in exchange.items():
                audio += _synthesize(line.replace("—", " — "), speaker.lower(), with_ending=with_ending, ending_file=ending_file_name)
    finally:
        # Remove the converted ending clip even if synthesis failed part-way.
        if os.path.exists(ending_file_name):
            os.remove(ending_file_name)

    return audio

# user_prompt = """
# on homosexuality, Although we don't oppose homosexuality, we do have another concern—what if our child isn't naturally homosexual but is somehow influenced to become one? How should we view this situation, and how can we prevent it from happening?
# """

# System prompt for the conversation cell. The embedded example must itself be
# valid JSON (no trailing commas), since the reply is parsed with json.loads().
role_definition = """
You're my American English guru, 
Whatever topic or key words I give you,
You'll desing two conversations or debates with several exchanges between Alloy and Nova,
each of which is about 400 words long, and is completely different with each other.
Make sure to mix exchnages in conversation with positive, neutral, and negative reactions, which might well including specifically from: 
agreement, enthusiasm, support, admiration, acknowledgment, curiosity, clarification, reflection, disagreement, doubt, surprise, disapproval, confusion, indifference, frustration, sarcasm, etc.
Be creative to start every exchanges in conversation.
Use as diverse as possible expressions or idioms Americans use in daily conversations.
Return these conversations in json format, without code block such as ```json```.
Check the validity of the json format before returning it.

Example of a conversation:

{
  "conversations": [
    {
      "topic": "The Importance of Renewable Energy",
      "exchanges": [
        {
          "Alloy": "I really think we should invest more in renewable energy sources."
        },
        {
          "Nova": "I agree, but do you think it's feasible for all countries right now?"
        },
        {
          "Alloy": "Definitely! With the right policies and technological advancements, it can be achieved."
        },
        {
          "Nova": "True, but the initial costs might be a hurdle for some developing nations."
        },
        {
          "Alloy": "That's valid. Perhaps international support could help bridge that gap."
        }
      ]
    }
  ]
}
"""

# NOTE: user_prompt is not assigned in this cell (the override above is
# commented out), so the value left by the essay cell is used. Run that cell
# first or uncomment the override.
responds = get_response(user_prompt, role_definition, model="gpt-4o-mini")

# Parse the reply as JSON. In case the model wrapped it in a Markdown code
# fence despite the instructions, strip the fence first.
json_text = responds.strip()
if json_text.startswith("```"):
    json_text = re.sub(r'^```(?:json)?\s*|\s*```$', '', json_text)
conversations = json.loads(json_text)

# File name prefix: the first two words of the prompt plus a timestamp.
# Splitting on any whitespace keeps line breaks out of the file name.
topic_words = user_prompt.strip().rstrip(',.!?")').split()[:2]
whole_audio_filename_prefix = f"Conversation_{'-'.join(topic_words)}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

# dump json to a file
with open(f"{whole_audio_filename_prefix}.json", "w", encoding="utf-8") as file:
    json.dump(conversations, file)

# Print every conversation and write a Markdown transcript. UTF-8 is set
# explicitly because the text contains non-ASCII punctuation such as em dashes.
with open(f"{whole_audio_filename_prefix}.md", "w", encoding="utf-8") as file:
    for number, conversation in enumerate(conversations['conversations'], start=1):
        topic = conversation['topic']
        exchanges = conversation['exchanges']
        print(f"\nConversation {number}: {topic}")
        file.write(f"\n## Conversation {number}: {topic}\n")
        for exchange in exchanges:
            for speaker, text in exchange.items():
                print(f"{speaker}: {text}")
                spaced_text = text.replace("—", " — ")
                file.write(f"\n**{speaker}**: {spaced_text}\n")

print("\nGenerating the audio file...")
# One second of silence, then the audio of every conversation in order.
whole_audio = AudioSegment.silent(duration=1000)
for conversation in conversations['conversations']:
    whole_audio += conversation_audio(conversation)

whole_audio += AudioSegment.from_file("resources/ending.mp3")
whole_audio_filename = f"{whole_audio_filename_prefix}.mp3"
whole_audio.export(whole_audio_filename, format="mp3")

# The artwork lives in resources/, like the ending clip loaded above.
add_metadata(whole_audio_filename, 'resources/Artwork.png', 'tts', 'Daily Speech Training - Conversations', 'SPEECH')
print(f"\nAudio file saved as {whole_audio_filename}!")
# Last expression of the cell, so the notebook renders an audio player.
IPython.display.Audio(whole_audio_filename)


Conversation 1: The Balance Between Learning and Practicing
Alloy: You know, I believe that practicing is way more important than just learning.
Nova: That's an interesting take! But can we really downplay the importance of learning?
Alloy: Not entirely, but I think learning is just the starting point. Practice is where it counts.
Nova: I get that. It's like the old saying, 'you can lead a horse to water, but you can't make it drink.'
Alloy: Exactly! Just sitting in a classroom won't make you a pro. You’ve got to roll up your sleeves.
Nova: I agree, but don’t you think some foundational knowledge is necessary before you can even practice?
Alloy: Sure, but too much focus on learning can lead to overthinking. You just have to dive in!
Nova: That's a fair point. People often get stuck in analysis paralysis. But what about the risk of learning the wrong way?
Alloy: That’s where good mentorship comes in. They can guide your practice and correct any mistakes.
Nova: Valid argument! Perhaps t