In [None]:
import os
from datetime import datetime
from openai import AzureOpenAI

############# Update the following with your Azure OpenAI and Speech keys #############

# Azure OpenAI client. Credentials are read from the environment first so that
# real keys never need to be written into the notebook; the placeholder strings
# are only fallbacks and must be replaced (or the variables set) before running.
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY", "<azure_openai_key>"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT", "<azure_openai_endpoint>"),
    api_version="2024-02-15-preview",
)

# Value passed as `model` to the chat completion call.
# NOTE(review): on Azure this is presumably the deployment name - confirm in the portal.
AOAI_model = "gpt-35-turbo-0125"  # "gpt-35-turbo-16k"  #"gpt4"

# Azure Speech credentials, used by both the recognizer and the synthesizer.
speech_key = os.getenv("SPEECH_API_KEY", "<Azure_speech_key>")
service_region = os.getenv("SPEECH_REGION", "<speech_region>")  # e.g. southeastasia


# Active speech configuration: "stt" is the recognition locale, "tts" is the
# synthesis voice name. Swap in one of the alternative pairs below to change
# the conversation language.
#
#   Chinese (Simplified): stt "zh-CN", tts "zh-CN-XiaoxiaoMultilingualNeural" (or "zh-CN-XiaoyanNeural")
#   Malay:                stt "ms-MY", tts "ms-MY-YasminNeural"
languages = dict(
    stt="en-SG",
    tts="en-US-AvaMultilingualNeural",  # alternative: "en-SG-LunaNeural"
)


## User Profile

In [None]:
# Demo customer record; every field is interpolated into the system prompt.
# The preferred language mirrors the recognition locale chosen in `languages`.
preferred_language = languages["stt"]

user_profile = {
    "Name": "Mandy",
    "Outstanding Debt Product": "Credit Card",
    "Outstanding Debt Amount": "USD 500",
    "Date to make payment": "2024-04-18",
    "Minimum Payment": "USD 50",
    "Preferred Language": preferred_language,
}

## System Prompt

In [None]:
# System message for the chat model. It is an f-string, so the customer fields
# from `user_profile` and the current date are baked in at the moment this cell
# runs; re-run the cell after changing `user_profile` (or on a new day) to
# refresh them. The text itself is runtime input to the model - edit with care.
system_prompt = f"""As a voice assistant for a RichRichMoney bank's debt collection department, your goals are to inform customers about their outstanding debts and inquire about their payment plans. Follow the conversation Flow and Guidelines provided.

Flow:
- Announce that the conversation is AI-generated.
- Greet the customer on behalf of the bank and confirm their identity.
- Once confirmed, inform them about their outstanding debt on a specific product that is due on given date. Do not disclose the amount unless customer asked. Never offer customer to check on the due amount.
- Inquire about the payment date and amount, asking if they plan to make a minimum or full payment.
- If they agree to pay, thank them and conclude the conversation.
- If they refuse, ask why and inform them a human agent will contact them.
- If objectives are met, thank the customer and end the conversation.
- If the customer is uncooperative, inform them a human agent will contact them and end the conversation.
- Use "Bye" to end the conversation.

Guidelines:
- Maintain a polite, professional tone.
- Use simple, short and concise language.
- Respond in the customer's preferred language (English, Malay, or Chinese Simplified).
- Keep the conversation focused on debt collection.
- Do not disclose personal information.
- You can disclose which bank you are representing.
- Understand the customer's intent from their transcribed input. If unclear, repeat and confirm.
- Do not ask/offer any help or assistant needed.


Sensitive Customer Information:
Name: {user_profile["Name"]}
Outstanding Debt Product: {user_profile["Outstanding Debt Product"]}
Outstanding Debt Amount: {user_profile["Outstanding Debt Amount"]}
Date to make payment: {user_profile["Date to make payment"]}
Minimum Payment: {user_profile["Minimum Payment"]}
Preferred Language: {user_profile["Preferred Language"]}


Current Context:
Current Date: {datetime.now().strftime('%Y-%m-%d, %A')}
Location: Kuala Lumpur, Malaysia


Self check whether is following the Flow and Guidelines strictly.
"""

## Initial Greeting

In [None]:
# Opening line spoken by the assistant, chosen by the recognition locale.
# Any locale other than zh-CN / ms-MY (e.g. en-SG) falls back to English.
if languages["stt"] == "zh-CN":
    assistant_first_prompt=f"您好，我是RichRichMoney银行的人工智能语音助手。 我正在是和 {user_profile['Name']} 说话吗?"
elif languages["stt"] == "ms-MY":
    assistant_first_prompt=f"Hai, saya ialah pembantu suara yang dijana AI daripada bank RichRichMoney. Adakah saya bercakap dengan {user_profile['Name']} ?"
else:
    assistant_first_prompt=f"Hi, I am a AI generated voice assistant from a RichRichMoney bank. Am I speaking to {user_profile['Name']} ?"

## Speech to Text - Continuous (stops after 3 s of silence or when a stop word is detected)

In [None]:
import os
import azure.cognitiveservices.speech as speechsdk
import time 

# Seconds without a new recognized phrase after which the turn is considered finished.
PAUSE_SECONDS = 3

def speech_recognize_continuous_from_mic():
    """Capture one spoken turn from the default microphone and return its text.

    Continuous recognition runs until one of the following happens:
      * the recognizer reports a session-stopped or canceled event,
      * a recognized phrase contains the stand-alone stop word "AI"
        (that phrase is discarded, not appended to the result), or
      * at least one phrase has been recognized and no further phrase arrives
        for more than ``PAUSE_SECONDS`` seconds.

    Uses the module-level ``speech_key``, ``service_region`` and ``languages``.

    Returns:
        str: the recognized phrases concatenated in arrival order; may be
        empty if nothing usable was recognized.
    """
    import re  # local import: only needed for the stop-word match below

    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.speech_recognition_language = languages["stt"] #"en-SG" #"ms-MY" #"zh-CN" #"en-US"

    audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    capture_sentence = ""
    recognized_time  = time.time()
    start_detect_silent = False

    # Match "AI" only as its own token. A plain substring test would also fire
    # on words such as "paid", "wait" or "again" and throw the utterance away.
    # ASCII look-arounds are used (instead of \b) so the token is still found
    # when it is directly adjacent to non-Latin characters.
    stop_word = re.compile(r"(?<![a-z])ai(?![a-z])", re.IGNORECASE)

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def recognized (evt):
        """callback for each finalized phrase: detect the stop word or accumulate the text"""
        nonlocal capture_sentence, recognized_time, start_detect_silent
        print('RECOGNIZED: {}'.format(evt))
        if stop_word.search(evt.result.text):
            print('AI detected, stopping')
            # Only raise the flag here; the recognizer itself is stopped from
            # the main thread below, not from inside an SDK callback.
            stop_cb(evt)
            return
        capture_sentence += evt.result.text
        print(capture_sentence)
        start_detect_silent = True
        recognized_time = time.time()

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt.result.text)))
    speech_recognizer.recognized.connect(recognized)
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    try:
        while not done:
            time.sleep(.5)
            pause_time = time.time()
            # The silence timer only starts once something has been recognized.
            if start_detect_silent and pause_time - recognized_time > PAUSE_SECONDS:
                print(f"No sound detected for {PAUSE_SECONDS} seconds, stopping")
                stop_cb("")
                break
    finally:
        # Always release the recognizer, whichever path ended the loop
        # (silence timeout, stop word, cancellation, or an interrupt).
        speech_recognizer.stop_continuous_recognition()

    print("#" * 8)
    print (capture_sentence)
    print("#" * 8)
    return capture_sentence


## OpenAI - Generate Answer

In [None]:
# Conversation history sent to the chat model on every turn: the system
# instructions followed by the assistant's opening line.
memory = [
    dict(role="system", content=system_prompt),
    dict(role="assistant", content=assistant_first_prompt),
]

In [None]:
# Define the Azure OpenAI language generation function

def generate_answer(prompt):
    """Send the user's utterance to the chat model and return the reply text.

    The utterance and the reply are both appended to the module-level
    ``memory`` list so that later turns see the whole conversation.

    Args:
        prompt: transcribed user text, or ``None`` to do nothing.

    Returns:
        The assistant's reply as a string (empty if the service returned no
        text content), or ``None`` when ``prompt`` is ``None``.

    Raises:
        Whatever the OpenAI client raises for a failed request; in that case
        the user turn is removed from ``memory`` again before re-raising.
    """
    if prompt is None:
        return None

    memory.append({"role": "user", "content": prompt})

    try:
        response = client.chat.completions.create(
            model=AOAI_model,
            messages=memory,
            temperature=0,
            max_tokens=800,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
        )
    except Exception:
        # Roll back the user turn so the history does not keep an unanswered
        # message that would be re-sent on the next attempt.
        memory.pop()
        raise

    # `content` can be None; normalise to "" so that neither the stored
    # history nor the caller's string handling has to deal with None.
    answer = response.choices[0].message.content or ""
    memory.append({"role": "assistant", "content": answer})
    print("*" * 8)
    print(f"generated answer: {answer}")
    print("*" * 8)
    return answer

## Text to Speech - Microsoft

In [None]:
# Define the text-to-speech function
def text_to_speech(text):
    """Synthesize ``text`` with Azure Speech and save it to ``output_audio.wav``.

    Uses the module-level ``speech_key``, ``service_region`` and the voice
    configured in ``languages["tts"]``.
    NOTE(review): no audio config is passed to the synthesizer, so it
    presumably also plays through the default speaker - confirm.

    Args:
        text: the text to speak.

    Returns:
        bool: True if synthesis completed and the file was written,
        False on any synthesis error or exception.
    """
    voice_name = languages["tts"]
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Voice names look like "<lang>-<REGION>-<Name>"; the locale is the first
    # two parts. (A fixed [:6] slice would keep the trailing hyphen: "en-US-".)
    speech_config.speech_synthesis_language = "-".join(voice_name.split("-")[:2])

    # Set up the voice configuration
    speech_config.speech_synthesis_voice_name = voice_name #"en-NZ-MollyNeural"
    # Output format reference:
    # https://learn.microsoft.com/en-us/javascript/api/microsoft-cognitiveservices-speech-sdk/speechsynthesisoutputformat?view=azure-node-latest
    # If this is switched to an MP3 format (e.g. Audio16Khz32KBitRateMonoMp3),
    # change the output file extension below to match.
    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Riff16Khz16BitMonoPcm)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config)
    try:
        result = speech_synthesizer.speak_text_async(text).get()
        print(result)
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            print(f"Error synthesizing audio: {result}")
            return False

        print("Text-to-speech conversion successful.")
        with open("output_audio.wav", "wb") as audio_file:
            audio_file.write(result.audio_data)
        return True
    except Exception as ex:
        # Best-effort: report the problem and let the caller carry on.
        print(f"Error synthesizing audio: {ex}")
        return False
# text_to_speech("how are you doing?")

## OpenAI voice (Optional)

In [None]:
def generate_speech_with_openai_tts(tts_text, output_path="output1.mp3"):
    """Synthesize ``tts_text`` with the OpenAI TTS deployment and save it as MP3.

    Args:
        tts_text: the text to speak.
        output_path: where to write the audio; defaults to ``"output1.mp3"``,
            the file the main loop plays back.

    Returns:
        None. The audio is written to ``output_path``.
    """
    # Stream the response body straight to disk; calling stream_to_file on a
    # non-streaming response is deprecated in the openai SDK.
    with client.audio.speech.with_streaming_response.create(
        model="tts",  # check the azure portal for the deployment name
        voice="nova",  # "alloy",
        input=tts_text,
    ) as response:
        response.stream_to_file(output_path)

In [None]:
import pygame
def play_mp3(file_path):
    """Play an audio file through pygame's mixer and block until it finishes.

    Args:
        file_path: path of the audio file to play.

    The mixer is always shut down afterwards, even if loading or playback
    raises, so the file is released and can be overwritten by the next turn.
    """
    pygame.mixer.init()
    try:
        pygame.mixer.music.load(file_path)
        pygame.mixer.music.play()

        # Poll about ten times per second until playback has finished.
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)
    finally:
        # release the audio file after playing
        pygame.mixer.quit()

# Replace 'your_file.mp3' with your actual file path
# play_mp3('output1.mp3')

## Actions

In [None]:
# Reset the conversation: run this cell to discard all previous turns and
# start again from the system instructions plus the assistant's opening line.
memory = [
    # {"role": "system", "content": "You are an AI assistant that helps people find information. Answer it short and concise."},
    dict(role="system", content=system_prompt),
    dict(role="assistant", content=assistant_first_prompt),
]

### Press the space bar to start each turn; re-run the cell above to reset the conversation memory

In [None]:
import keyboard
import time

# Interactive conversation loop.
#   space : first press plays the greeting, every later press records one user
#           turn, sends it to the model and speaks the reply
#   esc   : quit
# The loop also ends when the assistant's reply contains "bye" / "再见".

first_time = True

#################### Microsoft TTS or OpenAI TTS ####################
text_to_speech_option = "openai" # "msft" or "openai"
#################### Microsoft TTS or OpenAI TTS ####################

# Fail fast on a typo instead of hitting an undefined timing variable later.
if text_to_speech_option not in ("msft", "openai"):
    raise ValueError(f"text_to_speech_option must be 'msft' or 'openai', got {text_to_speech_option!r}")

try:
    while True:
        if keyboard.is_pressed('space'):  # if spacebar is pressed
            if first_time:
                # First press: only speak the opening line, do not listen yet.
                if text_to_speech_option == "msft":
                    text_to_speech(assistant_first_prompt)
                elif text_to_speech_option == "openai":
                    generate_speech_with_openai_tts(assistant_first_prompt)
                    play_mp3("output1.mp3")

                first_time = False
                continue

            speaker_text = speech_recognize_continuous_from_mic()
            if not speaker_text or not speaker_text.strip():
                # Nothing usable was heard; do not send an empty turn to the model.
                print("No speech captured, press space to try again")
                continue

            # time to generate text and speech
            gen_start_time = time.time()
            answer = generate_answer(speaker_text)
            gen_end_time = time.time()
            print(f"time openai gen:{gen_end_time - gen_start_time  }")

            if not answer:
                # No reply text: nothing to speak and nothing to scan for "bye".
                print("No answer generated, press space to try again")
                continue

            if text_to_speech_option == "msft":
                ### Microsoft AI TTS
                tts_start_time = time.time()
                text_to_speech(answer)
                tts_stop_time = time.time()
                print(f"time tts gen:{tts_stop_time - tts_start_time }")
            elif text_to_speech_option == "openai":
                ### OpenAI TTS
                tts_start_time = time.time()
                generate_speech_with_openai_tts(answer)
                tts_stop_time = time.time()
                # Playback is deliberately outside the timed section.
                play_mp3("output1.mp3")
                print(f"time tts gen:{tts_stop_time - tts_start_time }")

            diff_time = tts_stop_time - gen_start_time

            # print(memory)
            print(f'time taken for text gen and speech gen:{diff_time} seconds')

            # The system prompt tells the model to close the call with "Bye".
            if '再见' in answer.lower() or 'bye' in answer.lower():
                break
        elif keyboard.is_pressed('esc'):
            break
        else:
            # Idle: yield the CPU briefly instead of busy-spinning on key polls.
            time.sleep(0.02)
except KeyboardInterrupt:
    print("Stopped by User")