In [1]:
!conda list

# packages in environment at C:\Users\anbey\.conda\envs\new_ml:
#
# Name                    Version                   Build  Channel
abseil-cpp                20211102.0           hd77b12b_0  
accelerate                0.23.0             pyhd8ed1ab_0    conda-forge
aiofiles                  22.1.0          py311haa95532_0  
aiohttp                   3.8.5           py311h2bbff1b_0  
aiosignal                 1.2.0              pyhd3eb1b0_0  
aiosqlite                 0.18.0          py311haa95532_0  
anyio                     3.5.0           py311haa95532_0  
appdirs                   1.4.4              pyh9f0ad1d_0    conda-forge
argon2-cffi               21.3.0             pyhd3eb1b0_0  
argon2-cffi-bindings      21.2.0          py311h2bbff1b_0  
arize                     7.6.0                    pypi_0    pypi
arize-phoenix             0.0.49             pyhd8ed1ab_0    conda-forge
arrow-cpp                 11.0.0               ha81ea56_2  
asttokens                 2.0.5           

#### Requirements

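- numpy
- sounddevice
- soundfile (install as pysoundfile)
- pydub
- ffmpeg (must be installed on the system; pydub needs it for MP3 export)
- keyboard
- openai
- pygame (used to play the spoken reply)
- aiohttp (probably not needed)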

#### Imports

In [73]:
import os
import queue
import sys
import threading

import keyboard
import numpy as np
import sounddevice as sd
import soundfile as sf # install as pysoundfile
from pydub import AudioSegment # also need ffmpeg to be installed

In [74]:
import openai
from openai import OpenAI

In [75]:
from pygame import mixer  # Load the popular external library

In [76]:
# Never hard-code the key in the notebook: read it from the environment so it
# is not saved or shared together with the file. A key that has already been
# committed or shared must be revoked and replaced.
APIKEY = os.environ.get("OPENAI_API_KEY")
if not APIKEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )
openai.api_key = APIKEY

In [77]:
# Client object used by the transcription, chat and text-to-speech calls below.
client = OpenAI(api_key=APIKEY)

## Record Audio

In [78]:
FS = 44100  # sample rate passed to the recording, playback and save calls
FILENAME = "last_question"  # file name stem; the helpers append ".wav", ".mp3" or "_reply.mp3"

In [26]:
def record_audio_fixed_length(duration=5, fs=FS):
    """
    Capture a single-channel clip of fixed length from the microphone.

    Parameters:
    duration: clip length; multiplied by fs to get the number of frames.
    fs: sample rate handed to sounddevice.

    Returns the array produced by sd.rec once the capture has finished.
    """
    print("Recording...")
    frame_count = int(duration * fs)
    samples = sd.rec(frame_count, samplerate=fs, channels=1)
    # Block until the capture has completed before handing the buffer back.
    sd.wait()
    print("Recording stopped.")
    return samples


In [42]:
recording = record_audio_fixed_length()

Recording...
Recording stopped.


In [43]:
sd.play(recording, FS)
# sf.write('tone.wav', recording, FS)

In [27]:
def record_audio(fs=44100):
    """
    Record audio from the microphone. Start and end recording with a spacebar.

    Parameters:
    fs (int): Sample rate requested from the input stream.

    Returns:
    numpy.ndarray: All captured audio blocks concatenated along the first axis.

    Raises:
    ValueError: If recording was stopped before any audio block arrived.
    """
    # The stream invokes the callback while this function is blocked waiting
    # for the keyboard, so captured blocks are handed over through a queue.
    audio_queue = queue.Queue()

    def callback(indata, frames, time, status):
        """This is called for each audio block."""
        if status:
            # Report stream status flags without interrupting the recording.
            print(status, file=sys.stderr)
        # Store a copy; presumably the indata buffer is only valid during the call.
        audio_queue.put(indata.copy())

    print("Press Space to start recording...")
    keyboard.wait('space')  # Wait until space is pressed to start recording

    # While the stream is open, every captured block lands in audio_queue.
    with sd.InputStream(samplerate=fs, channels=1, callback=callback):
        print("Recording... Press Space to stop.")
        keyboard.wait('space')  # Wait until space is pressed again

    # The stream is closed at this point, so nothing is adding to the queue.
    audio_blocks = []
    while not audio_queue.empty():
        audio_blocks.append(audio_queue.get())

    if not audio_blocks:
        raise ValueError(
            "No audio was captured; recording was stopped before the first block arrived."
        )

    # Concatenate all the audio chunks into one NumPy array
    return np.concatenate(audio_blocks)


In [31]:
def save_as_mp3(recording, filename, fs):
    """
    Save a recording as '<filename>.mp3'.

    The samples are first written to a temporary '<filename>.wav', which pydub
    then converts to MP3. The WAV file is removed afterwards, also when the
    conversion fails.

    Parameters:
    recording: audio samples accepted by soundfile.write.
    filename (str): output path without extension.
    fs (int): sample rate of the recording.
    """
    wav_path = filename + '.wav'
    mp3_path = filename + '.mp3'

    # Save the recording as a WAV file
    sf.write(wav_path, recording, fs)
    try:
        # Load the WAV file with pydub and export as MP3
        AudioSegment.from_wav(wav_path).export(mp3_path, format='mp3')
    finally:
        # Always delete the intermediate WAV file, even if the export raised.
        os.remove(wav_path)

In [29]:
recording = record_audio()  # Start recording

Press Space to start recording...
Recording... Press Space to stop.


In [61]:
sd.play(recording, FS)

In [32]:
save_as_mp3(recording, FILENAME, FS)

#### Recording in a separate thread

In [None]:
#
# Run the audio recording in a separate thread. record_audio() returns its
# result, so a small wrapper forwards the return value to the main thread
# through a queue.
#

result_queue = queue.Queue()


def _record_to_queue():
    """Thread target: record and hand the result over via result_queue."""
    result_queue.put(record_audio())


# Start recording in a separate thread
record_thread = threading.Thread(target=_record_to_queue)
record_thread.start()
record_thread.join()  # Wait for the recording thread to finish

# Retrieve the result from the queue. get_nowait() raises queue.Empty instead
# of blocking forever if the recording thread failed before producing a result.
recording = result_queue.get_nowait()

#### Extra operations

In [34]:
def save_recording_as_mp3(audio_data, fs=FS, file_name="recording.mp3"):
    """
    Save the recorded audio data as an MP3 file.

    The samples are peak-normalized and converted to 16-bit integer PCM before
    being handed to pydub, which interprets the raw bytes it receives as
    integer samples of the stated sample width.

    Parameters:
    audio_data (numpy.ndarray): The recorded audio data, shaped (frames,) or
        (frames, channels).
    fs (int): The sampling rate of the audio data.
    file_name (str): The name of the file to save the recording to.

    Raises:
    ValueError: If audio_data contains no samples.
    """
    samples = np.asarray(audio_data, dtype=np.float64)
    if samples.size == 0:
        raise ValueError("audio_data is empty; nothing to save")

    # Peak-normalize to the range -1..1. A silent recording (peak of 0) is
    # left untouched to avoid dividing by zero.
    peak = np.max(np.abs(samples))
    if peak > 0:
        samples = samples / peak

    # Scale to the int16 range; sample_width below is derived from this array
    # so that it always matches the bytes that are actually passed on.
    pcm = np.round(samples * np.iinfo(np.int16).max).astype(np.int16)
    channel_count = pcm.shape[1] if pcm.ndim > 1 else 1

    audio_segment = AudioSegment(
        pcm.tobytes(),  # C-order bytes: frames with their channels interleaved
        frame_rate=fs,
        sample_width=pcm.dtype.itemsize,
        channels=channel_count
    )

    # Export the audio segment to an MP3 file
    audio_segment.export(file_name, format="mp3")
    print(f"File saved as {file_name}")

In [40]:
# Machine-specific device indices taken from the sd.query_devices() listing below:
# 1 is a microphone array (input only) and 5 is a headphone output.
# Re-check the indices with sd.query_devices() before running on another machine.
sd.default.device = [1, 5]

In [19]:
sd.query_devices()

   0 Microsoft Sound Mapper - Input, MME (2 in, 0 out)
   1 Microphone Array (Intel® Smart , MME (4 in, 0 out)
   2 Headset Microphone (Oculus Virt, MME (1 in, 0 out)
   3 Headset (Soundcore Life Tune XR, MME (1 in, 0 out)
   4 Microsoft Sound Mapper - Output, MME (0 in, 2 out)
<  5 Headphones (Soundcore Life Tune, MME (0 in, 2 out)
   6 Headphones (Oculus Virtual Audi, MME (0 in, 2 out)
   7 Speakers (Realtek(R) Audio), MME (0 in, 2 out)
   8 Acer K272HUL (NVIDIA High Defin, MME (0 in, 2 out)
   9 Output 1 (OCULUSVAD Wave Speaker Headphone), Windows WDM-KS (0 in, 2 out)
  10 Output 2 (OCULUSVAD Wave Speaker Headphone), Windows WDM-KS (0 in, 2 out)
  11 Input (OCULUSVAD Wave Speaker Headphone), Windows WDM-KS (2 in, 0 out)
  12 Headset Microphone (OCULUSVAD Wave Microphone Headphone), Windows WDM-KS (1 in, 0 out)
  13 Speakers (Nahimic mirroring Wave Speaker), Windows WDM-KS (0 in, 2 out)
  14 Microphone Array 1 (), Windows WDM-KS (2 in, 0 out)
  15 Microphone Array 2 (), Windows WDM-K

## Speech to Text

In [16]:
def Mp3Transcribe(filename):
    """
    Transcribe '<filename>.mp3' with the whisper-1 model.

    Parameters:
    filename (str): Path of the MP3 file without the '.mp3' extension.

    Returns:
    The result of client.audio.transcriptions.create; with
    response_format "text" this is the transcription as a plain string.
    """
    # The context manager closes the file handle once the request is done,
    # including when the API call raises.
    with open(filename + ".mp3", "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text",
        )

    return transcript

In [17]:
Mp3Transcribe(FILENAME)

'I have a new cat. She is cute and fluffy and I want to give her a good name. Please give me a few suggestions for nice cat names.\n'

## AI Answer

#### Main Function

In [40]:
test_text = 'I have a new cat. She is cute and fluffy and I want to give her a good name. Please give me a few suggestions for nice cat names.\n'

In [41]:
# models: gpt-4, gpt-4-turbo, gpt-3.5-turbo, text-davinci-003

# Message roles used in a chat request:
# - system: helps set the behavior of the assistant, e.g. “You are a helpful assistant.”
# - user: instructs the assistant; can be generated by the end users of an application, or set by a developer as an instruction.
# - assistant: stores prior responses; can also be written by a developer to give examples of desired behavior.

# NOTE(review): the prompt below is sent to the model verbatim and contains the typo "YOur".
system_message = \
"""
You are a helpful AI assistant from speech to speech. User input is coming from speech to text transcriber and may contain errors due to incorrect speech detection.
YOur answer will be encoded into speech, keep it short but conversational. Unless user specifically requests to be detailed try to keep answer under a 100 words.
"""

In [42]:
def QueryLLM(question):
    """
    Send a question to the chat model and return the text of its reply.

    Parameters:
    question (str): The user's question (here: the transcribed speech).

    Returns:
    The message content of the first choice in the completion.
    """
    parameters = {
        'model': 'gpt-3.5-turbo',
        'messages': [
            {"role": "system", "content": system_message},
            {"role": "user", "content": question},
            # NOTE(review): this empty assistant turn looks unnecessary; kept
            # so the request stays the same as before. Confirm and remove.
            {"role": "assistant", "content": ""},
        ],
    }

    completion = client.chat.completions.create(**parameters)

    # Read the reply from the object that was just returned. The previous code
    # read from an undefined name `reply`, which only worked when a variable of
    # that name was left over in the kernel.
    return completion.choices[0].message.content

In [43]:
QueryLLM(test_text)

'Of course! Congratulations on your new furry friend. Here are a few cute and fluffy cat name suggestions: \n\n1. Bella\n2. Luna\n3. Oliver\n4. Coco\n5. Teddy\n6. Daisy\n7. Milo\n8. Lily\n9. Charlie\n10. Sophie\n\nI hope you find the perfect name for your adorable new cat! Enjoy your time together.'

## Text to Speech

In [51]:
text_reply = \
"""
Of course! Congratulations on your new furry friend. Here are a few cute and fluffy cat name suggestions: \n\n1. Bella\n2. Luna\n3. Oliver\n4. Coco\n5. Teddy\n6. Daisy\n7. Milo\n8. Lily\n9. Charlie\n10. Sophie\n\nI hope you find the perfect name for your adorable new cat! Enjoy your time together.
"""

In [54]:
def text_preprocessing(text):
    """Turn every newline into a single space and trim surrounding whitespace."""
    single_line = " ".join(text.split("\n"))
    return single_line.strip()

In [57]:
def TTS_Reply(text_reply, filename):
    """
    Synthesize text_reply as speech and store it as filename + "_reply.mp3".

    Parameters:
    text_reply: the text to generate audio for (the API limit is 4096 characters).
    filename: file name stem of the output file.
    """
    speech_request = dict(
        model="tts-1-hd",  # alternatives: tts-1, tts-1-hd
        voice="echo",  # alternatives: alloy, echo, fable, onyx, nova, shimmer
        input=text_reply,
    )
    speech = client.audio.speech.create(**speech_request)
    speech.stream_to_file(filename + "_reply.mp3")

In [95]:
def play_reply(filename):
    """
    Play filename + "_reply.mp3" through the pygame mixer.

    Playback is started and the function returns; the file is left on disk.
    """
    reply_path = filename + "_reply.mp3"

    mixer.init()
    player = mixer.music
    player.load(reply_path)
    player.play()

In [59]:
TTS_Reply(text_preprocessing(text_reply), FILENAME)

In [96]:
play_reply(FILENAME)

## Putting it all together

In [57]:
def SpeechToAnswer():
    """
    Run one full voice round trip: record a question, transcribe it, query the
    LLM, then synthesize and play the spoken reply.

    Returns the LLM's text answer as received (before speech preprocessing).
    """
    # Capture the question and store it under the FILENAME stem as MP3.
    save_as_mp3(record_audio(), FILENAME, FS)

    print("Transcribing")
    question_text = Mp3Transcribe(FILENAME)

    print("Sending question to LLM")
    answer = QueryLLM(question_text)
    print("Answer from LLM:", answer, sep="\n")

    # Speak the answer: flatten it for TTS, synthesize, then play it back.
    spoken_text = text_preprocessing(answer)
    TTS_Reply(spoken_text, FILENAME)
    play_reply(FILENAME)

    return answer

## TEMP

In [65]:
import inspect

In [66]:
# Scratch exploration: print every member of openai.Audio that is itself a class.
for name, obj in inspect.getmembers(openai.Audio):
    if inspect.isclass(obj):
        print (obj)

<class 'type'>


In [None]:
openai.api_resources.chat_completion.ChatCompletion