In [1]:
%pip install pyttsx3 python-dotenv deepgram-sdk==3.* pydub sounddevice openai termcolor toml




In [18]:
import pyttsx3
from deepgram import (
    DeepgramClient,
    PrerecordedOptions,
    FileSource,
)
from datetime import datetime
from dotenv import load_dotenv

load_dotenv()

import sounddevice as sd
from pydub import AudioSegment
from pydub.playback import play

from openai import OpenAI

import toml
from termcolor import colored
import speech_recognition as sr

In [2]:
secrets = toml.load(open("secrets.toml", "r"))

DEEPGRAM_API_KEY = secrets['DEEPGRAM_API_KEY']
OPENAI_API_KEY = secrets['OPENAI_API_KEY']
PREDEFINED_PROMPT = secrets['PREDEFINED_PROMPT']

In [3]:
user_data = toml.load(open("user_config.toml", "r"))

NAME = user_data['NAME']
AGE = user_data['AGE']
GENDER = user_data['GENDER']

In [20]:
def record_and_save_to_mp3(file_path="recorded_audio.mp3", duration=4, sample_rate=44100):
    try:
        audio_data = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=2, dtype='int16')
        sd.wait()

        audio_segment = AudioSegment(
            audio_data.tobytes(),
            frame_rate=sample_rate,
            sample_width=audio_data.dtype.itemsize,
            channels=2
        )

        audio_segment.export(file_path, format="mp3")

        return file_path

    except Exception as e:
        print(f"Exception: {e}")

def speech_to_text(API_KEY, AUDIO_FILE):
    try:
        deepgram = DeepgramClient(API_KEY)

        with open(AUDIO_FILE, "rb") as file:
            buffer_data = file.read()

        payload: FileSource = {
            "buffer": buffer_data,
        }

        options = PrerecordedOptions(
            model="nova-2",
            smart_format=True,
        )

        response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)

        return response.results.channels[0].alternatives[0].transcript
        
    except Exception as e:
        print(f"Exception: {e}")
        return None

def speech_to_text2(AUDIO_FILE):
    recognizer = sr.Recognizer()
    
    try:
        text = recognizer.recognize_google(AUDIO_FILE)
        print(f"You: {text}")

        if not text:
            return

        text_list = text.split()
    
            
    except sr.UnknownValueError:
        pass
    except AssertionError:
        print(text_list)

def process_text(text, NAME, AGE, GENDER):
    if "time" in text:
        # print(colored(f"The current time is {datetime.now().strftime('%H:%M')}", "green"))
        return f"It's {datetime.now().strftime('%I:%M %p')} right now."
    
    if text == '' or len(text)<3:
        return "I'm sorry, I didn't catch that."
    
    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
        {"role": "system", "content": PREDEFINED_PROMPT.format(NAME, AGE, GENDER)},
        {"role": "user", "content": f"{text}"}
        ]
    )

    return response.choices[0].message.content

def text_to_speech(text, voice_id=0):
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')
    engine.setProperty('voice', voices[voice_id].id)
    engine.setProperty('rate', 150)
    engine.say(text)
    engine.runAndWait()

In [17]:
if __name__ == "__main__":
    file_path = record_and_save_to_mp3()
    # print(colored(f"Audio file saved at: {file_path}\n", "yellow"))
    
    text = speech_to_text(DEEPGRAM_API_KEY, file_path)
    print(colored(f"User: {text}", "green"))

    response = process_text(text, NAME, AGE, GENDER)
    print(colored(f"Limbo: {response}", "blue"))
    
    text_to_speech(response)

[32mUser: Who are you?[0m
[34mLimbo: Greetings Arpit Sengar, I am Limbo, your personal assistant from Hastakriti. I am here to assist you with any queries or tasks you may have. How may I assist you today?[0m


### --------------------------- TEST ---------------------------

In [36]:
import pydub
import speech_recognition as sr
from termcolor import colored

output_ = "recorded_audio_wav.wav"

def convert_to_wav(input_file, output_file):
    audio = pydub.AudioSegment.from_file(input_file)
    audio.export(output_file, format="wav")
    return output_file

def speech_to_text2(AUDIO_FILE, language='en-US'):  # Set default language to English
    recognizer = sr.Recognizer()

    try:
        with sr.AudioFile(AUDIO_FILE) as source:
            audio_data = recognizer.record(source)
            text = recognizer.recognize_google(audio_data, language=language)
            text_list = text.split()

            if not text:
                return

            return text

    except sr.UnknownValueError:
        pass
    except AssertionError:
        return text_list

if __name__ == "__main__":
    # Specify the language code for the desired language, for example, 'es-ES' for Spanish
    file_path = record_and_save_to_mp3(duration=3)
    wav_file = convert_to_wav(file_path, output_)
    
    # Specify the language code for the desired language, for example, 'es-ES' for Spanish
    text = speech_to_text2(wav_file, language='es-ES')
    
    print(colored(f"User: {text}", "green"))

[32mUser: ya me iría[0m


In [44]:
import time
def listen_and_respond(source, recognizer):
    start_time = time.time()
    print(start_time)
    while True:
        print(time.time())
        if time.time() - start_time > 5:
            break
        
        audio = recognizer.listen(source)

        try:
            text = recognizer.recognize_google(audio)
            print(f"You: {text}")

        except sr.UnknownValueError:
            print("Bot: Silence found, shutting up")
            break

        except sr.RequestError as e:
            print(f"Bot: Could not request results; {e}")
            break

def main():
    recognizer = sr.Recognizer()
    with sr.Microphone(device_index=2) as source:
        listen_and_respond(source, recognizer)

if __name__ == "__main__":
    main()

1708237908.2749238
1708237908.2749238


KeyboardInterrupt: 