In [None]:
import os
import base64
import requests
import uuid
import time
import logging
from dotenv import load_dotenv
import gradio as gr
import openai
from langchain import LLMChain, PromptTemplate
from langchain.memory import ConversationBufferWindowMemory
from langchain.chat_models import ChatOpenAI

In [None]:
# Load environment variables from a .env file. override=True lets values found
# there replace variables that are already set in the process environment.
load_dotenv(override=True)

In [None]:
# OpenAI credentials, read from the environment (a missing variable raises KeyError).
openai.organization =  os.environ["OPENAI_ORGANIZATION"]
openai.api_key = os.environ["OPENAI_API_KEY"]
# ElevenLabs voice id; it is interpolated into the text-to-speech URL below.
voice_id =  os.environ["VOICE_ID"]
# ElevenLabs API configuration
CHUNK_SIZE = 1024  # chunk size used when writing the synthesized audio to disk
url = "https://api.elevenlabs.io/v1/text-to-speech/{}".format(voice_id)
headers = {
    # NOTE(review): the variable name is spelled 'ELVEVEN_API_KEY'. It has to match
    # the name used in the .env file exactly -- confirm the spelling is intentional.
    "xi-api-key": os.environ['ELVEVEN_API_KEY']
}

# D-ID authentication data: "username:password" is base64-encoded and sent as an
# HTTP Basic Authorization header on every D-ID request.
username = os.environ["DID_USERNAME"]
password = os.environ["DID_PASSWORD"]
base64_credentials = base64.b64encode(f"{username}:{password}".encode()).decode()
headers_base = {
    "accept": "application/json",
    "Authorization": f"Basic {base64_credentials}"
}

# Template for the assistant's behavior.
# The Spanish instructions ask the model to act as an AI expert who always defends
# the use of AI and answers briefly and optimistically. {history} and {human_input}
# are the two placeholders declared as input variables of the PromptTemplate below.
template = """Actua como un experto en IA, siempre defindes uso de la IA y vez el lado positivo de la AI y su uso para solucionar multiples problemas, ademas respondes de una manera muy breve y concisa y con una perspectiva muy positiva y esperanzadora

{history}
Human: {human_input}
Assistant:"""

prompt = PromptTemplate(input_variables=["history", "human_input"], template=template)

# Conversation chain used by generate_llm_response. The windowed memory (k=4)
# supplies the {history} placeholder; presumably it keeps only the most recent
# 4 exchanges -- confirm against the LangChain version in use.
chatgpt_chain = LLMChain(
    llm=ChatOpenAI(model = 'gpt-3.5-turbo',temperature=.8,max_tokens=300),
    prompt=prompt,
    verbose=True,
    memory=ConversationBufferWindowMemory(k=4),
)

In [None]:

def transcribe_audio_to_text(recording):
    """Transcribe a recorded audio file with OpenAI's ``whisper-1`` model.

    Args:
        recording: Path of the audio file; it is opened in binary mode and
            sent to the transcription endpoint.

    Returns:
        The transcribed text, or an empty string if opening the file or the
        transcription call fails (the error is logged, not raised).
    """
    try:
        with open(recording, "rb") as source:
            whisper_result = openai.Audio.transcribe("whisper-1", source)
        text = whisper_result['text']
    except Exception as err:
        logging.error(f"Error transcribing audio: {err}")
        text = ""
    return text

def generate_llm_response(transcribed_text, messages):
    """Ask the conversation chain for a reply and extend the running transcript.

    Args:
        transcribed_text: The user's utterance as text.
        messages: List of transcript lines; it is extended in place with the
            user's line and the assistant's line.

    Returns:
        A ``(chat_transcription, output)`` tuple: the whole transcript joined
        with ``"\\n "`` and the assistant's latest reply. On any error the
        failure is logged and ``("", "")`` is returned.
    """
    try:
        reply = chatgpt_chain.predict(human_input=transcribed_text)
        # Build both lines before touching the list so a failure leaves it unchanged.
        user_line = 'Alarcón: ' + transcribed_text
        assistant_line = 'Alarcón pro IA: ' + reply
        messages.extend((user_line, assistant_line))
        return "\n ".join(messages), reply
    except Exception as err:
        logging.error(f"Error generating LLM response: {err}")
        return "", ""

def convert_text_to_audio(text):
    """Synthesize ``text`` to speech with the ElevenLabs API and save it to disk.

    Args:
        text: The text to be spoken.

    Returns:
        The path of the written audio file (``'output_.mp3'``), or ``None`` when
        the text is empty or the request / file write fails (the error is logged).
    """
    # Nothing to synthesize (e.g. the LLM step failed and returned ""): do not
    # spend an API call on it, report the failure to the caller instead.
    if not text or not str(text).strip():
        logging.error("Error converting text to audio: empty text")
        return None
    try:
        data = {
            "text": text,
            "model_id": "eleven_multilingual_v1",
            "voice_settings": {
                "stability": 0.45,
                "similarity_boost": 0.85,
                "use_speaker_boost": True
            }
        }
        response = requests.post(url, json=data, headers=headers)
        # Fail on HTTP errors; otherwise the error payload returned by the API
        # would be written to disk and handed on as if it were audio.
        response.raise_for_status()
        audio_path = 'output_.mp3'
        with open(audio_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:
                    f.write(chunk)
        return audio_path
    except Exception as e:
        logging.error(f"Error converting text to audio: {e}")
        return None


In [None]:
def upload_did_audio(filename):
    """Upload a local audio file to D-ID and return the URL D-ID assigns to it.

    Args:
        filename: Path of the audio file to upload.

    Returns:
        The value of the ``url`` field of the JSON response, or an empty string
        when the path is empty, the file cannot be read, the request fails or
        the response has no ``url`` (errors are logged, not raised).
    """
    if not filename:
        logging.error("Error uploading audio to D-ID: no audio file given")
        return ""
    try:
        url_audio = "https://api.d-id.com/audios"
        # Open the file in a context manager so the handle is closed even when
        # the upload raises.
        with open(filename, "rb") as audio_file:
            # NOTE(review): the third tuple element is the part's content type;
            # "." is kept from the original -- confirm whether a real MIME type
            # such as "audio/mpeg" should be sent instead.
            files = { "audio": (filename, audio_file, ".") }
            response_upload = requests.post(url_audio, files=files, headers=headers_base)
        response_upload.raise_for_status()
        audio_url = response_upload.json().get('url', '')
        return audio_url
    except Exception as e:
        logging.error(f"Error uploading audio to D-ID: {e}")
        return ""

def did_avatar(audio_url, poll_interval=1, max_wait=300):
    """Create a D-ID talking-avatar video for an uploaded audio and wait for it.

    A talk is created from the fixed source image and ``audio_url``; the talk
    is then polled until its JSON contains ``result_url``.

    Args:
        audio_url: URL of the audio to lip-sync (as returned by the upload step).
        poll_interval: Seconds to sleep between status polls.
        max_wait: Maximum number of seconds to wait for the video before
            giving up, so a talk that never completes cannot block forever.

    Returns:
        The URL of the rendered video, or an empty string when the input is
        empty, a request fails, the talk reports a failure, or the wait times
        out (errors are logged, not raised).
    """
    if not audio_url:
        logging.error("Error generating avatar with D-ID: missing audio URL")
        return ""
    try:
        url_talks = "https://api.d-id.com/talks"
        data = {
            "source_url": "https://create-images-results.d-id.com/google-oauth2%7C110735661733404113770/upl_rXLLDq6unvtWbkA5q3Cc1/image.png",
            "script": {
                "type": "audio",
                "audio_url": audio_url
            }
        }
        headers_talk = {
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Basic {base64_credentials}"
        }
        response_talk = requests.post(url_talks, json=data, headers=headers_talk)
        response_talk.raise_for_status()
        request_video_id = response_talk.json().get('id', '')
        if not request_video_id:
            # Without an id there is nothing to poll.
            raise ValueError("D-ID response did not include a talk id")
        url_video_talks = f"https://api.d-id.com/talks/{request_video_id}"

        deadline = time.monotonic() + max_wait
        while True:
            response_video = requests.get(url_video_talks, headers=headers_base)
            response_video.raise_for_status()
            talk = response_video.json()
            video_url = talk.get('result_url')
            if video_url:
                return video_url
            # NOTE(review): 'error' / 'rejected' are taken to be D-ID's terminal
            # failure statuses -- confirm against the API reference.
            status = talk.get('status')
            if status in ('error', 'rejected'):
                raise RuntimeError(f"D-ID talk ended with status '{status}'")
            if time.monotonic() >= deadline:
                raise TimeoutError(f"video not ready after {max_wait} seconds")
            time.sleep(poll_interval)
    except Exception as e:
        logging.error(f"Error generating avatar with D-ID: {e}")
        return ""

def download_video(video_url):
    """Download a video to a uniquely named local ``.mp4`` file.

    Args:
        video_url: URL of the video to fetch.

    Returns:
        The name of the file written (``video_<uuid4>.mp4``), or an empty
        string if the request or the write fails (the error is logged).
    """
    target = 'video_{}.mp4'.format(uuid.uuid4())
    try:
        reply = requests.get(video_url)
        reply.raise_for_status()
        with open(target, 'wb') as sink:
            sink.write(reply.content)
    except Exception as err:
        logging.error(f"Error downloading video: {err}")
        return ""
    return target


In [None]:
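# Manual smoke test, kept commented out. avatar_talking is defined in a later cell,
# so this call only works after that cell has been executed.
#avatar_talking("flagged/audio/5d0656b0bac11c148529d7c75a50c60b3fe6f2f1/tmpu8inxq6k.wav", "pro.png")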

In [None]:
messages = []  # transcript lines accumulated across calls


def avatar_talking(audio, img):
    """Run the full pipeline: speech -> text -> LLM reply -> speech -> avatar video.

    Args:
        audio: The recorded audio, passed on to transcribe_audio_to_text.
        img: Value of the image input of the UI. It is not used by the
            pipeline; the parameter exists because the interface has two inputs.

    Returns:
        ``(video_path, transcript)`` on success. On any failure the error is
        logged and ``(None, "An error occurred. Please try again.")`` is
        returned; ``None`` (rather than an empty path) signals "no video".
    """
    try:
        # Each helper reports failure with an empty/None value instead of
        # raising, so every step is checked before the next one runs.
        text_transcribe = transcribe_audio_to_text(audio)
        if not text_transcribe:
            raise Exception("Error transcribing audio.")

        llm_response, last_response = generate_llm_response(text_transcribe, messages)
        if not last_response:
            raise Exception("Error generating LLM response.")

        audio_path = convert_text_to_audio(last_response)
        if not audio_path:
            raise Exception("Error converting text to audio.")

        audio_url = upload_did_audio(audio_path)
        if not audio_url:
            raise Exception("Error uploading audio.")

        video_url = did_avatar(audio_url)
        if not video_url:
            raise Exception("Error generating video avatar.")

        video_path = download_video(video_url)
        if not video_path:
            raise Exception("Error downloading video.")

        return video_path, llm_response
    except Exception as e:
        logging.error(f"Error in avatar_talking: {e}")
        return None, "An error occurred. Please try again."


In [None]:
default_image_path = "pro.png"  # Path to the default image

# Gradio interface: microphone audio (plus an image) in, avatar video and
# conversation transcript out.
ui = gr.Interface(
    fn=avatar_talking,
    inputs=[
        # type="filepath" hands the recording to avatar_talking as a path on disk,
        # which is what transcribe_audio_to_text open()s. The component's default
        # type passes (sample_rate, numpy array) instead, which cannot be opened.
        gr.Audio(source="microphone", type="filepath"),
        gr.Image(label="Image", value=default_image_path, height=250, width=250)  # Default image set here
    ],
    outputs=[
        gr.Video(label="Generated Audio", autoplay=True),
        gr.Textbox(label="Transcript")
    ],
    live=False  # Run only when the user submits, not on every input change
)


In [None]:
# Start the Gradio app defined above, with Gradio's debug option enabled.
ui.launch(debug=True)