Installing Necessary Dependencies

In [1]:
# Speech-to-text model (OpenAI Whisper), installed straight from its GitHub repository.
!pip install -q git+https://github.com/openai/whisper.git
# Gradio provides the browser UI used for the microphone demo.
!pip install -q gradio
# OpenAI client used to request the chatbot reply.
!pip install -q openai
# gTTS turns the chatbot reply into spoken audio.
!pip install -q gTTS
# librosa (development version from GitHub) loads audio and computes MFCC features.
!pip install -q git+https://github.com/librosa/librosa
!pip install -q pydub
# NOTE(review): resampy is presumably required by librosa's res_type='kaiser_fast' resampling — confirm.
!pip install -q resampy
# Create a 10-second silent mono MP3 named Temp.mp3. The same file name is later used as the
# initial value of the audio output widget and is overwritten with each spoken reply.
# NOTE(review): the file is written to the working directory in effect when this cell runs,
# which is before the notebook changes directory — confirm it ends up where later cells expect it.
!ffmpeg -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.5/140.5 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.3/75.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata

# Setup Notebook

Import the libraries

In [2]:
import whisper
import gradio as gr 
import time
import warnings
import json
import openai
import os
from gtts import gTTS
from google.colab import drive
from keras.models import load_model
import numpy as np
import math
import librosa
import resampy
from pydub import AudioSegment
# Mount Google Drive at /gdrive so files stored there are reachable from this runtime.
drive.mount('/gdrive')

Mounted at /gdrive


Set up the file path that Colab uses to access the project files

In [3]:
# Make the Drive project folder the working directory so that relative paths used later
# in the notebook (for example "model3.h5" and "Temp.mp3") resolve inside it.
main_path = '/gdrive/MyDrive/Colab_Notebooks'
os.chdir(main_path)

In [4]:
# Emotion label -> integer code, numbered 0..6 in the order listed.
emotion_enc = dict(zip(
    ('fear', 'disgust', 'neutral', 'happy', 'sadness', 'surprise', 'angry'),
    range(7),
))
# Reverse lookup: integer code -> emotion label.
emotion_dec = dict(zip(emotion_enc.values(), emotion_enc.keys()))

In [5]:
# Suppress all Python warnings from here on so library notices do not clutter cell output.
warnings.filterwarnings("ignore")

In [6]:
# Take the OpenAI API key from the OPENAI_API_KEY environment variable rather than pasting
# it into the notebook, so the secret is never stored in (or shared with) the file.
# When the variable is unset the key stays empty, exactly like the original placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", '')

In [7]:
# Load the Whisper speech-to-text model once; it is reused for every transcription.
model = whisper.load_model("base") # "base" checkpoint; another checkpoint name can be substituted here
# Show which device the model was loaded on.
model.device

100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 121MiB/s]


device(type='cuda', index=0)

# Implementation

Define the function that generates the chatbot's response using OpenAI's GPT-4 API

In [12]:
def chatgpt_api(input_text, emotion_result):
    """Ask GPT-4 for a therapist-style reply that takes the user's emotion into account.

    Args:
        input_text: The user's utterance as text. When it is empty or None, only the
            system prompt is sent.
        emotion_result: Emotion label (a string) that is embedded in the system prompt.

    Returns:
        The message content of the first completion choice.
    """
    system_prompt = "User's emotion is " + emotion_result + " , as a Therapist utilize this for any question the user may have and help him by keeping the emotion in mind."
    print(system_prompt)
    messages = [{"role": "system", "content": system_prompt}]

    if input_text:
        messages.append({"role": "user", "content": input_text})

    # The request is issued unconditionally. It used to live inside the `if` above, so an
    # empty transcription left `chat_completion` unbound and raised UnboundLocalError.
    chat_completion = openai.ChatCompletion.create(
        model="gpt-4", messages=messages
    )

    return chat_completion.choices[0].message.content

Define the function to detect user's emotion

In [13]:
# Loaded emotion classifiers, keyed by file path, so the model file is read only once.
_emotion_model_cache = {}


def _get_emotion_model(path="model3.h5"):
    """Return the Keras model stored at `path`, loading it on first use only."""
    if path not in _emotion_model_cache:
        _emotion_model_cache[path] = load_model(path)
    return _emotion_model_cache[path]


def discover_emotion(audio):
    """Classify the emotion expressed in an audio recording.

    The recording is loaded at 44000 Hz, cut or padded to a fixed length, converted to
    40 MFCC coefficients per frame, and passed to the classifier stored in "model3.h5".

    Args:
        audio: Path of the audio file to analyse.

    Returns:
        The emotion label looked up in `emotion_dec` for the highest-scoring class.
    """
    x, sr = librosa.load(audio, res_type='kaiser_fast', sr=44000)

    length_chosen = 120378
    if x.shape[0] > length_chosen:
        # Too long: keep only the leading samples.
        new = x[:length_chosen]
    elif x.shape[0] < length_chosen:
        # Too short: pad both ends with the signal's median value. ceil(diff / 2) samples
        # are added on each side, so the result has length_chosen or length_chosen + 1 samples.
        new = np.pad(x, math.ceil((length_chosen - x.shape[0]) / 2), mode='median')
    else:
        new = x

    # Transpose to (frames, coefficients); the reshape below requires exactly 236 x 40 values.
    mfcc = librosa.feature.mfcc(y=new, sr=44000, n_mfcc=40).T
    mfcc = mfcc.reshape(1, 236, 40)

    # Reuse the cached classifier instead of re-reading the model file on every call.
    emotion_model = _get_emotion_model("model3.h5")
    p = emotion_model.predict(mfcc)
    emotion_result = emotion_dec[p.argmax()]

    return emotion_result


Define the function for voice-to-text transcription (Whisper) and text-to-voice generation (gTTS)

In [14]:
def transcribe(audio):
    """Run the whole voice pipeline for one recording.

    Detects the speaker's emotion, transcribes the speech with Whisper, asks the chatbot
    for a reply, and saves a spoken version of that reply to "Temp.mp3" with gTTS.

    Args:
        audio: Path of the recorded audio file, or None/empty when there is no recording.

    Returns:
        A four-item list: [transcribed text, detected emotion, chatbot reply, path of the
        spoken reply]. When there is no recording the list is ["", "", "", None].
    """
    language = 'en'

    # Without a recording there is nothing to analyse; return blank outputs instead of
    # failing inside the audio loaders (presumably reachable when the live interface
    # fires with a cleared microphone — confirm).
    if not audio:
        return ["", "", "", None]

    # Emotion recognition works on the original file, so run it before `audio` is rebound.
    emotion_result = discover_emotion(audio)

    # Speech-to-text: load the waveform, fit it to Whisper's input length, build the mel input.
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # No explicit model.detect_language() call: its result was never used, and decode()
    # performs language detection itself when the options do not name a language.
    options = whisper.DecodingOptions()
    result = whisper.decode(model, mel, options)
    result_text = result.text

    # Chatbot reply conditioned on the detected emotion.
    out_result = chatgpt_api(result_text, emotion_result)

    # Text-to-speech: the spoken reply replaces the previous Temp.mp3.
    audioobj = gTTS(text = out_result,
                    lang = language,
                    slow = False)
    audioobj.save("Temp.mp3")

    return [result_text, emotion_result, out_result, "Temp.mp3"]
  

Create the Gradio interface for the user to interact with the chatbot

In [None]:
# Output widgets. They are wired below in the order transcribe() returns its values:
# transcription, detected emotion, chatbot reply, spoken reply.
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Audio("Temp.mp3")
output_4 = gr.Textbox(label="User Emotion Detected")

# Microphone recording, handed to transcribe() as a file path.
mic_input = gr.inputs.Audio(source="microphone", type="filepath")

demo = gr.Interface(
    fn=transcribe,
    inputs=[mic_input],
    outputs=[output_1, output_4, output_2, output_3],
    title='EmotionGPT',
    live=True,
)
# debug=True keeps the cell running so errors and logs stay visible in the notebook.
demo.launch(debug=True)
# inputFile= "hello.mp3"
# transcribe(inputFile)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

User's emotion is neutral , as a Therapist utilize this for any question the user may have and help him by keeping the emotion in mind.
