<a href="https://colab.research.google.com/github/ailab-nda/ML/blob/main/VoiceChatGPT_AP_VOICEBOX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ChatGPT API + Whisper + gTTS

## ライブラリのインストール

In [None]:
# Clone VOICEVOX core at ref 0.13.2, then make the checkout the working directory
!git clone --branch 0.13.2 https://github.com/VOICEVOX/voicevox_core
%cd voicevox_core

In [None]:
# Environment setup.
# All paths are relative, so this cell relies on the working directory
# selected by the preceding `%cd` cell.

# Download ONNX Runtime (GPU build, v1.10.0) and unpack it into ./onnxruntime
# `mkdir -p` keeps the cell re-runnable when the directory already exists.
!mkdir -p onnxruntime
!wget https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-gpu-1.10.0.tgz
!tar xf onnxruntime-linux-x64-gpu-1.10.0.tgz -C onnxruntime --strip=1
!rm onnxruntime-linux-x64-gpu-1.10.0.tgz
# Download the VOICEVOX core library (GPU build, 0.13.2) and unpack it flat into ./release
!mkdir -p release
!wget https://github.com/VOICEVOX/voicevox_core/releases/download/0.13.2/voicevox_core-linux-x64-gpu-0.13.2.zip
# -o overwrites existing files without prompting, so a re-run does not stall
!unzip -qj -o voicevox_core-linux-x64-gpu-0.13.2.zip -d release
!rm voicevox_core-linux-x64-gpu-0.13.2.zip
# Place both sets of shared libraries under core/lib
!mkdir -p core/lib
!cp onnxruntime/lib/* core/lib
!cp release/* core/lib
# Download the Open JTalk dictionary (fetched over HTTPS rather than plain HTTP)
!wget https://downloads.sourceforge.net/open-jtalk/open_jtalk_dic_utf_8-1.11.tar.gz
!tar xf open_jtalk_dic_utf_8-1.11.tar.gz
!rm open_jtalk_dic_utf_8-1.11.tar.gz

In [None]:
# Install the Python module from the current checkout.
# `%pip` (instead of `!pip`) runs pip for the environment of the running kernel.

%pip install -qqUr requirements.txt
%pip install -qqU .

In [None]:
# openai is pinned to 0.28 because the cells below use the pre-1.0 interface
# (openai.Audio.transcribe / openai.ChatCompletion.create).
# `%pip` (instead of `!pip`) runs pip for the environment of the running kernel.
%pip install openai==0.28 gTTs pydub

## ライブラリのインポート

In [None]:
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
from gtts import gTTS
import IPython.display
from pydub import AudioSegment
import openai
# OpenAI API key. It can be typed into the Colab form field below; when the
# field is left empty the key is requested interactively instead, so the
# secret does not have to be stored in the notebook.
from getpass import getpass
openai.api_key = ""#@param{type:"string"}
if not openai.api_key:
    # getpass does not echo the input into the cell output
    openai.api_key = getpass("OpenAI API key: ")
# Commented-out alternative setup using the `OpenAI` client class; it is not
# used while the notebook installs openai==0.28.
#from openai import OpenAI
#import os
#os.environ['OPENAI_API_KEY'] = ""#@param{type:"string"}
#client = OpenAI()

## 音声認識の準備

In [None]:
# JavaScript injected into the cell output to record audio in the browser.
# It defines a global `record(time)` that records from the microphone for
# `time` milliseconds and resolves to the recording encoded as a data URL
# ("data:...;base64,<payload>").
#
# Notes on the implementation:
#   * all working variables are declared locally instead of leaking as
#     implicit globals;
#   * `record` is an async function, so a failure of getUserMedia (for example
#     a denied microphone permission) rejects the returned promise instead of
#     leaving it pending forever;
#   * the media tracks are stopped after recording so the microphone is
#     released between questions.
RECORD = """
  const sleep = time => new Promise(resolve => setTimeout(resolve, time))
  const b2text = blob => new Promise(resolve => {
    const reader = new FileReader()
    reader.onloadend = () => resolve(reader.result)
    reader.readAsDataURL(blob)
  })
  var record = async time => {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
    const recorder = new MediaRecorder(stream)
    const chunks = []
    recorder.ondataavailable = e => chunks.push(e.data)
    const stopped = new Promise(resolve => { recorder.onstop = resolve })
    recorder.start()
    await sleep(time)
    recorder.stop()
    await stopped
    stream.getTracks().forEach(track => track.stop())
    return b2text(new Blob(chunks))
  }
"""
def speech_to_text(model='whisper-1', language='ja', second=5):
  """Record audio in the browser and return its transcription.

  Args:
    model: Model name passed to ``openai.Audio.transcribe``.
    language: Language code passed to ``openai.Audio.transcribe``.
    second: Recording length in seconds (converted to milliseconds for the
      JavaScript ``record`` function).

  Returns:
    The ``'text'`` entry of the transcription result.

  Side effects:
    Writes ``tmp.wav`` and ``tmp.mp3`` in the current working directory.
  """
  raw_path = 'tmp.wav'
  mp3_path = 'tmp.mp3'

  # Inject the recorder script, then run it and wait for the data URL
  display(Javascript(RECORD))
  data_url = output.eval_js('record(%d)' % (second * 1000))

  # The payload is the base64 text after the first comma of the data URL
  audio_bytes = b64decode(data_url.split(',')[1])
  with open(raw_path, 'wb+') as raw_file:
    raw_file.write(audio_bytes)

  # Re-encode to MP3 before uploading; the original author notes that this
  # conversion is needed in the lecture-room environment.
  AudioSegment.from_file(raw_path).export(mp3_path, format="mp3")

  with open(mp3_path, "rb") as mp3_file:
    transcription = openai.Audio.transcribe(
        model=model,
        file=mp3_file,
        language=language
    )
  return transcription['text']

## ChatGPT と会話する

### 人格、役割などの設定

In [None]:
# System prompts that set the assistant's persona and answer style.
# Every entry is sent with the 'system' role ahead of the user's question.
system_messages = [
    {'role': 'system', 'content': prompt}
    for prompt in (
        'あなたは有能なアシスタントです。',
        '質問には1行で返答してください。',
    )
]

### 質疑応答の例（いろいろ試してください）

In [None]:
# Record one spoken question and show its transcription
print("質問して下さい（５秒）")
question = speech_to_text()
print("質問：", question)
user_messages = [{"role": "user", "content": question}]

# Send the system prompts plus the question and stream the reply,
# printing each fragment as it arrives while collecting the full text
print("\n回答：", end="")
messages = system_messages + user_messages
result = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=messages,
    stream=True,
)
response_text = ""
for chunk in result:
    delta = chunk["choices"][0].get("delta", {})
    content = delta.get("content")
    if content is None:
        # Chunks without a content fragment contribute nothing to the reply
        continue
    print(content, end='', flush=True)
    response_text += content
print("")

# Synthesize the reply with gTTS and play it back automatically
message = response_text
tts = gTTS(text=message, lang='ja', slow=False)
tts.save('gTTS_test.mp3')
IPython.display.display(IPython.display.Audio('gTTS_test.mp3', autoplay=True))

## 繰り返し

In [None]:
# Multi-turn voice conversation.
#
# Spoken commands:
#   * a question containing 'メッセージ' prints the current history and is not
#     sent to the model;
#   * a question containing '終了' is answered and then ends the loop.

# Start from a copy: appending conversation turns must not modify
# system_messages, which other cells reuse.
messages = list(system_messages)
max_messages = 10  # maximum number of user/assistant exchanges kept in the history
history_start = len(system_messages)  # index of the oldest non-system message

while True:
    print("喋る準備ができたらエンターキーを押してください。")
    input()
    print("質問して下さい（５秒）")
    question = speech_to_text()
    print("質問：", question)

    # Debug command: show the history without recording the command itself,
    # otherwise the history would contain a user turn with no assistant reply.
    if 'メッセージ' in question:
        print('messages:', messages)
        continue

    messages.append({"role": "user", "content": question})

    # Stream the reply, printing fragments as they arrive
    print("\n回答：", end="")
    result = openai.ChatCompletion.create(
        model = "gpt-3.5-turbo",
        messages = messages,
        stream = True,
    )
    response_text = ""
    for chunk in result:
        content = chunk["choices"][0].get("delta", {}).get("content")
        if content is not None:
            print(content, end='', flush=True)
            response_text += content
    print("")
    messages.append({"role": "assistant", "content": response_text})

    # gTTS raises on empty text, so only synthesize when there is something to say
    if response_text.strip():
        tts = gTTS(response_text, lang='ja')
        tts.save('gTTS_test.mp3')
        IPython.display.display(IPython.display.Audio('gTTS_test.mp3', autoplay=True))

    # Drop the oldest user/assistant pair(s) once the limit is exceeded.
    # Deletion starts after the system prompts so that none of them is removed.
    while len(messages) > history_start + max_messages * 2:
        del messages[history_start:history_start + 2]

    if '終了' in question:
        break