In [None]:
from IPython.display import clear_output

In [None]:
# Mount Google Drive at /content/drive; the TTS model files loaded later in this
# notebook are read from paths under that mount point.
from google.colab import drive


drive.mount('/content/drive')

In [None]:
# Install dependencies
!pip install TTS
!sudo apt-get install espeak-ng
!pip install onnx
!pip install onnxruntime

# STT
!pip install git+https://github.com/openai/whisper.git
!pip install jiwer
!pip install tabulate
!pip install pydub
!pip install transformers

# API-related dependencies
!pip install fastapi uvicorn pydantic pyngrok nest_asyncio
!pip install python-multipart

clear_output()

In [None]:
# TTS-Related Imports
import IPython
import tempfile
import subprocess
import time
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import Vits
from TTS.utils.audio.numpy_transforms import save_wav
import numpy as np


# STT-Related Imports
import io
import wave
import numpy as np
import whisper
import jiwer
import time
import pandas as pd
from tabulate import tabulate
from pydub import AudioSegment
import os
import joblib
import re
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F
from torch import nn, Tensor


# API-Related Imports
from fastapi import FastAPI,Response
from fastapi.middleware.cors import CORSMiddleware
from starlette.middleware.gzip import GZipMiddleware
from fastapi.responses import JSONResponse
from fastapi.responses import StreamingResponse,FileResponse
from fastapi import FastAPI, UploadFile, File
import shutil
from pydantic import BaseModel
from IPython.display import Audio
import uvicorn
import nest_asyncio
from pyngrok import ngrok
import base64


# Apply the nest_asyncio patch. Presumably this is what lets uvicorn.run() start
# inside the notebook's already-running event loop -- TODO confirm.
nest_asyncio.apply()

In [None]:
# Create the FastAPI application that the TTS and STT endpoints are registered on.
from starlette.middleware.gzip import GZipMiddleware
app = FastAPI()
# Accept cross-origin requests from any origin, with any method and any header.
# NOTE(review): FastAPI's CORS documentation advises against combining wildcard
# origins/methods/headers with allow_credentials=True -- confirm whether
# credentialed requests are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=['*'],
    allow_credentials=True,
    allow_methods=['*'],
    allow_headers=['*'],
)
# Gzip-compress responses; minimum_size is the response-size threshold (in bytes)
# below which compression is skipped.
app.add_middleware(GZipMiddleware, minimum_size=50)

## TTS

In [None]:
# Load TTS model
# Build the VITS model from the training run's config.json, then load the ONNX
# model file into it. Both paths point into the mounted Google Drive.
live_config=VitsConfig()
live_config.load_json("/content/drive/MyDrive/NSMQ AI Project/Technical/TTS/Prof Elsie Kauffmann/VITS model/vits-elsie/traineroutput/vits_vctk-May-24-2023_11+05PM-23a7a9a3/config.json")
live_vits = Vits.init_from_config(live_config)
live_vits.load_onnx("/content/drive/MyDrive/NSMQ AI Project/Technical/TTS/Prof Elsie Kauffmann/VITS model/vits-elsie/elsie.onnx")

# Hide whatever was printed while the model was loading.
clear_output()

In [None]:
def live_audio(text:str):
  """Synthesize `text` with the ONNX VITS model and write the result to a WAV file.

  Args:
    text: The text to convert to speech.

  Returns:
    Path of a newly created temporary ".wav" file holding the audio. The file
    is created with delete=False, so the caller is responsible for removing it.
  """
  # Tokenize the text, then add a leading axis of length 1 in front of the ids.
  token_ids = live_vits.tokenizer.text_to_ids(text, language="en")
  model_input = np.asarray(token_ids, dtype=np.int64)[None, :]

  synthesized = live_vits.inference_onnx(model_input, speaker_id=0)

  # Only the generated file name is needed; the handle is closed straight away
  # and save_wav writes to the path afterwards.
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
    wav_path = wav_file.name
  save_wav(wav=synthesized[0], path=wav_path, sample_rate=22050)
  return wav_path

In [None]:
# Request payload for the /synthesize-speech endpoint.
class LiveText(BaseModel):
  # Text to be synthesized into speech.
  text: str

In [None]:
@app.get('/synthesize-speech')
def onnx_audio(payload:LiveText):
  """Synthesize `payload.text` to speech and return it as a WAV file response.

  Args:
    payload: Request payload carrying the text to synthesize.

  Returns:
    A FileResponse with media type "audio/wav" streaming the generated file.
  """
  # Imported locally so this cell does not depend on a change to the imports cell.
  from starlette.background import BackgroundTask

  out_path=live_audio(payload.text)
  # live_audio leaves its temporary file on disk (delete=False). Remove it once
  # the response has been sent so that requests do not accumulate WAV files.
  return FileResponse(
      out_path,
      media_type="audio/wav",
      background=BackgroundTask(os.remove, out_path),
  )

## STT

In [None]:
# Load STT Model
# Run Whisper on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = whisper.load_model("medium.en", device = DEVICE)

In [None]:
def transcribe_audio(path_to_audio):
  """Transcribe an audio file with the already-loaded Whisper `model`.

  Args:
    path_to_audio: Path of the audio file to transcribe.

  Returns:
    The "text" entry of the transcription result.
  """
  return model.transcribe(whisper.load_audio(path_to_audio))["text"]

In [None]:
# Request payload for the /get-transcript endpoint.
class AudioBytes(BaseModel):
  # Base64-encoded audio content (get_transcript passes it to base64.b64decode).
  data: bytes
  # Client-supplied file name for the audio.
  filename: str

@app.get("/get-transcript")
async def get_transcript(audio: AudioBytes):
  """Transcribe base64-encoded audio sent by the client.

  The decoded bytes are written to a temporary file, transcribed with
  transcribe_audio, and the temporary file is always removed afterwards.

  Args:
    audio: Payload with the base64-encoded audio (`data`) and the client's
      file name (`filename`).

  Returns:
    {"transcript": <text>} on success, or {"error": <message>} if decoding,
    writing or transcription raised.
  """
  temp_path = None
  try:
    decoded_data = base64.b64decode(audio.data)

    # The file name is untrusted input, so it is never used as a path: only its
    # extension is kept and the file itself is created in the temp directory.
    suffix = os.path.splitext(os.path.basename(audio.filename))[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as file:
      temp_path = file.name
      file.write(decoded_data)

    transcript = transcribe_audio(temp_path)
    return {"transcript": transcript}
  except Exception as e:
    return {"error":str(e)}
  finally:
    # Clean up on the error path as well, not only after a successful transcription.
    if temp_path is not None and os.path.exists(temp_path):
      os.remove(temp_path)

In [None]:
!ngrok config add-authtoken # TODO: Replace this comment with your ngrok token (can be obtained from your ngrok account).

In [None]:
# Expose the local server through an ngrok tunnel, then serve the app.
# The tunnel and the server must use the same port, so it is defined once.
PORT = 8000

ngrok_tunnel = ngrok.connect(PORT)
print("Public URL:", ngrok_tunnel.public_url)
try:
  uvicorn.run(app, port=PORT)
finally:
  # Close the tunnel when the server stops, so that re-running this cell does
  # not leave previously opened tunnels behind.
  ngrok.disconnect(ngrok_tunnel.public_url)