<a href="https://colab.research.google.com/github/0de554kw/EQUALPostOstTools/blob/main/Whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Whisper FE

In [None]:
#@title Run this to setup the application

#@markdown * Install OpenAI Whisper
#@markdown * Download pretrained model
#@markdown * Used sources:

#@markdown https://github.com/magenta/ddsp/blob/main/ddsp/colab/colab_utils.py

#@markdown https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip


!pip install git+https://github.com/openai/whisper.git
!pip install pydub librosa


import whisper
from IPython import display as adsp
from IPython.display import display, clear_output
from IPython.utils import io
import ipywidgets as widgets
import base64
import io
import tempfile
import librosa
from pydub import AudioSegment

from google.colab import files
from google.colab import output

# Default target sample rate (Hz) used by the audio helpers below when the
# caller does not pass an explicit sample_rate.
DEFAULT_SAMPLE_RATE = 16000

def record_audio(seconds=3,
                 sample_rate=DEFAULT_SAMPLE_RATE,
                 normalize_db=0.1):
    """Capture microphone audio in the colab browser tab and decode it.

    Injects a small JavaScript recorder into the page, runs it for the
    requested duration, and converts the returned data URL into an array via
    audio_bytes_to_np.
    Based on: https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
    Args:
      seconds: How long to record, in seconds.
      sample_rate: Sample rate the decoded audio is resampled to.
      normalize_db: Forwarded to audio_bytes_to_np; None skips the
        normalization step.
    Returns:
      An array of the recorded audio at sample_rate.
    """
    # Browser-side recorder: defines a global `record(ms)` that resolves with
    # the recording encoded as a data URL (via FileReader.readAsDataURL).
    js_source = """
      const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
      const b2text = blob => new Promise(resolve => {
        const reader = new FileReader()
        reader.onloadend = e => resolve(e.srcElement.result)
        reader.readAsDataURL(blob)
      })
      var record = time => new Promise(async resolve => {
        stream = await navigator.mediaDevices.getUserMedia({ audio: true })
        recorder = new MediaRecorder(stream)
        chunks = []
        recorder.ondataavailable = e => chunks.push(e.data)
        recorder.start()
        await sleep(time)
        recorder.onstop = async ()=>{
          blob = new Blob(chunks)
          text = await b2text(blob)
          resolve(text)
        }
        recorder.stop()
      })
      """
    print('Starting recording for {} seconds...'.format(seconds))
    adsp.display(adsp.Javascript(js_source))
    # The JavaScript side expects the duration in milliseconds.
    duration_ms = seconds * 1000.0
    data_url = output.eval_js('record(%d)' % duration_ms)
    print('Finished recording!')
    # The part after the first comma of the data URL is the base64 payload.
    encoded_audio = data_url.split(',')[1]
    raw_audio = base64.b64decode(encoded_audio)
    return audio_bytes_to_np(raw_audio,
                             sample_rate=sample_rate,
                             normalize_db=normalize_db)


def audio_bytes_to_np(wav_data,
                      sample_rate=DEFAULT_SAMPLE_RATE,
                      normalize_db=0.1):
    """Convert audio file data (in bytes) into a numpy array.

    Decodes the bytes with pydub, removes any DC offset, optionally
    normalizes the level, then writes a temporary WAV file and loads it with
    librosa so the audio is resampled to sample_rate.
    Args:
      wav_data: The bytes of an encoded audio file.
      sample_rate: Resample the audio to this sample rate.
      normalize_db: Headroom value passed to pydub's normalize. Set to None
        to skip the normalization step.
    Returns:
      An array of the audio at sample_rate.
    """
    # Parse and normalize the audio.
    audio = AudioSegment.from_file(io.BytesIO(wav_data))
    # pydub's effects return a NEW AudioSegment rather than modifying the
    # segment in place, so the results must be reassigned to take effect.
    audio = audio.remove_dc_offset()
    if normalize_db is not None:
        audio = audio.normalize(headroom=normalize_db)
    # Save to tempfile and load with librosa. The file is removed when the
    # `with` block exits, so loading must happen inside it.
    with tempfile.NamedTemporaryFile(suffix='.wav') as temp_wav_file:
        fname = temp_wav_file.name
        audio.export(fname, format='wav')
        audio_np, _ = librosa.load(fname, sr=sample_rate)
    return audio_np


def upload_audio(sample_rate=DEFAULT_SAMPLE_RATE, normalize_db=None):
    """Prompt for an audio file upload in colab and decode the first file.

    Args:
      sample_rate: Resample the uploaded audio to this sample rate.
      normalize_db: Forwarded to audio_bytes_to_np. Set to None to skip the
        normalization step.
    Returns:
      An array of the first uploaded file's audio at sample_rate, or None if
      no file was uploaded. Any additional uploaded files are ignored.
    """
    audio_files = files.upload()
    if not audio_files:
        return None

    # Only the first uploaded file is decoded.
    first_file_data = next(iter(audio_files.values()))
    return audio_bytes_to_np(first_file_data,
                             sample_rate=sample_rate,
                             normalize_db=normalize_db)

# Load the pretrained Whisper "small" model once at setup time; it is kept as
# a module-level global and used by transcribe() in the following cell.
model = whisper.load_model("small")

In [None]:
#@title Run this cell to Record or Upload Audio
#@markdown * Either record audio from microphone or upload audio from file (.mp3 or .wav)

from pprint import pprint

record_or_upload = "Record" #@param ["Record", "Upload (.mp3 or .wav)"]
record_seconds =   10#@param {type:"number", min:1, max:10, step:1}
# Whisper treats a raw waveform array passed to transcribe() as 16 kHz audio,
# so recordings and uploads must be resampled to 16000 Hz. Audio resampled to
# any other rate (e.g. 22050 Hz) is misread as slowed-down speech.
SAMPLE_RATE = 16000

def transcribe(audio):
  """Run the loaded Whisper model on `audio` and pretty-print the transcript.

  Args:
    audio: Audio input handed unchanged to model.transcribe.
  """
  result = model.transcribe(audio)
  transcript = result['text']
  # Blank lines separate the transcript from earlier cell output.
  print("\n\n")
  pprint(transcript)

def _record_audio(b):
  """Button callback: clear the output, record from the microphone, transcribe.

  Args:
    b: The argument supplied by the widget click event (unused).
  """
  clear_output()
  transcribe(record_audio(record_seconds, sample_rate=SAMPLE_RATE))

def _upload_audio(b):
  """Clear the output, ask for an audio file upload, and transcribe it.

  Args:
    b: The argument supplied by the caller or widget click event (unused).
  """
  clear_output()
  audio = upload_audio(sample_rate=SAMPLE_RATE)
  if audio is None:
    # upload_audio returns None when no file was uploaded; report that
    # instead of handing None on to the model.
    print("No audio file was uploaded.")
    return
  transcribe(audio)

# Dispatch on the mode selected in the form field above.
if record_or_upload != "Record":
  # Upload mode: the button is created and wired up but never displayed;
  # the upload prompt is started immediately instead.
  button = widgets.Button(description="Upload Voice File")
  button.on_click(_upload_audio)
  _upload_audio("")
else:
  # Record mode: show a button that starts the recording when clicked.
  button = widgets.Button(description="Record Your Voice")
  button.on_click(_record_audio)
  display(button)