In [6]:
from transformers import pipeline
import torch

# Run every model on the first CUDA GPU when PyTorch reports one, else on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Audio classification pipeline; launch_fn uses its labels for wake word spotting.
classifier = pipeline(
    task="audio-classification",
    model="MIT/ast-finetuned-speech-commands-v2",
    device=device,
)

In [7]:
import subprocess
import numpy as np
import sys

def launch_fn(wake_word="marvin", prob_threshold=0.5, chunk_length_s=1,
              debug=False, amplitdude_threshold=2000):
    """Listen on the default microphone until the wake word is recognised.

    Audio is captured by an ``ffmpeg`` subprocess reading the ``avfoundation``
    device ``:0`` (an Apple/macOS capture backend) as 16 kHz mono signed
    16-bit little-endian PCM. Each chunk is passed to the module-level
    ``classifier`` pipeline and only its top prediction is inspected.

    Args:
        wake_word: Class label that ends the wait successfully. Must be a key
            of ``classifier.model.config.label2id``.
        prob_threshold: A prediction only counts when its score is strictly
            greater than this value.
        chunk_length_s: Multiplier for the size of each read. One unit is
            40960 bytes, i.e. 20480 samples or 1.28 s at 16 kHz, so the value
            is only approximately a duration in seconds. May be fractional.
        debug: When true, print the top prediction of every classified chunk.
        amplitdude_threshold: Chunks whose absolute peak sample does not
            exceed this value are treated as silence and skipped without
            running the classifier. (The misspelt name is kept so existing
            keyword calls keep working.)

    Returns:
        ``True`` if the wake word was detected. ``False`` if listening ended
        for any other reason: the "stop" label was detected, the user
        interrupted the kernel, or ffmpeg stopped producing audio.

    Raises:
        ValueError: If ``wake_word`` is not a label known to the classifier,
            or ``chunk_length_s`` does not give a positive read size.
    """
    if wake_word not in classifier.model.config.label2id.keys():
        raise ValueError(
            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
        )

    bytes_per_sample = 2  # s16le: two bytes per mono sample
    # int() lets fractional multipliers work; read() rejects float sizes.
    chunk_bytes = int(40960 * chunk_length_s)
    chunk_bytes -= chunk_bytes % bytes_per_sample
    if chunk_bytes <= 0:
        raise ValueError("chunk_length_s must give a positive chunk size")

    # ffmpeg must be installed and on PATH for this function
    ffmpeg_command = [
        'ffmpeg',
        '-f', 'avfoundation',
        '-i', ':0',  # default microphone
        '-ac', '1',  # audio channels (1 for mono)
        '-ar', '16000',  # sample rate
        '-f', 's16le',  # format (signed 16-bit little endian)
        '-acodec', 'pcm_s16le',  # audio codec (PCM signed 16-bit little endian)
        '-'
    ]

    process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    try:
        print('listening...')
        while True:
            data = process.stdout.read(chunk_bytes)

            # A short final read may end in half a sample; drop the dangling
            # byte so frombuffer does not raise. Nothing usable means EOF.
            usable = len(data) - len(data) % bytes_per_sample
            if usable == 0:
                break

            # '<i2' matches the little-endian stream regardless of host order.
            audio_data = np.frombuffer(data[:usable], dtype='<i2')

            # Silence gate on the absolute peak so negative swings count too.
            # Widen to int32 first: abs(-32768) overflows in int16.
            peak = np.abs(audio_data.astype(np.int32)).max()
            if peak <= amplitdude_threshold:
                continue

            input_data = audio_data.astype(np.float32)
            spread = input_data.std()
            if spread == 0:
                # Constant signal: nothing to classify, and normalising it
                # would divide by zero.
                continue

            # normalize the data
            input_data = (input_data - input_data.mean()) / (spread * 2)
            prediction = classifier(input_data)[0]

            if debug:
                print('debug',prediction)

            if prediction["score"] > prob_threshold:
                if prediction["label"] == wake_word:
                    return True
                if prediction["label"] == 'stop':
                    # Spoken "stop" ends listening exactly like Ctrl-C does.
                    print("stoped by user")
                    return False

        return False
    except KeyboardInterrupt:
        print("stoped by user")
        return False
    finally:
        # Single cleanup point: stop ffmpeg, release the pipe, and reap the
        # child so no zombie process is left behind.
        process.kill()
        process.stdout.close()
        process.wait()
    

In [3]:
# Listen until the default wake word ("marvin") or the "stop" label is recognised, or the kernel is interrupted.
launch_fn(debug=False)

listening...
stoped by user


In [8]:
# Speech recognition pipeline used by transcribe(); runs on the device chosen above.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
    device=device,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
def transcribe(chunk_length_s=5, amplitdude_threshold=2000):
    """Record from the default microphone and transcribe the first loud chunk.

    Audio is captured by an ``ffmpeg`` subprocess reading the ``avfoundation``
    device ``:0`` (an Apple/macOS capture backend) as 16 kHz mono signed
    16-bit little-endian PCM. Chunks are read until one exceeds the amplitude
    threshold; that chunk is normalised and passed to the module-level
    ``transcriber`` pipeline with ``max_new_tokens`` set to 128.

    Args:
        chunk_length_s: Multiplier for the size of each read. One unit is
            40960 bytes, i.e. 20480 samples or 1.28 s at 16 kHz, so the value
            is only approximately a duration in seconds. May be fractional.
        amplitdude_threshold: Chunks whose absolute peak sample does not
            exceed this value are treated as silence and skipped. (The
            misspelt name is kept so existing keyword calls keep working.)

    Returns:
        The ``'text'`` field of the transcriber output for the first chunk
        that passes the silence gate, or ``None`` if the stream ended or the
        user interrupted the kernel before such a chunk was seen.

    Raises:
        ValueError: If ``chunk_length_s`` does not give a positive read size.
    """
    bytes_per_sample = 2  # s16le: two bytes per mono sample
    # int() lets fractional multipliers work; read() rejects float sizes.
    chunk_bytes = int(40960 * chunk_length_s)
    chunk_bytes -= chunk_bytes % bytes_per_sample
    if chunk_bytes <= 0:
        raise ValueError("chunk_length_s must give a positive chunk size")

    # ffmpeg must be installed and on PATH for this function
    ffmpeg_command = [
        'ffmpeg',
        '-f', 'avfoundation',
        '-i', ':0',  # default microphone
        '-ac', '1',  # audio channels (1 for mono)
        '-ar', '16000',  # sample rate
        '-f', 's16le',  # format (signed 16-bit little endian)
        '-acodec', 'pcm_s16le',  # audio codec (PCM signed 16-bit little endian)
        '-'
    ]

    process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    try:
        print('start speaking...')
        while True:
            data = process.stdout.read(chunk_bytes)

            # A short final read may end in half a sample; drop the dangling
            # byte so frombuffer does not raise. Nothing usable means EOF.
            usable = len(data) - len(data) % bytes_per_sample
            if usable == 0:
                break

            # '<i2' matches the little-endian stream regardless of host order.
            audio_data = np.frombuffer(data[:usable], dtype='<i2')

            # Silence gate on the absolute peak so negative swings count too.
            # Widen to int32 first: abs(-32768) overflows in int16.
            peak = np.abs(audio_data.astype(np.int32)).max()
            if peak <= amplitdude_threshold:
                continue

            input_data = audio_data.astype(np.float32)
            spread = input_data.std()
            if spread == 0:
                # Constant signal: nothing to transcribe, and normalising it
                # would divide by zero.
                continue

            # normalize the data
            input_data = (input_data - input_data.mean()) / (spread * 2)
            item = transcriber(input_data, generate_kwargs={"max_new_tokens": 128})

            return item['text']

    except KeyboardInterrupt:
        print("stoped by user")
    finally:
        # Stop ffmpeg, release the pipe, and reap the child so no zombie
        # process is left behind.
        process.kill()
        process.stdout.close()
        process.wait()
    

In [19]:
# Capture speech from the microphone and print the text returned by transcribe().
print(transcribe())

start speaking...
 How are you?


In [10]:
from huggingface_hub import HfFolder
import requests

In [11]:
def query(text, model_id="allenai/tk-instruct-3b-def", timeout=120):
    """Send ``text`` to a hosted Hugging Face model and return its reply.

    Args:
        text: Prompt placed in the ``inputs`` field of the JSON payload.
        model_id: Hub model identifier appended to the inference API URL.
        timeout: Seconds to wait for the HTTP request before ``requests``
            gives up, so a stalled connection cannot hang the notebook.

    Returns:
        The ``generated_text`` field of the first item in the JSON response.

    Raises:
        RuntimeError: If the request fails or the response is not a non-empty
            list whose first item carries ``generated_text`` (for example an
            ``{"error": ...}`` payload).
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"

    # Only send the header when a token is stored; "Bearer None" is never valid.
    token = HfFolder().get_token()
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    payload = {"inputs": text}
    print(f"Querying...: {text}")
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    try:
        body = response.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page); reported below.
        body = None

    has_text = (
        isinstance(body, list)
        and len(body) > 0
        and isinstance(body[0], dict)
        and "generated_text" in body[0]
    )
    if not response.ok or not has_text:
        detail = body.get("error") if isinstance(body, dict) else None
        raise RuntimeError(
            f"Inference API request for {model_id} failed "
            f"(HTTP {response.status_code}): {detail or response.text}"
        )

    return body[0]["generated_text"]

In [9]:
# Smoke test of query() with its default model; prints the generated text.
print(query('how are you?'))

Querying...: how are you?
I am doing great. I am still working on my book. I am also working on 


In [10]:
# Second query() example; the returned string is shown as the cell result.
query("What does Hugging Face do?")

Querying...: What does Hugging Face do?


'Hugging Face is a non-profit organization that provides a wide range of services to'

In [4]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

In [12]:
# SpeechT5 processor, text-to-speech model and HiFi-GAN vocoder used by synthesise().
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Load each network first, then move it to the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

Downloading config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

In [13]:
from datasets import load_dataset

In [14]:
# Validation split of the x-vector dataset that supplies the speaker embedding.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)

# Take the x-vector stored at row 7306 and add a leading dimension of size 1.
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    dim=0,
)

Downloading builder script:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.9M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [15]:
def synthesise(text):
    """Turn ``text`` into speech using the module-level SpeechT5 objects.

    The text is run through ``processor``; the resulting ``input_ids`` and the
    module-level ``speaker_embeddings`` are moved to ``device`` and handed to
    ``model.generate_speech`` together with ``vocoder``.

    Args:
        text: The text to be spoken.

    Returns:
        The output of ``generate_speech`` after calling ``.cpu()`` on it.
    """
    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
    waveform = model.generate_speech(
        token_ids.to(device),
        speaker_embeddings.to(device),
        vocoder=vocoder,
    )
    return waveform.cpu()

In [16]:
from IPython.display import Audio

# Try the text-to-speech helper on a fixed sentence.
audio = synthesise("Hugging Face is a company that provides natural language processing and machine learning tools for developers.")

# Last expression of the cell, so the notebook renders an audio player for it.
Audio(data=audio, rate=16000)

In [20]:
# End-to-end assistant: wake word -> speech-to-text -> language model -> text-to-speech.
# launch_fn() is truthy only when the wake word was heard; a spoken "stop",
# a kernel interrupt, or a dead audio stream must not start the rest of the chain.
audio = None
if launch_fn():
    transcription = transcribe()
    # transcribe() returns None when it was interrupted or the stream ended,
    # and an empty transcription is not worth sending to the model either.
    if transcription:
        response = query(transcription)
        audio = synthesise(response)

# Last expression of the cell: renders an autoplaying player when speech was
# synthesised, and nothing otherwise.
Audio(audio, rate=16000, autoplay=True) if audio is not None else None

listening...
start speaking...
Querying...:  How are you?
