In [1]:
from transformers import pipeline
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

classifier = pipeline(
    "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
)

Device set to use cpu


In [3]:
classifier.model.config.id2label

{0: 'backward',
 1: 'follow',
 2: 'five',
 3: 'bed',
 4: 'zero',
 5: 'on',
 6: 'learn',
 7: 'two',
 8: 'house',
 9: 'tree',
 10: 'dog',
 11: 'stop',
 12: 'seven',
 13: 'eight',
 14: 'down',
 15: 'six',
 16: 'forward',
 17: 'cat',
 18: 'right',
 19: 'visual',
 20: 'four',
 21: 'wow',
 22: 'no',
 23: 'nine',
 24: 'off',
 25: 'three',
 26: 'left',
 27: 'marvin',
 28: 'yes',
 29: 'up',
 30: 'sheila',
 31: 'happy',
 32: 'bird',
 33: 'go',
 34: 'one'}

In [4]:
classifier.model.config.id2label[27]

'marvin'

* To define a fn that is constantly listening to our device's microphone we will use ffmpeg_microphone_live.
* This fn forwards small chunks of audio of specified chunk length for audio classification.
* To ensure that we get smooth boundaries across chunks of audio, we run a sliding window across our audio with stride chunk_length_s / 6.
* To avoid waiting for the chunk to be recorded before starting inference, we define minimal temporary audio input length stream_chunk_s that is forwarded to the model before chunk_length_s time is reached
* The function ffmpeg_microphone_live returns a generator object, yielding a sequence of audio chunks that can each be passed to the classification model to make a prediction.
* We can pass this generator directly to the pipeline, which in turn returns a sequence of output predictions, one for each chunk of audio input.
* We can inspect the class label probabilities for each audio chunk, and stop our wake word detection loop when we detect that the wake word has been spoken.
* We declare a threshold, for if the prob is above that threshold, we declare that the wake up call has been spoken. 

In [5]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

In [6]:
def launch_fn(
    wake_word="marvin",
    prob_threshold=0.5,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    debug=False,
):
    if wake_word not in classifier.model.config.label2id.keys():
        raise ValueError(
            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
        )

    sampling_rate = classifier.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening for wake word...")
    for prediction in classifier(mic):
        prediction = prediction[0]
        if debug:
            print(prediction)
        if prediction["label"] == wake_word:
            if prediction["score"] > prob_threshold:
                return True

In [7]:
# launch_fn(debug=True)

In [8]:
transcriber = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base.en", device=device
)

Device set to use cpu


In [9]:
import sys

In [10]:
# Terminating the system after 5 sec
def transcribe(chunk_length_s=5.0, stream_chunk_s=1.0):
    sampling_rate = transcriber.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Start speaking...")
    for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
        sys.stdout.write("\033[K")
        print(item["text"], end="\r")
        if not item["partial"][0]:
            break

    return item["text"]

In [11]:
from huggingface_hub import HfFolder
import requests

In [None]:
!pip install ipywidgets

In [15]:
def query(text, model_id="tiiuae/falcon-7b-instruct"):
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
    payload = {"inputs": text}

    print(f"Querying...: {text}")
    response = requests.post(api_url, headers=headers, json=payload)
    return response.json()[0]["generated_text"][len(text) + 1 :]

In [16]:
query("What does Hugging Face do?")

Querying...: What does Hugging Face do?


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [17]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

In [18]:
from datasets import load_dataset

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

In [19]:
def synthesise(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder
    )
    return speech.cpu()

In [20]:
from IPython.display import Audio

audio = synthesise(
    "Hugging Face is a company that provides natural language processing and machine learning tools for developers."
)

Audio(audio, rate=16000)

In [None]:
launch_fn()
transcription = transcribe()
response = query(transcription)
audio = synthesise(response)

Audio(audio, rate=16000, autoplay=True)

In [30]:
import pandas as pd
df = pd.read_csv(r"C:\Users\aarus\Downloads\metadata (1).csv", delimiter = '|')
df.head()

Unnamed: 0,metacsv: path,transcription
0,/content/drive/MyDrive/dataset/chunks_12sec/ch...,हेलो जी नमस्ते एंड स्वागत है आपका बनारस इस वीड...
1,/content/drive/MyDrive/dataset/chunks_12sec/ch...,बनारस आ चुके है और सबसे पहली चदा जो हम खाने जा...
2,/content/drive/MyDrive/dataset/chunks_12sec/ch...,बंडार का और मैं बहुत ज्यादा मुझे से क्रेविंग भ...
3,/content/drive/MyDrive/dataset/chunks_12sec/ch...,चर्ना पूर्णे का अच्छी उसे बिस्कियर गए बट इस वह...
4,/content/drive/MyDrive/dataset/chunks_12sec/ch...,अभी जोनी रॉप्ते भी मुलवाए हमने लेकिन मी ट्राइब...


In [31]:
df['transcription'][0]

'हेलो जी नमस्ते एंड स्वागत है आपका बनारस इस वीडियो में मैं आपको लेकर चलने वाली हूं बनारस की कलियों में ताशी का स्वाच्चक लेकर चलिए इस वीडियो को शुरू कर दीजिए अब'