In [7]:
from transformers.utils import logging
logging.set_verbosity_error()

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline, Conversation

chatbot = pipeline("text2text-generation", model="facebook/blenderbot-400M-distill")


user_message = "What are some fun activities I can do in the winter?"
conversation = Conversation(user_message)
conversation = chatbot(conversation)
print(conversation)

conversation.add_message(
    {"role": "user",
     "content": "What else do you recommend?"
    })

conversation = chatbot(conversation)
print(conversation)

2. Translation and Summarization

In [23]:
from transformers import pipeline 
import torch

translator = pipeline(task="translation",
                      model="facebook/nllb-200-distilled-600M",
                      torch_dtype=torch.bfloat16)

text = """\
My puppy is adorable, \
Your kitten is cute.
Her panda is friendly.
His llama is thoughtful. \
We all have nice pets!"""

text_translated = translator(text,
                             src_lang="eng_Latn",
                             tgt_lang="fra_Latn")

print(text_translated[0]["translation_text"])


Mon chiot est adorable, ton chaton est mignon, son panda est ami, sa lamme est attentive, nous avons tous de beaux animaux de compagnie.


3. Sentence Similarity

In [28]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
sentences1 = ['The cat sits outside',
              'A man is playing guitar',
              'The movies are awesome']
embeddings1 = model.encode(sentences1, convert_to_tensor=True)

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

print(embeddings1, embeddings2)

from sentence_transformers import util
cosine_scores = util.cos_sim(embeddings1,embeddings2)
print(cosine_scores)
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i],
                                                 sentences2[i],
                                                 cosine_scores[i][i]))

tensor([[ 0.1392,  0.0030,  0.0470,  ...,  0.0641, -0.0163,  0.0636],
        [ 0.0227, -0.0014, -0.0056,  ..., -0.0225,  0.0846, -0.0283],
        [-0.1043, -0.0628,  0.0093,  ...,  0.0020,  0.0653, -0.0150]]) tensor([[ 0.0163, -0.0700,  0.0384,  ...,  0.0447,  0.0254, -0.0023],
        [ 0.0054, -0.0920,  0.0140,  ...,  0.0167, -0.0086, -0.0424],
        [-0.0842, -0.0592, -0.0010,  ..., -0.0157,  0.0764,  0.0389]])
tensor([[ 0.2838,  0.1310, -0.0029],
        [ 0.2277, -0.0327, -0.0136],
        [-0.0124, -0.0465,  0.6571]])
The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The movies are awesome 		 The new movie is so great 		 Score: 0.6571


4. zero-shot audio classification

In [32]:
from datasets import load_dataset

dataset = load_dataset("ashraq/esc50", split="train[0:10]")


Repo card metadata block was not found. Setting CardData to empty.


In [33]:
audio_sample = dataset[5]
audio_sample

{'filename': '1-101296-B-19.wav',
 'fold': 1,
 'target': 19,
 'category': 'thunderstorm',
 'esc10': False,
 'src_file': 101296,
 'take': 'B',
 'audio': {'path': None,
  'array': array([-9.46044922e-04, -6.71386719e-04, -6.10351562e-05, ...,
         -2.13623047e-03, -2.62451172e-03, -3.17382812e-03]),
  'sampling_rate': 44100}}

In [34]:
from IPython.display import Audio as IPythonAudio
IPythonAudio(audio_sample["audio"]["array"],
             rate=audio_sample["audio"]["sampling_rate"])

In [35]:
from transformers import pipeline
zero_shot_classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused"
)

In [36]:
zero_shot_classifier.feature_extractor.sampling_rate, audio_sample["audio"]["sampling_rate"]

(48000, 44100)

In [38]:
from datasets import Audio

dataset = dataset.cast_column(
    "audio",
    Audio(sampling_rate=48_000)
)
audio_sample = dataset[5]
audio_sample

{'filename': '1-101296-B-19.wav',
 'fold': 1,
 'target': 19,
 'category': 'thunderstorm',
 'esc10': False,
 'src_file': 101296,
 'take': 'B',
 'audio': {'path': None,
  'array': array([-8.93350865e-04, -7.40112155e-04, -8.92440366e-05, ...,
         -2.95405276e-03, -2.92613916e-03,  0.00000000e+00]),
  'sampling_rate': 48000}}

In [72]:
candidate_labels = [
    "sound of dog",
    "sound of vacuum cleaner",
    "sound of child crying",
    "sound of bird singing",
    "sound of an airplane"
]
pred_result = []
true_result = []
for i in range(10):
    audio_sample = dataset[i]
    true_label = audio_sample["category"]
    out = zero_shot_classifier(audio_sample["audio"]["array"],
                        candidate_labels = candidate_labels)
    pred_result.append(out)
    true_result.append(true_label)
compare = zip(pred_result, true_result)

In [83]:
pred_result, true_result = next(compare)
pred_result, true_result

StopIteration: 

5. Automatic speech recognition

In [1]:
from datasets import load_dataset

dataset = load_dataset("librispeech_asr",
                        split="train.clean.100",
                        streaming=True)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [2]:
example = next(iter(dataset))

In [90]:
dataset_head = dataset.take(5)
list(dataset_head)

[{'file': '374-180298-0000.flac',
  'audio': {'path': '374-180298-0000.flac',
   'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
          -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
   'sampling_rate': 16000},
  'text': 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED',
  'speaker_id': 374,
  'chapter_id': 180298,
  'id': '374-180298-0000'},
 {'file': '374-180298-0001.flac',
  'audio': {'path': '374-180298-0001.flac',
   'array': array([-9.15527344e-05, -1.52587891e-04, -1.52587891e-04, ...,
          -2.13623047e-04, -1.83105469e-04, -2.74658203e-04]),
   'sampling_rate': 16000},
  'text': "MARGUERITE TO BE UNABLE TO LIVE APART FROM ME IT WAS THE DAY AFTER THE EVENING WHEN SHE CAME TO SEE ME THAT I SENT HER MANON LESCAUT FROM THAT TIME SEEING THAT I COULD NOT CHANGE MY MISTRESS'S LIFE I CHANGED MY OWN",
  'speaker_id': 374

In [3]:
from IPython.display import Audio as IPythonAudio
IPythonAudio(example["audio"]["array"],
             rate=example["audio"]["sampling_rate"])

In [4]:
from transformers import pipeline

asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
asr.feature_extractor.sampling_rate, example["audio"]["sampling_rate"]

(16000, 16000)

In [6]:
asr(example["audio"]["array"]), example["text"]

({'text': ' Chapter 16 I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I too agree to whatever Marguerite wished.'},
 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED')

In [10]:
import gradio as gr
demo = gr.Blocks()

def transcribe_speech(filepath):
    if filepath is None:
        gr.Warning("No audio found")
        return ""
    output = asr(filepath)
    return output[text]


In [11]:
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    title="Automatic Speech Recognition",
    description="Transcribe speech from your microphone",
    allow_flagging="never"
)
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=3),
    title="Automatic Speech Recognition",
    description="Transcribe speech from your microphone",
    allow_flagging="never"
)

In [14]:
demo.close()
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["transcribe microphone", "transcribe Audio file"],
    )

demo.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Traceback (most recent call last):
  File "c:\Users\AttahiruJibril\.conda\envs\ML-AI\Lib\site-packages\transformers\pipelines\audio_utils.py", line 34, in ffmpeg_read
    with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\AttahiruJibril\.conda\envs\ML-AI\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\AttahiruJibril\.conda\envs\ML-AI\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [WinError 2] The system cannot find the file specified

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\AttahiruJibril\.conda\envs\ML-AI\Lib\site-packages\gradi

Keyboard interruption in main thread... closing server.




In [15]:
!choco install ffmpeg