In [34]:
!pip install datasets
from datasets import load_dataset, Audio

!pip install transformers
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer

!pip install numpy
import numpy as np

!pip install gradio
import gradio as gr



In [35]:
# Stream the FLEURS validation split ("all" configuration) instead of
# downloading it up front, then pull just the first example to experiment with.
fleurs = load_dataset(
    "google/fleurs",
    "all",
    split="validation",
    streaming=True,
)
sample = next(iter(fleurs))

In [36]:
# Show the first streamed validation example in full: metadata fields plus the
# nested "audio" dict (path, array, sampling_rate) visible in the recorded output.
sample

{'id': 1326,
 'num_samples': 218880,
 'path': None,
 'audio': {'path': 'dev/1015649832594091373.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ...,  0.0002867 ,
         -0.00020713, -0.0001356 ]),
  'sampling_rate': 16000},
 'transcription': 'in 1989 het hy vir brooks en groening gehelp om the simpsons te skep en hy was verantwoordelik om die eerste skrywerspan vir die program te huur',
 'raw_transcription': 'In 1989 het hy vir Brooks en Groening gehelp om The Simpsons te skep en hy was verantwoordelik om die eerste skrywerspan vir die program te huur.',
 'gender': 0,
 'lang_id': 0,
 'language': 'Afrikaans',
 'lang_group_id': 3}

In [37]:
# The waveform samples of the example; this array is what gets passed to the
# classifiers in the cells that follow.
sample["audio"]["array"]

array([ 0.        ,  0.        ,  0.        , ...,  0.0002867 ,
       -0.00020713, -0.0001356 ])

We use a Whisper model that has been fine-tuned on the FLEURS dataset for language identification.

In [38]:
# Checkpoint used for spoken-language identification.
language_model = "sanchit-gandhi/whisper-medium-fleurs-lang-id"

# Audio-classification pipeline wrapping that checkpoint; used by classify_lang.
classifier = pipeline(task="audio-classification", model=language_model)

In [39]:
# Smoke-test the language-ID pipeline on the sample waveform; the result is a
# list of {"score", "label"} dicts (see the recorded output).
# NOTE(review): only the bare array is passed, without its sampling rate —
# presumably the pipeline assumes the model's expected rate; confirm it matches
# the sample's 16000 Hz.
classifier(sample["audio"]["array"])

[{'score': 0.9999330043792725, 'label': 'Afrikaans'},
 {'score': 7.093003659974784e-06, 'label': 'Northern-Sotho'},
 {'score': 4.269145392754581e-06, 'label': 'Icelandic'},
 {'score': 3.266111207267386e-06, 'label': 'Danish'},
 {'score': 3.258066044509178e-06, 'label': 'Cantonese Chinese'}]

In [40]:
# Checkpoint used for music-genre classification.
genre_model = "sanchit-gandhi/distilhubert-finetuned-gtzan"

# Audio-classification pipeline wrapping that checkpoint; used by classify_audio.
pipe = pipeline(task="audio-classification", model=genre_model)

Some weights of the model checkpoint at sanchit-gandhi/distilhubert-finetuned-gtzan were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at sanchit-gandhi/distilhubert-finetuned-gtzan and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.wei

In [41]:
def classify_lang(filepath):
    """Run the language-ID pipeline and return its predictions as a dict.

    The argument is forwarded unchanged to ``classifier``. Despite the
    parameter name, the example call in this notebook passes a NumPy array of
    audio samples taken from the dataset. The function is also wired up as a
    Gradio callback.

    Args:
        filepath: Audio input handed straight to ``classifier``.

    Returns:
        dict: Maps each predicted label to its score, in the order the
        pipeline returned the predictions.
    """
    return {
        prediction["label"]: prediction["score"]
        for prediction in classifier(filepath)
    }

In [42]:
def classify_audio(filepath):
    """Run the genre pipeline and return its predictions as a dict.

    The argument is forwarded unchanged to ``pipe``; the example call in this
    notebook passes a NumPy array of audio samples.

    Args:
        filepath: Audio input handed straight to ``pipe``.

    Returns:
        dict: Maps each predicted label to its score, in the order the
        pipeline returned the predictions.
    """
    return {entry["label"]: entry["score"] for entry in pipe(filepath)}

In [43]:
# Example: call the language helper directly on the sample waveform; the result
# is the pipeline's predictions reshaped into a {label: score} dict.
classify_lang(sample["audio"]["array"])

{'Afrikaans': 0.9999330043792725,
 'Northern-Sotho': 7.093003659974784e-06,
 'Icelandic': 4.269145392754581e-06,
 'Danish': 3.266111207267386e-06,
 'Cantonese Chinese': 3.258066044509178e-06}

In [44]:
# Example: call the genre helper directly on the same sample waveform.
# NOTE(review): this sample is a spoken FLEURS sentence rather than music, so
# the genre scores here look like a smoke test only — confirm with a music clip.
classify_audio(sample["audio"]["array"])

{'hiphop': 0.6191902160644531,
 'blues': 0.13513532280921936,
 'reggae': 0.07579993456602097,
 'jazz': 0.03493537753820419,
 'metal': 0.03486782684922218}

In [None]:
# Gradio demo: one input feeds both classifiers and each result is shown in its
# own output component.
with gr.Blocks() as intf:
    # The text typed here is passed as-is to classify_lang / classify_audio.
    # NOTE(review): presumably a path or URL to an audio file that the
    # pipelines can open — confirm against the transformers pipeline docs.
    name = gr.Textbox(label="File")

    # Both callbacks return a {label: score} dict. gr.Label is the component
    # designed to render that shape (top class plus confidences), whereas a
    # Textbox would only show the dict's string form. Also fixes the
    # "Langauge" typo in the first label.
    output1 = gr.Label(label="Language")
    output2 = gr.Label(label="Genre")

    submit_btn = gr.Button("Submit")

    # Two independent listeners on the same button: one per classifier, each
    # exposed under its own API endpoint name.
    submit_btn.click(fn=classify_lang, inputs=name, outputs=output1, api_name="language_classification")
    submit_btn.click(fn=classify_audio, inputs=name, outputs=output2, api_name="genre_classification")

# debug=True keeps this cell running so errors and logs stay visible in the
# notebook; set debug=False to return control after launching.
intf.launch(debug = True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>