In [7]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import librosa
import librosa.display

from IPython.display import Audio as ipdAudio

from datasets import load_dataset, Audio

from transformers import AutoFeatureExtractor, pipeline


In [10]:
gtzan = load_dataset("marsyas/gtzan", split='train')
GENRES = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
SR = 16_000
gtzan = gtzan.cast_column("audio", Audio(sampling_rate=SR))
max_duration = 30.0

model_id = "sanchit-gandhi/whisper-medium-fleurs-lang-id"
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id, do_normalize=True)

def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays,
        sampling_rate=SR,
        max_length=int(SR * max_duration),
        truncation=True,
    )
    return inputs

gtzan_encoded = gtzan.map(
    preprocess_function,
    remove_columns=["audio", "file"],
    batched=True,
    batch_size=100,
    num_proc=1,
)
gtzan_encoded = gtzan_encoded.rename_column("genre", "label")
gtzan_encoded

Dataset({
    features: ['label', 'input_features'],
    num_rows: 999
})

In [15]:
from transformers import AutoModelForAudioClassification

id2label_fn = gtzan.features["genre"].int2str
id2label_fn(gtzan[0]["genre"])
id2label = {
    str(i): id2label_fn(i)
    for i in range(len(gtzan_encoded.features["label"].names))
}

label2id = {v: k for k, v in id2label.items()}
num_labels = len(id2label)

path = "best_Whisper-Small_model_92"
model = AutoModelForAudioClassification.from_pretrained(path)
model

WhisperForAudioClassification(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 768)
    (layers): ModuleList(
      (0-11): 12 x WhisperEncoderLayer(
        (self_attn): WhisperSdpaAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwis

In [31]:
label2id

{'blues': '0',
 'classical': '1',
 'country': '2',
 'disco': '3',
 'hiphop': '4',
 'jazz': '5',
 'metal': '6',
 'pop': '7',
 'reggae': '8',
 'rock': '9'}

In [25]:
idx = 208
print(id2label_fn(gtzan[idx]["genre"]))
ipdAudio(gtzan[idx]["audio"]["array"], rate=SR)

country


In [57]:
gtzan[idx]

{'file': '/home/pvk/.cache/huggingface/datasets/downloads/extracted/b75da0d5c18fce1faaf45ae634042591b38c8a2e7b12c1a191f47a40c6d2c5e9/genres/country/country.00008.wav',
 'audio': {'path': '/home/pvk/.cache/huggingface/datasets/downloads/extracted/b75da0d5c18fce1faaf45ae634042591b38c8a2e7b12c1a191f47a40c6d2c5e9/genres/country/country.00008.wav',
  'array': array([ 0.07425082, -0.14342646, -0.12220861, ..., -0.04336805,
          0.14697215,  0.        ]),
  'sampling_rate': 16000},
 'genre': 2}

In [43]:
modified = {"array": gtzan[idx]["audio"]["array"], "sampling_rate": gtzan[idx]["audio"]["sampling_rate"]}

In [44]:
classifier = pipeline("audio-classification", model=model, feature_extractor=feature_extractor)
classifier(modified)


[{'score': 0.6397517323493958, 'label': 'rock'},
 {'score': 0.35262972116470337, 'label': 'country'},
 {'score': 0.0029203908052295446, 'label': 'pop'},
 {'score': 0.0016572315944358706, 'label': 'disco'},
 {'score': 0.0012948554940521717, 'label': 'blues'}]

In [55]:
from pathlib import Path
def predict(audio):
    if isinstance(audio, str) or isinstance(audio, Path):
        audio, sr = librosa.load(audio, sr=16_000)
    else:
        sr, audio = audio
    preds = classifier({"array": audio, "sampling_rate": sr})
    probs = {k: 0.0 for k in label2id}
    for pred in preds:
        probs[pred["label"]] = pred["score"]
    return probs

predict(gtzan[idx]["file"])
    

{'blues': 0.0012948554940521717,
 'classical': 0.0,
 'country': 0.35262972116470337,
 'disco': 0.0016572315944358706,
 'hiphop': 0.0,
 'jazz': 0.0,
 'metal': 0.0,
 'pop': 0.0029203908052295446,
 'reggae': 0.0,
 'rock': 0.6397517323493958}

In [59]:
import gradio as gr
title = "<h1>🤖 🎵 Audiobot 🎹 ⚡</h1>"
description = "<h2>♯ The superpowered music genre classifier ♭</h2>"
article = """
<p>This model is a version of <a href="https://huggingface.co/openai/whisper-medium">Whisper Medium</a>,
fine-tuned on the <a href="https://huggingface.co/datasets/marsyas/gtzan">GTZAN</a> dataset.</p>
<p>It recognizes 10 genres: blues, classical, country, hip-hop, jazz, metal, pop, reggae, and rock.</p>
<p>Upload a song or click one of the examples to try it out!</p>
"""
iface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Label(num_top_classes=3),
    title=title,
    description=description,
    article=article,
    examples=["examples/country.00008.wav", "examples/hiphop.00057.wav", "examples/gamblersblues.opus"]
)
iface.launch()

Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.


