# The Erdős Institute Fall Boot Camp - Team Audiobots

We're using data from [this dataset](https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification) to try and classify one thousand 30s samples of audio into one of 10 genres:

    * "blues",
    * "classical",
    * "country",
    * "disco",
    * "hiphop",
    * "jazz",
    * "metal",
    * "pop",
    * "reggae",
    * "rock"

We are assuming the genres are accurate. We're using this instead of the other dataset, as it seems to be more accurately classified and avoids any problematic "International" genre.

For training, we will feed 90\% of the data into a pre-trained Transformer network from Hugging Face, and fine-tune the network to classify each sample as one of the 10 genres above. If the architecture requires inputs of constant size, we can either pad shorter samples with 0's, or randomly clip shorter sections from longer audio streams.

This code does the following:

*   Loads Beyonce's Cowboy Carter album from disk.
*   Picks one of the three best models trained on GTZAN (Whisper Small, DistilHuBERT, or Fleur)
*   Predicts genres (and provides logits + probabilities) for further analysis.

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import librosa
import librosa.display
import evaluate
import torch
import os

from datasets import load_dataset, Audio

from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, Trainer



Download the feature extractor for the chosen model from the Hugging Face Hub

In [2]:
# Hub checkpoint whose feature extractor is used; un-comment one of the alternatives to switch.
# NOTE(review): presumably this should correspond to the fine-tuned checkpoint chosen in `path` — confirm.
model_id = "openai/whisper-small"
#model_id = "ntu-spml/distilhubert"
#model_id = "sanchit-gandhi/whisper-medium-fleurs-lang-id"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id, do_normalize=True)

In [3]:
# This is the sampling rate that the model expects, so the audio has to be re-sampled to this rate.
sampling_rate = feature_extractor.sampling_rate
sampling_rate


16000

In [4]:
# Folder holding the album's audio files. Defaults to the original author's local copy;
# set the COWBOY_CARTER_DIR environment variable to point at a different location.
DATA_DIR = os.environ.get(
    "COWBOY_CARTER_DIR",
    '/home/dwgb93/Music/Beyoncé - COWBOY CARTER (2024)',
)

# The "audiofolder" loader builds the dataset from the files found under DATA_DIR.
bey = load_dataset("audiofolder", data_dir=DATA_DIR)

Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

In [5]:
# Re-sample the audio to the rate the feature extractor expects.
# Otherwise, it will ASSUME the audio is 16kHz, and only use the first ~11s of slowed down audio.
bey = bey.cast_column("audio", Audio(sampling_rate=sampling_rate))


In [6]:
max_duration = 30.0  # seconds of audio kept per track; get_middle and preprocess_function both derive their sample limit from it


def preprocess_function(examples):
    """Turn a batch of decoded audio into feature-extractor outputs.

    Args:
        examples: Batch mapping whose ``"audio"`` entry is a sequence of dicts,
            each carrying the waveform under ``"array"``.

    Returns:
        Whatever ``feature_extractor`` returns for the batch: it is called with
        truncation to ``max_duration`` seconds worth of samples, padding enabled,
        and an attention mask requested.
    """
    target_rate = feature_extractor.sampling_rate

    waveforms = []
    for clip in examples["audio"]:
        waveforms.append(clip["array"])

    return feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        # Upper bound on the number of samples kept per clip.
        max_length=int(target_rate * max_duration),
        truncation=True,
        padding=True,
        return_attention_mask=True,
    )

In [7]:
# Class index -> genre name for the ten GTZAN genres.
# NOTE(review): presumably this order matches the label ids the fine-tuned models were trained with — confirm.
id2label = dict(enumerate(
    ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
))

id2label.items()

dict_items([(0, 'blues'), (1, 'classical'), (2, 'country'), (3, 'disco'), (4, 'hiphop'), (5, 'jazz'), (6, 'metal'), (7, 'pop'), (8, 'reggae'), (9, 'rock')])

In [8]:
# Reverse lookup: genre name -> class index.
label2id = {name: idx for idx, name in id2label.items()}
# NOTE(review): these are (index, name) pairs, not bare names, so DataFrames built with
# `columns=genres` end up with tuple column labels; use id2label.values() if plain names are wanted.
genres = list(id2label.items())

In [9]:
def get_middle(batch, max_length=None):
    """Trim every clip in ``batch["audio"]`` to its centred ``max_length`` samples.

    Clips that are already ``max_length`` samples or shorter are left untouched.
    For longer clips, ``song["array"]`` is replaced in place by a window of exactly
    ``max_length`` samples taken from the middle of the recording.

    Args:
        batch: Mapping with an ``"audio"`` entry holding a sequence of dicts, each
            with the samples under ``"array"`` (what the ``bey.map(get_middle,
            batched=True)`` call passes in).
        max_length: Window size in samples. When omitted it is computed as
            ``int(feature_extractor.sampling_rate * max_duration)``.

    Returns:
        The same ``batch`` object, with over-long clips trimmed.
    """
    if max_length is None:
        # The original assignment ended with a stray trailing comma, which silently
        # turned this value into a 1-tuple and forced `max_length[0]` everywhere.
        max_length = int(feature_extractor.sampling_rate * max_duration)

    for song in batch["audio"]:
        song_length = len(song["array"])
        if song_length > max_length:
            # Centre the window. Using start + max_length (rather than two half-widths)
            # keeps the full window length even when max_length is odd.
            start = (song_length - max_length) // 2
            song["array"] = song["array"][start:start + max_length]

    return batch

In [10]:
# Trim every track to its middle section before feature extraction.
bey = bey.map(get_middle, batched=True, num_proc=1)
bey

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 27
    })
})

In [11]:
# Run the feature extractor over every track; the raw "audio" column is dropped
# so only the extractor's outputs remain.
bey_encoded = bey.map(preprocess_function, batched=True, num_proc=1, remove_columns=["audio"])

bey_encoded

Map:   0%|          | 0/27 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_features', 'attention_mask'],
        num_rows: 27
    })
})

In [12]:
# Directory of the fine-tuned checkpoint to evaluate. Defaults to the original author's
# local Whisper-Small checkpoint; set the AUDIOBOTS_MODEL_DIR environment variable to
# load a checkpoint stored elsewhere.
# NOTE(review): presumably this should correspond to the `model_id` used for the feature extractor — confirm.
path = os.environ.get(
    "AUDIOBOTS_MODEL_DIR",
    '/home/dwgb93/Code/Audiobots/best_Whisper-Small_model_92',
)
#path = '/home/dwgb93/Code/Audiobots/best_distillHubert_model_87'
#path = '/home/dwgb93/Code/Audiobots/best_Fleur_model_86'
model = AutoModelForAudioClassification.from_pretrained(path)

In [13]:
# A Trainer is used here only as a convenient inference wrapper; no training is run.
trainer = Trainer(model=model, tokenizer=feature_extractor)

# `preds.predictions` holds one row of logits per track.
preds = trainer.predict(test_dataset=bey_encoded['train'])

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
# Print a 1-based track counter next to the genre with the highest logit.
for track_number, track_logits in enumerate(preds.predictions, start=1):
    best_id = np.argmax(track_logits)
    print(track_number, id2label[best_id])

1 rock
2 rock
3 pop
4 pop
5 disco
6 rock
7 disco
8 pop
9 blues
10 pop
11 rock
12 hiphop
13 pop
14 country
15 pop
16 pop
17 pop
18 pop
19 rock
20 disco
21 pop
22 pop
23 pop
24 pop
25 pop
26 pop
27 country


In [17]:
# Raw logits, one row per track and one column per entry of `genres`; also saved to disk.
table = pd.DataFrame(data=preds.predictions, columns=genres)
table.to_csv("bey_logits_middle3.csv")
table

Unnamed: 0,"(0, blues)","(1, classical)","(2, country)","(3, disco)","(4, hiphop)","(5, jazz)","(6, metal)","(7, pop)","(8, reggae)","(9, rock)"
0,-2.491505,-1.726104,-1.184168,-0.381552,8.688173,-0.933624,-1.138128,0.990151,2.183978,-3.211143
1,0.019525,-2.956783,5.141325,-2.121413,-5.992849,-3.68032,-1.327183,-0.586669,-1.748253,8.392411
2,-2.451221,-2.226452,1.092243,1.189929,-1.107875,-1.427563,-3.935056,9.171456,-3.66228,0.18258
3,-1.127444,-1.905612,0.048231,-1.464624,-0.880928,3.229001,-3.580336,8.618262,-4.251264,-1.05615
4,0.646138,2.929201,-2.085522,-1.620157,-0.712087,8.078201,-2.664864,3.226367,-3.767455,-3.174635
5,4.766645,-1.266966,1.040232,-2.511318,-0.545998,-2.029893,-2.403663,-3.568056,8.02409,-1.056485
6,-3.651223,-2.503304,-2.005607,6.774098,2.975549,-1.311888,-4.228528,4.896916,-0.153844,-1.769252
7,-3.058188,-1.906582,-2.939828,1.066307,8.238279,-0.294311,-1.581971,4.532594,-0.720563,-3.029478
8,-0.715949,-2.370223,-3.106105,-0.628324,7.98325,-0.298246,-0.630985,-1.705088,5.976594,-2.842721
9,-3.378165,-1.451036,5.312044,4.871397,-3.70734,-3.182249,-4.662673,2.12961,-1.192565,1.627254


In [32]:
# Softmax over the genre axis: turns each row of logits into probabilities that sum to 1.
# Subtracting the row-wise maximum first leaves the result mathematically unchanged but
# keeps np.exp from overflowing on large logits.
logits = np.asarray(preds.predictions)
shifted = logits - logits.max(axis=1, keepdims=True)
exp_shifted = np.exp(shifted)
probs = pd.DataFrame(exp_shifted / exp_shifted.sum(axis=1, keepdims=True), columns=genres)
probs.to_csv("bey_probs_middle3.csv")
probs

Unnamed: 0,"(0, blues)","(1, classical)","(2, country)","(3, disco)","(4, hiphop)","(5, jazz)","(6, metal)","(7, pop)","(8, reggae)","(9, rock)"
0,4.9e-05,0.00035,0.000145,0.998304,9.7e-05,4.2e-05,6.3e-05,0.000685,0.00017,9.6e-05
1,0.013309,0.000206,0.014427,0.001849,0.000471,0.000153,0.000539,0.002733,0.000394,0.965919
2,0.00014,0.000107,9.9e-05,0.000143,5.6e-05,5e-05,6.3e-05,0.999211,6.4e-05,6.6e-05
3,0.000989,0.000512,0.003006,0.011481,0.000448,0.000176,0.000341,0.981192,0.000246,0.001609
4,7e-05,0.000155,7.9e-05,0.998995,7.8e-05,2.3e-05,5.2e-05,0.00035,9.9e-05,9.9e-05
5,0.763546,0.007982,0.007118,0.04591,0.001271,0.004023,0.000749,0.03979,0.047511,0.082099
6,0.000558,0.000158,0.000122,0.990885,0.005035,2e-05,0.000249,0.001346,0.000987,0.00064
7,0.000117,9.4e-05,8.2e-05,0.000308,0.000195,4.4e-05,0.000102,0.9989,9.6e-05,6.2e-05
8,0.889759,0.001095,0.08853,0.003076,0.006642,0.001659,0.002301,0.002737,0.001313,0.002889
9,0.000851,0.000151,0.0003,0.001866,0.000336,5.7e-05,9.6e-05,0.995925,0.00017,0.000247
