Link to github with most of info plus links to dataset and notebooks showing processing - https://github.com/mdeff/fma?tab=readme-ov-file

Link to usage NB which is useful for understanding how to interact with the dataset - https://nbviewer.org/github/mdeff/fma/blob/outputs/usage.ipynb

In [1]:
%matplotlib inline

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import math

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, classification_report

In [2]:
track_headers = pd.read_csv('fma_metadata/tracks.csv',nrows=3, header=None)
new_track_headers = []

for col in track_headers:
    if not isinstance(track_headers[col].iloc[0],str) :
        new_track_headers.append(track_headers[col].iloc[2])
    else:
        new_track_headers.append(track_headers[col].iloc[0]+"_"+track_headers[col].iloc[1])

In [3]:
# new_track_headers


In [4]:
tracks = pd.read_csv('fma_metadata/tracks.csv',skiprows=[0,1,2], header=None)
tracks.columns = new_track_headers
tracks.head(3)

Unnamed: 0,track_id,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,...,track_information,track_interest,track_language_code,track_license,track_listens,track_lyricist,track_number,track_publisher,track_tags,track_title
0,2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
1,3,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
2,5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World


In [5]:
tracks.shape

(106574, 53)

In [6]:
tracks.isna().sum()

track_id                         0
album_comments                   0
album_date_created            3529
album_date_released          36280
album_engineer               91279
album_favorites                  0
album_id                         0
album_information            23425
album_listens                    0
album_producer               88514
album_tags                       0
album_title                   1025
album_tracks                     0
album_type                    6508
artist_active_year_begin     83863
artist_active_year_end      101199
artist_associated_labels     92303
artist_bio                   35418
artist_comments                  0
artist_date_created            856
artist_favorites                 0
artist_id                        0
artist_latitude              62030
artist_location              36364
artist_longitude             62030
artist_members               59725
artist_name                      0
artist_related_projects      93422
artist_tags         

In [7]:
tracks['track_genre_top'].value_counts(dropna=False)

track_genre_top
NaN                    56976
Rock                   14182
Experimental           10608
Electronic              9372
Hip-Hop                 3552
Folk                    2803
Pop                     2332
Instrumental            2079
International           1389
Classical               1230
Jazz                     571
Old-Time / Historic      554
Spoken                   423
Country                  194
Soul-RnB                 175
Blues                    110
Easy Listening            24
Name: count, dtype: int64

In [8]:
genre_info = pd.read_csv('fma_metadata/genres.csv')

genre_info.head(5)

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5


## TODO / Other comments
The track_genres column on the tracks dataset contains a list of integers that references the genres stored in the genres.csv file need to:

- Check if the order of the genres listed means anything or is it just numerically sorted
- If it is not sorted, check if for the records where top_genre is present that the genre id relates to the same genre in the genre.csv

For now I am just going to work with an adapted version and drop the null top_genre rows and just work with that subset and try to get something basic working.

Also not sure exactly whether track_date_created just relates to when the track was added to the DB as a lot of them have same value. Only a small number have date_recorded.

Another thing to check

- How many distinct artists in the dataset?
- Do artists have one to one mapping of genres?
- Same with albums

In [9]:
topg_tracks = tracks.dropna(subset=['track_genre_top'])

topg_tracks.isna().sum()

track_id                        0
album_comments                  0
album_date_created           1051
album_date_released         16086
album_engineer              40687
album_favorites                 0
album_id                        0
album_information           10154
album_listens                   0
album_producer              39372
album_tags                      0
album_title                   309
album_tracks                    0
album_type                   2047
artist_active_year_begin    40016
artist_active_year_end      47200
artist_associated_labels    43455
artist_bio                  16194
artist_comments                 0
artist_date_created           215
artist_favorites                0
artist_id                       0
artist_latitude             28618
artist_location             17488
artist_longitude            28618
artist_members              27653
artist_name                     0
artist_related_projects     44233
artist_tags                     0
artist_website

In [10]:
# Not sure if this is necessary, but am assigning each of the categorical top genre labels an integer instead of having them as just text.
# Using the sklearn label encoder for this

label_encoder = LabelEncoder()
topg_tracks.loc[:, 'genre_label'] = label_encoder.fit_transform(topg_tracks['track_genre_top'])

len(topg_tracks)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topg_tracks.loc[:, 'genre_label'] = label_encoder.fit_transform(topg_tracks['track_genre_top'])


49598

In [11]:
# sample = topg_tracks.sample(n=1000,random_state=42)
sample = topg_tracks

In [12]:
def gen_Train_and_Test(dataset,feature):
    X = dataset[[feature]]
    y = dataset['genre_label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Training sample length: "+str(len(X_train)))
    print("Testing sample length: "+str(len(X_test)))

    return X_train,X_test,y_train,y_test

In [13]:
X_train, X_test, y_train, y_test = gen_Train_and_Test(sample,'track_duration')

Training sample length: 39678
Testing sample length: 9920


## KNN SINGLE FEATURE DURATION

In [16]:
knn_classifier = KNeighborsClassifier(n_neighbors=10)
knn_classifier.fit(X_train, y_train);

In [17]:
y_pred = knn_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.30
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        21
           1       0.02      0.01      0.01       244
           2       0.00      0.00      0.00        47
           3       0.00      0.00      0.00         6
           4       0.25      0.37      0.30      1910
           5       0.34      0.41      0.37      2083
           6       0.05      0.01      0.02       569
           7       0.11      0.03      0.05       688
           8       0.09      0.02      0.03       421
           9       0.02      0.00      0.01       265
          10       0.00      0.00      0.00       120
          11       0.00      0.00      0.00       115
          12       0.41      0.12      0.19       464
          13       0.34      0.46      0.39      2846
          14       0.00      0.00      0.00        33
          15       0.00      0.00      0.00        88

    accuracy                           0.3

## NAIVE BAYES SINGLE FEATURE DURATION

In [18]:
naive_bayes_classifier = GaussianNB()
naive_bayes_classifier.fit(X_train, y_train);

In [19]:
y_pred = naive_bayes_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.31
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        21
           1       0.00      0.00      0.00       244
           2       0.00      0.00      0.00        47
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00      1910
           5       0.55      0.19      0.28      2083
           6       0.00      0.00      0.00       569
           7       0.00      0.00      0.00       688
           8       0.00      0.00      0.00       421
           9       0.00      0.00      0.00       265
          10       0.00      0.00      0.00       120
          11       0.00      0.00      0.00       115
          12       0.00      0.00      0.00       464
          13       0.30      0.95      0.45      2846
          14       0.00      0.00      0.00        33
          15       0.11      0.12      0.12        88

    accuracy                           0.3

## SVM SINGLE FEATURE DURATION

In [14]:
svm_model = SVC(kernel='linear',random_state=42)
svm_model.fit(X_train,y_train);

In [15]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)

# When using the sample to test this there was a problem with the class labels so fixed it here:
unique_classes = sorted(set(y_test) | set(y_pred))
report = classification_report(y_test, y_pred, labels=unique_classes, target_names=label_encoder.classes_)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(report)

Accuracy: 0.32
Classification Report:
                     precision    recall  f1-score   support

              Blues       0.00      0.00      0.00        21
          Classical       0.00      0.00      0.00       244
            Country       0.00      0.00      0.00        47
     Easy Listening       0.00      0.00      0.00         6
         Electronic       0.00      0.00      0.00      1910
       Experimental       0.55      0.21      0.30      2083
               Folk       0.00      0.00      0.00       569
            Hip-Hop       0.00      0.00      0.00       688
       Instrumental       0.00      0.00      0.00       421
      International       0.00      0.00      0.00       265
               Jazz       0.00      0.00      0.00       120
Old-Time / Historic       0.00      0.00      0.00       115
                Pop       0.00      0.00      0.00       464
               Rock       0.30      0.95      0.45      2846
           Soul-RnB       0.00      0.00      