## Before starting
Go to https://github.com/mdeff/fma?tab=readme-ov-file, download the following files:

1. `fma_metadata.zip`
    - Extract it to `./fma_metadata`
2. `fma_small.zip` (Only if you want the audio files)

In [1]:
import os
import pandas as pd
import numpy as np

# utils.py, local file
import utils

# Directory where mp3 are stored
AUDIO_DIR = ''

# Load metadata and features.
base_dir = 'fma_metadata'

# these objects are all pandas dataframes
tracks = utils.load(f'{base_dir}/tracks.csv')
genres = utils.load(f'{base_dir}/genres.csv')
features = utils.load(f'{base_dir}/features.csv')
echonest = utils.load(f'{base_dir}/echonest.csv')

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, genres.shape, features.shape, echonest.shape

((106574, 52), (163, 4), (106574, 518), (13129, 249))

In [2]:
tracks.head()

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [3]:
# let's look at the columns.
# we want to be able to identify which song in the FMA dataset maps to a track ID in the million song subset.
tracks.columns

MultiIndex([( 'album',          'comments'),
            ( 'album',      'date_created'),
            ( 'album',     'date_released'),
            ( 'album',          'engineer'),
            ( 'album',         'favorites'),
            ( 'album',                'id'),
            ( 'album',       'information'),
            ( 'album',           'listens'),
            ( 'album',          'producer'),
            ( 'album',              'tags'),
            ( 'album',             'title'),
            ( 'album',            'tracks'),
            ( 'album',              'type'),
            ('artist', 'active_year_begin'),
            ('artist',   'active_year_end'),
            ('artist', 'associated_labels'),
            ('artist',               'bio'),
            ('artist',          'comments'),
            ('artist',      'date_created'),
            ('artist',         'favorites'),
            ('artist',                'id'),
            ('artist',          'latitude'),
          

In [4]:
# run the other notebook so we have the million_song_df in memory here.
%run million_song_subset_exploration.ipynb

Loading data from million_song_data.csv


In [5]:
# we want to match the track IDs in the FMA dataset to the track_id in this df:
million_song_df

Unnamed: 0,track_id,track_title,artist_name,play_count
0,TRAAAAW128F429D538,I Didn't Mean To,Casual,0.602120
1,TRAAABD128F429CF47,Soul Deep,The Box Tops,0.000000
2,TRAAADZ128F9348C2E,Amor De Cabaret,Sonora Santanera,0.000000
3,TRAAAEF128F4273421,Something Girls,Adam Ant,0.000000
4,TRAAAFD128F92F423A,Face the Ashes,Gob,0.604501
...,...,...,...,...
9995,TRBIJMU12903CF892B,The Hanged Man,Moonspell,0.594080
9996,TRBIJNF128F14815A7,The Wonderful World Of The Young,Danny Williams,0.334707
9997,TRBIJNK128F93093EC,Sentimental Man,Winston Reedy,0.000000
9998,TRBIJRN128F425F3DD,Zydeco In D-Minor,"Myrick ""Freeze"" Guillory",0.000000


In [7]:
tracks.head()

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [8]:
tracks.columns

MultiIndex([( 'album',          'comments'),
            ( 'album',      'date_created'),
            ( 'album',     'date_released'),
            ( 'album',          'engineer'),
            ( 'album',         'favorites'),
            ( 'album',                'id'),
            ( 'album',       'information'),
            ( 'album',           'listens'),
            ( 'album',          'producer'),
            ( 'album',              'tags'),
            ( 'album',             'title'),
            ( 'album',            'tracks'),
            ( 'album',              'type'),
            ('artist', 'active_year_begin'),
            ('artist',   'active_year_end'),
            ('artist', 'associated_labels'),
            ('artist',               'bio'),
            ('artist',          'comments'),
            ('artist',      'date_created'),
            ('artist',         'favorites'),
            ('artist',                'id'),
            ('artist',          'latitude'),
          

In [14]:
tracks.index

Index([     2,      3,      5,     10,     20,     26,     30,     46,     48,
          134,
       ...
       155310, 155311, 155312, 155314, 155315, 155316, 155317, 155318, 155319,
       155320],
      dtype='int64', name='track_id', length=106574)

In [17]:
# so the tracks.index has a weird indexing thing, where you index using the track id.
# we want the track id to be a column in the df.
# so that when merged, we know which index into 'tracks' (the df) maps to which 'track_id' in million_song_df
resetted_tracks = tracks.reset_index()
resetted_tracks.rename(columns={'track_id': 'index_into_fma_track_df'}, inplace=True)
resetted_tracks.head()

Unnamed: 0_level_0,index_into_fma_track_df,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,Unnamed: 1_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
0,2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
1,3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
2,5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
3,10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
4,20,0,2008-11-26 01:45:05,2009-01-06,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [18]:
resetted_tracks.columns

MultiIndex([('index_into_fma_track_df',                  ''),
            (                  'album',          'comments'),
            (                  'album',      'date_created'),
            (                  'album',     'date_released'),
            (                  'album',          'engineer'),
            (                  'album',         'favorites'),
            (                  'album',                'id'),
            (                  'album',       'information'),
            (                  'album',           'listens'),
            (                  'album',          'producer'),
            (                  'album',              'tags'),
            (                  'album',             'title'),
            (                  'album',            'tracks'),
            (                  'album',              'type'),
            (                 'artist', 'active_year_begin'),
            (                 'artist',   'active_year_end'),
        

In [21]:
# Now normalize both dfs: process the strings and find matches between the DFs
# extract our desired fma columns

fma_tracks = resetted_tracks[[('index_into_fma_track_df', ''), ('track', 'title'), ('artist', 'name')]].copy()
fma_tracks.columns = ['index_into_fma_track_df', 'track_title', 'artist_name']

# normalize the fma data: lowercase and remove special chars
fma_tracks['track_title'] = fma_tracks['track_title'].str.lower().str.replace('[^\w\s]', '', regex=True)
fma_tracks['artist_name'] = fma_tracks['artist_name'].str.lower().str.replace('[^\w\s]', '', regex=True)

# normalize text data for Million Song subset the same way
million_song_df['track_title'] = million_song_df['track_title'].str.lower().str.replace('[^\w\s]', '', regex=True)
million_song_df['artist_name'] = million_song_df['artist_name'].str.lower().str.replace('[^\w\s]', '', regex=True)

# merge DataFrames on 'track_title' and 'artist_name'
matched_tracks = pd.merge(million_song_df, fma_tracks, on=['track_title'], how='inner')

# results
print(matched_tracks.head())
print("Number of matched tracks:", matched_tracks.shape[0])

matched_tracks.columns

             track_id   track_title          artist_name_x  play_count  \
0  TRAAAPK128E0786D96  drop of rain  tweeterfriendly music    0.000000   
1  TRAABLR128F423B7E3      floating             blue rodeo    0.405116   
2  TRAABLR128F423B7E3      floating             blue rodeo    0.405116   
3  TRAABLR128F423B7E3      floating             blue rodeo    0.405116   
4  TRAABLR128F423B7E3      floating             blue rodeo    0.405116   

   index_into_fma_track_df              artist_name_y  
0                    70568  vesme dreamchatcher kiong  
1                    12168                      brunk  
2                    41554                  john ming  
3                    81737             podington bear  
4                   103945        section 27 netlabel  
Number of matched tracks: 6224


Index(['track_id', 'track_title', 'artist_name_x', 'play_count',
       'index_into_fma_track_df', 'artist_name_y'],
      dtype='object')

In [25]:
# now we have matched_tracks. Looking at the above cell's output, let's looked at the first matched track. 
# 'index_into_fma_track_df' == 70568, and the song's name is 'drop of rain'
# We have access to the track_id in the million song subset: TRAAAPK128E0786D96
# to access this song in the fma dataset object, we can use the "index_into_fma_track_df" value like this:
tracks.loc[70568]

album   comments                                                             0
        date_created                                       2012-10-01 10:47:23
        date_released                                                      NaT
        engineer                                                           NaN
        favorites                                                            1
        id                                                               12540
        information                                                        NaN
        listens                                                           6633
        producer                                                           NaN
        tags                                                                []
        title                                                       Sinful son
        tracks                                                               8
        type                                        