## Before starting
Go to https://github.com/mdeff/fma?tab=readme-ov-file, download the following files:

1. `fma_metadata.zip`
    - Extract it to `./fma_metadata`
2. `fma_small.zip` (Only if you want the audio files)

In [2]:
import os
import pandas as pd
import numpy as np

# utils.py, local file
import utils

from utilities.constants import(
    DEFAULT_FMA_METADATA_LOCATION,
    DEFAULT_FMA_SONG_LOCATION,
)


# Directory where mp3 are stored
AUDIO_DIR = ''

# Load metadata and features.
base_dir = DEFAULT_FMA_METADATA_LOCATION

# these objects are all pandas dataframes
tracks = utils.load(f'{base_dir}/tracks.csv')
genres = utils.load(f'{base_dir}/genres.csv')
features = utils.load(f'{base_dir}/features.csv')
echonest = utils.load(f'{base_dir}/echonest.csv')

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, genres.shape, features.shape, echonest.shape

((106574, 52), (163, 4), (106574, 518), (13129, 249))

In [3]:
tracks.head()

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [4]:
# let's look at the columns.
# we want to be able to identify which song in the FMA dataset maps to a track ID in the million song subset.
tracks.columns

MultiIndex([( 'album',          'comments'),
            ( 'album',      'date_created'),
            ( 'album',     'date_released'),
            ( 'album',          'engineer'),
            ( 'album',         'favorites'),
            ( 'album',                'id'),
            ( 'album',       'information'),
            ( 'album',           'listens'),
            ( 'album',          'producer'),
            ( 'album',              'tags'),
            ( 'album',             'title'),
            ( 'album',            'tracks'),
            ( 'album',              'type'),
            ('artist', 'active_year_begin'),
            ('artist',   'active_year_end'),
            ('artist', 'associated_labels'),
            ('artist',               'bio'),
            ('artist',          'comments'),
            ('artist',      'date_created'),
            ('artist',         'favorites'),
            ('artist',                'id'),
            ('artist',          'latitude'),
          

In [5]:
# run the other notebook so we have the million_song_df in memory here.
%run million_song_subset_exploration.ipynb

Loading data from ../Datasets/million_song_data.csv
found 115 songs


In [6]:
# we want to match the track IDs in the FMA dataset to the track_id in this df:
million_song_df

Unnamed: 0,track_id,track_title,artist_name,play_count
6437,SOULTKQ12AB018A183,Nothin' On You [feat. Bruno Mars] (Album Version),B.o.B,1.000000
5699,SOOXLKF12A6D4F594A,Harder To Breathe,Maroon 5,0.979837
9724,SOMKGQN12A8C1339D2,Blue Orchid,The White Stripes,0.972387
6482,SOUXEOI12A6D4FB18E,They Reminisce Over You (Single Version),Pete Rock & C.L. Smooth,0.932274
1427,SOGCDYR12AC961854A,You And Your Heart,Jack Johnson,0.918534
...,...,...,...,...
2047,SOCHPTV12A6BD53113,Die Kunst der Fuge_ BWV 1080 (2007 Digital Rem...,Lionel Rogg,0.000000
1718,SOLUABP12AF72A2CD2,Alpha Beta Parking Lot,Cake,0.000000
5120,SOTPQFM12AB017AC9E,I'm Not A Loser,Descendents,0.000000
5864,SOWZVLJ12A58A7C2C8,Jesus_ Lover Of My Soul (Bonus Stereo Trax),Passion,0.000000


In [7]:
tracks.head()

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [8]:
tracks.columns

MultiIndex([( 'album',          'comments'),
            ( 'album',      'date_created'),
            ( 'album',     'date_released'),
            ( 'album',          'engineer'),
            ( 'album',         'favorites'),
            ( 'album',                'id'),
            ( 'album',       'information'),
            ( 'album',           'listens'),
            ( 'album',          'producer'),
            ( 'album',              'tags'),
            ( 'album',             'title'),
            ( 'album',            'tracks'),
            ( 'album',              'type'),
            ('artist', 'active_year_begin'),
            ('artist',   'active_year_end'),
            ('artist', 'associated_labels'),
            ('artist',               'bio'),
            ('artist',          'comments'),
            ('artist',      'date_created'),
            ('artist',         'favorites'),
            ('artist',                'id'),
            ('artist',          'latitude'),
          

In [9]:
tracks.index

Index([     2,      3,      5,     10,     20,     26,     30,     46,     48,
          134,
       ...
       155310, 155311, 155312, 155314, 155315, 155316, 155317, 155318, 155319,
       155320],
      dtype='int64', name='track_id', length=106574)

In [10]:
# so the tracks.index has a weird indexing thing, where you index using the track id.
# we want the track id to be a column in the df.
# so that when merged, we know which index into 'tracks' (the df) maps to which 'track_id' in million_song_df
resetted_tracks = tracks.reset_index()
resetted_tracks.rename(columns={'track_id': 'index_into_fma_track_df'}, inplace=True)
resetted_tracks.head()

Unnamed: 0_level_0,index_into_fma_track_df,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,Unnamed: 1_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
0,2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
1,3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
2,5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
3,10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
4,20,0,2008-11-26 01:45:05,2009-01-06,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


In [11]:
resetted_tracks.columns

MultiIndex([('index_into_fma_track_df',                  ''),
            (                  'album',          'comments'),
            (                  'album',      'date_created'),
            (                  'album',     'date_released'),
            (                  'album',          'engineer'),
            (                  'album',         'favorites'),
            (                  'album',                'id'),
            (                  'album',       'information'),
            (                  'album',           'listens'),
            (                  'album',          'producer'),
            (                  'album',              'tags'),
            (                  'album',             'title'),
            (                  'album',            'tracks'),
            (                  'album',              'type'),
            (                 'artist', 'active_year_begin'),
            (                 'artist',   'active_year_end'),
        

In [12]:
# Now normalize both dfs: process the strings and find matches between the DFs
# extract our desired fma columns

fma_tracks = resetted_tracks[[('index_into_fma_track_df', ''), ('track', 'title'), ('artist', 'name')]].copy()
fma_tracks.columns = ['index_into_fma_track_df', 'track_title', 'artist_name']

# normalize the fma data: lowercase and remove special chars
fma_tracks['track_title'] = fma_tracks['track_title'].str.lower().str.replace('[^\w\s]', '', regex=True)
fma_tracks['artist_name'] = fma_tracks['artist_name'].str.lower().str.replace('[^\w\s]', '', regex=True)

# normalize text data for Million Song subset the same way
million_song_df['track_title'] = million_song_df['track_title'].str.lower().str.replace('[^\w\s]', '', regex=True)
million_song_df['artist_name'] = million_song_df['artist_name'].str.lower().str.replace('[^\w\s]', '', regex=True)

# merge DataFrames on 'track_title' and 'artist_name'
matched_tracks = pd.merge(million_song_df, fma_tracks, on=['track_title'], how='inner')

# results
print(matched_tracks.head())
print("Number of matched tracks:", matched_tracks.shape[0])

matched_tracks.columns

             track_id     track_title artist_name_x  play_count  \
0  SOYHHHT12A6D4F7F97  one more night  phil collins    0.823479   
1  SOLXDDC12A6701FBFD         im back        eminem    0.811708   
2  SOEHTZE12A6310F0F2      one i love      coldplay    0.810264   
3  SOWPAXV12A67ADA046         push it     saltnpepa    0.806120   
4  SOUCKDH12A8C138FF5      dont panic      coldplay    0.781932   

   index_into_fma_track_df  artist_name_y  
0                    32589    mild winter  
1                   134415    audiobinger  
2                    13976  the volebeats  
3                    93052       metalleg  
4                    71980        nisei23  
Number of matched tracks: 40


Index(['track_id', 'track_title', 'artist_name_x', 'play_count',
       'index_into_fma_track_df', 'artist_name_y'],
      dtype='object')

In [13]:
# now we have matched_tracks. Looking at the above cell's output, let's looked at the first matched track. 
# 'index_into_fma_track_df' == 81912, and the song's name is 'drop of rain'
# We have access to the track_id in the million song subset: SOPWKOX12A8C139D43
# to access this song in the fma dataset object, we can use the "index_into_fma_track_df" value like this:
tracks.loc[93986]

album   comments                                                             0
        date_created                                       2013-11-10 19:23:41
        date_released                                      2013-11-12 00:00:00
        engineer                                                        Kellzo
        favorites                                                            1
        id                                                               15436
        information                                                        NaN
        listens                                                          13473
        producer                                                 Bugs and Rats
        tags                                                                []
        title                                                           Adidas
        tracks                                                              11
        type                                        