## Before starting
Go to https://github.com/mdeff/fma?tab=readme-ov-file and download the following files:

1. `fma_metadata.zip`
    - Extract it to `./fma_metadata`
2. `fma_small.zip` (only needed if you want the audio files)

In [None]:
import os
import pandas as pd
import numpy as np

# utils.py, local file
import utils

from utilities.constants import(
    DEFAULT_FMA_METADATA_LOCATION,
    FMA_SONG_LOCATION,
)

# Location of the mp3 files. Left empty here; it is not referenced by the cells shown below.
# NOTE(review): FMA_SONG_LOCATION is imported above but unused - presumably AUDIO_DIR
# was meant to point there; confirm before relying on it.
AUDIO_DIR = ''

# Root folder that holds the extracted FMA metadata CSV files.
base_dir = DEFAULT_FMA_METADATA_LOCATION

# Load the four metadata tables in one pass; each one is a pandas DataFrame.
tracks, genres, features, echonest = (
    utils.load(f'{base_dir}/{table}.csv')
    for table in ('tracks', 'genres', 'features', 'echonest')
)

# Sanity checks: `features` must share its index with `tracks` exactly, and every
# entry of the `echonest` index must also be present in the `tracks` index.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

# Display the (rows, columns) shape of every table.
tuple(frame.shape for frame in (tracks, genres, features, echonest))

In [None]:
# Preview the first rows of the FMA `tracks` DataFrame.
tracks.head()

In [None]:
# Inspect the column labels of `tracks`.
# Goal: be able to identify which song in the FMA dataset maps to a track ID
# in the Million Song subset.
tracks.columns

In [None]:
# Run the Million Song subset exploration notebook via the IPython `%run` magic
# so that `million_song_df` is available in memory for the cells below.
%run million_song_subset_exploration.ipynb

In [None]:
# Display `million_song_df`. The track IDs in the FMA dataset need to be matched
# to the `track_id` values in this DataFrame.
million_song_df

In [None]:
# Show the first rows of `tracks` again, to compare against `million_song_df`.
tracks.head()

In [None]:
# List the column labels of `tracks` again, for reference when picking the columns to match on.
tracks.columns

In [None]:
# Inspect the index of `tracks`; rows are addressed by track ID rather than by position.
tracks.index

In [None]:
# `tracks` is indexed by track ID rather than by position. Move that index into
# a regular column and call it 'index_into_fma_track_df', so that after merging
# with million_song_df each matched row still records which entry of `tracks`
# (the df) corresponds to which 'track_id' in million_song_df.
resetted_tracks = tracks.reset_index().rename(
    columns={'track_id': 'index_into_fma_track_df'}
)
resetted_tracks.head()

In [None]:
# Check the column labels after the reset/rename, including 'index_into_fma_track_df'.
resetted_tracks.columns

In [None]:
# Normalize the text columns of both DataFrames the same way, then match FMA
# tracks to Million Song subset tracks on the normalized (title, artist) pair.

# Matches every character that is neither a word character nor whitespace.
# Written as a raw string: '\w' and '\s' are not valid Python string escapes.
PUNCTUATION_PATTERN = r'[^\w\s]'

# Columns that must BOTH agree for two rows to count as the same song.
# Matching on the title alone pairs up unrelated songs that share a title.
MATCH_KEYS = ['track_title', 'artist_name']


def normalize_text(column):
    """Return the string Series `column` lowercased, with special characters removed."""
    return column.str.lower().str.replace(PUNCTUATION_PATTERN, '', regex=True)


# Extract the FMA columns we need and flatten the two-level column labels.
fma_tracks = resetted_tracks[
    [('index_into_fma_track_df', ''), ('track', 'title'), ('artist', 'name')]
].copy()
fma_tracks.columns = ['index_into_fma_track_df', 'track_title', 'artist_name']

for key in MATCH_KEYS:
    fma_tracks[key] = normalize_text(fma_tracks[key])
    # NOTE: this overwrites the column of million_song_df in place.
    million_song_df[key] = normalize_text(million_song_df[key])

# Inner-join on title AND artist. Rows with a missing key are dropped first,
# because pandas treats NaN keys as equal to each other when merging, which
# would otherwise produce spurious matches.
matched_tracks = pd.merge(
    million_song_df.dropna(subset=MATCH_KEYS),
    fma_tracks.dropna(subset=MATCH_KEYS),
    on=MATCH_KEYS,
    how='inner',
)

# results
print(matched_tracks.head())
print("Number of matched tracks:", matched_tracks.shape[0])

matched_tracks.columns

In [None]:
# Now we have matched_tracks. Let's look at the first matched track.
# (In an earlier run this was 'drop of rain', with 'index_into_fma_track_df' == 81912
# and Million Song subset ID SOPWKOX12A8C139D43.)
# Each matched row carries the Million Song subset 'track_id' together with
# 'index_into_fma_track_df'; the latter is the label to use with `tracks.loc`
# to get the same song from the FMA dataset object.
# The index is read from matched_tracks instead of being hard-coded, so the
# lookup always refers to the row that is actually displayed above.
if matched_tracks.empty:
    raise ValueError("matched_tracks is empty: no FMA track matched the Million Song subset")

first_match = matched_tracks.iloc[0]
tracks.loc[first_match['index_into_fma_track_df']]