In [168]:
import ast
import os
import time

import eyed3
import pandas as pd

from config import *

# suppress eyed3 'non-standard genre' warnings
eyed3.log.setLevel("ERROR")

# dataset location; os.path.join inserts the separator of the host OS instead
# of a hard-coded Windows backslash
# (z_path is not defined in this notebook -- presumably supplied by config)
root_dir = os.path.join(z_path, "fma_medium")


#### notebook functions

In [169]:
def make_file_list(root_dir, extensions=None):
    '''Recursively collect the files found under ``root_dir``.

    Parameters
    ----------
    root_dir : str
        Directory to walk; every nested sub-directory is visited.
    extensions : str or iterable of str, optional
        When given, only files whose name ends with one of these suffixes
        (compared case-insensitively, e.g. ``".mp3"``) are kept.  The default
        ``None`` keeps every file.

    Returns
    -------
    (path_list, file_list) : tuple of two lists
        Parallel lists: ``path_list[i]`` is the joined path of the file whose
        bare name is ``file_list[i]``.  Entries appear in the order that
        ``os.walk`` yields them.
    '''
    if extensions is None:
        suffixes = None
    elif isinstance(extensions, str):
        suffixes = (extensions.lower(),)
    else:
        suffixes = tuple(ext.lower() for ext in extensions)

    path_list = []
    file_list = []
    # the sub-directory names yielded by os.walk are not needed here
    for subdir, _dirs, files in os.walk(root_dir):
        for file in files:
            if suffixes is not None and not file.lower().endswith(suffixes):
                continue
            path_list.append(os.path.join(subdir, file))
            file_list.append(file)

    return path_list, file_list


def get_genre_id(genre_dict_list):
    '''Return the ``'genre_id'`` value of the first entry in ``genre_dict_list``.'''
    first_genre = genre_dict_list[0]
    return first_genre['genre_id']

def get_genre_name(genre_dict_list):
    '''Return the ``'genre_title'`` value of the first entry in ``genre_dict_list``.'''
    first_genre = genre_dict_list[0]
    return first_genre['genre_title']

def id_to_int(val):
    '''Convert ``val`` with the built-in ``int`` and return the result.'''
    converted = int(val)
    return converted
        

### exploring FMA audio samples

In [170]:
# collect the dataset's files: song_paths holds the joined paths and
# file_names the matching bare file names (parallel lists)
song_paths, file_names = make_file_list(root_dir)
# print(song_paths[0])  # uncomment to spot-check the first path

In [172]:
# import raw track data

# folder holding the metadata CSVs; os.path.join inserts the separator of the
# host OS instead of a hard-coded Windows backslash
metadata = os.path.join(z_path, "fma_metadata")
# NOTE: this changes the working directory for the rest of the session; the
# relative file names below are resolved against it
os.chdir(metadata)

genres_df = pd.read_csv('genres.csv')
tracks_df = pd.read_csv('raw_tracks.csv')


In [173]:
# print(tracks_df.columns)


Index(['track_id', 'album_id', 'album_title', 'album_url', 'artist_id',
       'artist_name', 'artist_url', 'artist_website', 'license_image_file',
       'license_image_file_large', 'license_parent_id', 'license_title',
       'license_url', 'tags', 'track_bit_rate', 'track_comments',
       'track_composer', 'track_copyright_c', 'track_copyright_p',
       'track_date_created', 'track_date_recorded', 'track_disc_number',
       'track_duration', 'track_explicit', 'track_explicit_notes',
       'track_favorites', 'track_file', 'track_genres', 'track_image_file',
       'track_information', 'track_instrumental', 'track_interest',
       'track_language_code', 'track_listens', 'track_lyricist',
       'track_number', 'track_publisher', 'track_title', 'track_url'],
      dtype='object')


In [174]:
# limit to just the relevant details and drop rows missing the 'genre' value.
# Row filter and column selection happen in one .loc call, and the explicit
# .copy() makes tracks_df an independent frame, so assigning new columns to it
# afterwards does not raise SettingWithCopyWarning.
tracks_df = tracks_df.loc[
    tracks_df['track_genres'].notna(),
    ["track_id", "track_bit_rate", "track_genres"],
].copy()

tracks_df.head()

Unnamed: 0,track_id,track_bit_rate,track_genres
0,2,256000.0,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ..."
1,3,256000.0,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ..."
2,5,256000.0,"[{'genre_id': '21', 'genre_title': 'Hip-Hop', ..."
3,10,192000.0,"[{'genre_id': '10', 'genre_title': 'Pop', 'gen..."
4,20,256000.0,"[{'genre_id': '76', 'genre_title': 'Experiment..."


In [175]:
# string conversion to list of genre dictionaries.
# ast.literal_eval only accepts Python literals, so a malformed or malicious
# CSV cell cannot execute code the way eval() could.  Values that are not
# strings are passed through untouched, so running this cell a second time
# leaves already-parsed lists alone instead of raising.
tracks_df['track_genres'] = tracks_df['track_genres'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [176]:
# take the id of the first listed genre for every track, then cast it to int
tracks_df['genre_ids'] = (
    tracks_df['track_genres']
    .apply(get_genre_id)
    .apply(id_to_int)
)

In [177]:
# make sure track ids are ints: they become the keys of the track -> genre
# lookup, which is queried with ints parsed from the mp3 file names
tracks_df['track_id'] = tracks_df['track_id'].map(id_to_int)

In [178]:
# preview the first rows of the genre table
genres_df.head()

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5


### create lookups for genre and track IDs

In [179]:
# Each lookup selects its single column *before* calling to_dict(), so only
# that column is converted rather than the whole frame.

# genre ID -> genre title
genre_lookup = genres_df.set_index('genre_id')['title'].to_dict()

# genre ID -> top-level genre ID, if we ever want everything under only a few genres
parent_lookup = genres_df.set_index('genre_id')['top_level'].to_dict()

# track ID -> genre ID
track_genre_lookup = tracks_df.set_index('track_id')['genre_ids'].to_dict()

### edit mp3 meta-data to have correct genre assignments

In [180]:
# edit mp3 meta-data to have correct genre assignments
#
# Files that cannot be tagged are recorded in `skipped` as (path, reason)
# instead of aborting the whole run part-way through.

start_time = time.time()
skipped = []
for song_path, file_name in zip(song_paths, file_names):
    # the file name without its extension is the track ID ("000002.mp3" -> 2)
    stem, _ext = os.path.splitext(file_name)
    try:
        track_id = int(stem)
    except ValueError:
        skipped.append((song_path, "file name is not a track id"))
        continue

    # cross-reference with CSV id values; tracks whose genre was missing were
    # dropped from tracks_df, so an id may legitimately be absent here
    genre_id = track_genre_lookup.get(track_id)
    if genre_id is None or genre_id not in genre_lookup:
        skipped.append((song_path, "no genre found for track id"))
        continue

    # load audio file; eyed3.load returns None for files it does not recognise
    audiofile = eyed3.load(song_path)
    if audiofile is None:
        skipped.append((song_path, "not a file eyed3 can load"))
        continue

    # a file without any existing tag has tag == None; create an empty one
    if audiofile.tag is None:
        audiofile.initTag()

    # assign the genre title to the audio file and write it back to disk
    audiofile.tag.genre = genre_lookup[genre_id]
    audiofile.tag.save()
end_time = time.time()

print(end_time - start_time)
if skipped:
    print("skipped {} file(s); inspect `skipped` for details".format(len(skipped)))

e: Composed Music
Non standard genre name: Composed Music
Non standard genre name: Composed Music
Non standard genre name: Electroacoustic
Non standard genre name: Electroacoustic
Non standard genre name: Psych-Rock
Non standard genre name: Psych-Rock
Non standard genre name: Cumbia
Non standard genre name: Cumbia
Non standard genre name: Glitch
Non standard genre name: Glitch
Non standard genre name: Glitch
Non standard genre name: Glitch
Non standard genre name: Glitch
Non standard genre name: Glitch
Non standard genre name: Avant-Garde
Non standard genre name: Avant-Garde
Non standard genre name: Indie-Rock
Non standard genre name: Indie-Rock
Non standard genre name: Indie-Rock
Non standard genre name: Indie-Rock
Non standard genre name: Indie-Rock
Non standard genre name: Indie-Rock
Non standard genre name: Indie-Rock
Non standard genre name: Indie-Rock
Non standard genre name: Indie-Rock
Non standard genre name: Indie-Rock
Non standard genre name: Noise-Rock
Non standard genre nam